From fcb905f519464e657f367218e13fadbc9082ea5a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 7 Feb 2023 11:08:55 +0200 Subject: [PATCH 001/426] Use LayerMap::replace in eviction (#3544) Follow-up to #3536, to actually use the new `Debug` in replacing the layers, and use replacement with manual eviction endpoint. Turns out the two paths share a lot of handling of `Replacement` but didn't unify the two (need 3). There are also upcoming refactorings from other PRs to this. --- pageserver/src/tenant/timeline.rs | 52 ++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6ae23c584b..ff7b8c932c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -867,15 +867,26 @@ impl Timeline { Ok(Some(true)) } + /// Evicts one layer as in replaces a downloaded layer with a remote layer + /// + /// Returns: + /// - `Ok(Some(true))` when the layer was replaced + /// - `Ok(Some(false))` when the layer was found, but no changes were made + /// - evictee was not yet downloaded + /// - layermap replacement failed + /// - `Ok(None)` when the layer is not found pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + use super::layer_map::Replacement; + let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) }; if local_layer.is_remote_layer() { return Ok(Some(false)); } - let Some(remote_client) = &self.remote_client else { return Ok(Some(false)) }; // ensure the current layer is uploaded for sure - remote_client + self.remote_client + .as_ref() + .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))? 
.wait_completion() .await .context("wait for layer upload ops to complete")?; @@ -909,13 +920,43 @@ impl Timeline { let gc_lock = self.layer_removal_cs.lock().await; let mut layers = self.layers.write().unwrap(); let mut updates = layers.batch_update(); - self.delete_historic_layer(&gc_lock, local_layer, &mut updates)?; - updates.insert_historic(new_remote_layer); + + let replaced = match updates.replace_historic(&local_layer, new_remote_layer)? { + Replacement::Replaced { .. } => { + let layer_size = local_layer.file_size(); + + if let Err(e) = local_layer.delete() { + error!("failed to remove layer file on evict after replacement: {e:#?}"); + } + + if let Some(layer_size) = layer_size { + self.metrics.resident_physical_size_gauge.sub(layer_size); + } + + true + } + Replacement::NotFound => { + debug!(evicted=?local_layer, "layer was no longer in layer map"); + false + } + Replacement::RemovalBuffered => { + unreachable!("not doing anything else in this batch") + } + Replacement::Unexpected(other) => { + error!( + local_layer.ptr=?Arc::as_ptr(&local_layer), + other.ptr=?Arc::as_ptr(&other), + ?other, + "failed to replace"); + false + } + }; + updates.flush(); drop(layers); drop(gc_lock); - Ok(Some(true)) + Ok(Some(replaced)) } } @@ -3422,6 +3463,7 @@ impl Timeline { error!( expected.ptr = ?Arc::as_ptr(&l), other.ptr = ?Arc::as_ptr(&other), + ?other, "replacing downloaded layer into layermap failed because another layer was found instead of expected" ); } From 1254dc7ee2705d67d228c7e924f4379cc0a27878 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 7 Feb 2023 15:21:15 +0100 Subject: [PATCH 002/426] Fix production deploy: run as root to access docker (#3555) --- .github/workflows/deploy-prod.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index b6800a8f7a..f17e9e3e5e 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -41,6 +41,7 @@ jobs: 
deploy-prod-new: runs-on: prod container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + options: --user root --privileged if: inputs.deployStorage && inputs.disclamerAcknowledged defaults: run: @@ -169,6 +170,7 @@ jobs: deploy: runs-on: prod container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + options: --user root --privileged if: inputs.deployStorage && inputs.disclamerAcknowledged defaults: run: From c5c14368e3bd11a30c2b29a705865c05611ada2e Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 7 Feb 2023 15:27:31 +0100 Subject: [PATCH 003/426] Fix deploy-prod.yml syntax (#3556) --- .github/workflows/deploy-prod.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index f17e9e3e5e..f4ce7e9afa 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -40,8 +40,9 @@ concurrency: jobs: deploy-prod-new: runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - options: --user root --privileged + container: + image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + options: --user root --privileged if: inputs.deployStorage && inputs.disclamerAcknowledged defaults: run: @@ -169,8 +170,9 @@ jobs: deploy: runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - options: --user root --privileged + container: + image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + options: --user root --privileged if: inputs.deployStorage && inputs.disclamerAcknowledged defaults: run: From a6dffb6ef91bcf8995ef44261796463bf13723f4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 8 Feb 2023 14:25:25 +0200 Subject: [PATCH 004/426] fix: stop using Arc::ptr_eq with dyn Trait (#3558) This changes the way we compare `Arc` in Timeline's `LayerMap` not to use `Arc::ptr_eq` which has been witnessed in development of 
#3557 to yield wrong results. It gives wrong results because it compares fat pointers, which are `(object, vtable)` tuples for `dyn Trait` and there are no guarantees that the `vtable`s are unique. As in there were multiple vtables for `RemoteLayer` which is why the comparison failed in #3557. This is a known issue in rust, clippy warns against it and rust std might be moving to the solution which has been reproduced on this PR: compare only object pointers by "casting out" the vtable pointer. --- pageserver/src/tenant/layer_map.rs | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 59a358a355..9d8c825220 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -733,14 +733,25 @@ where #[inline(always)] fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { - // FIXME: ptr_eq might fail to return true for 'dyn' references because of multiple vtables - // can be created in compilation. Clippy complains about this. In practice it seems to - // work. + // "dyn Trait" objects are "fat pointers" in that they have two components: + // - pointer to the object + // - pointer to the vtable // - // In future rust versions this might become Arc::as_ptr(left) as *const () == - // Arc::as_ptr(right) as *const (), we could change to that before. - #[allow(clippy::vtable_address_comparisons)] - Arc::ptr_eq(left, right) + // rust does not provide a guarantee that these vtables are unique, but however + // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the + // pointer and the vtable need to be equal. + // + // See: https://github.com/rust-lang/rust/issues/103763 + // + // A future version of rust will most likely use this form below, where we cast each + // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it + // not affect the comparison. 
+ // + // See: https://github.com/rust-lang/rust/pull/106450 + let left = Arc::as_ptr(left) as *const (); + let right = Arc::as_ptr(right) as *const (); + + left == right } } From 7ed93fff06ea19b2ba8496ff4721d3646559fd59 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 7 Feb 2023 11:41:21 +0100 Subject: [PATCH 005/426] refactor: allow for eviction of layers in a batch The auto-eviction PR (#3552) operates in two phases: 1. find candidate layers 2. evict them. For (2), a batch API like the one added in this commit is useful. Note that this PR requires #3558 to be merged first. Otherwise, the tests won't pass. --- pageserver/src/tenant/timeline.rs | 116 ++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ff7b8c932c..838df6d884 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -867,30 +867,98 @@ impl Timeline { Ok(Some(true)) } - /// Evicts one layer as in replaces a downloaded layer with a remote layer - /// - /// Returns: - /// - `Ok(Some(true))` when the layer was replaced - /// - `Ok(Some(false))` when the layer was found, but no changes were made - /// - evictee was not yet downloaded - /// - layermap replacement failed - /// - `Ok(None)` when the layer is not found + /// Like [`evict_layer_batch`], but for just one layer. + /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { - use super::layer_map::Replacement; - let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) }; - if local_layer.is_remote_layer() { - return Ok(Some(false)); - } - - // ensure the current layer is uploaded for sure - self.remote_client + let remote_client = self + .remote_client .as_ref() - .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))? 
+ .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?; + + let cancel = CancellationToken::new(); + let results = self + .evict_layer_batch(remote_client, &[local_layer], cancel) + .await?; + assert_eq!(results.len(), 1); + let result: Option> = results.into_iter().next().unwrap(); + match result { + None => anyhow::bail!("task_mgr shutdown requested"), + Some(Ok(b)) => Ok(Some(b)), + Some(Err(e)) => Err(e), + } + } + + /// Try to evict the given `layers_to_evict` by + /// 1. Replacing the given layer object in the layer map with a corresponding [`RemoteLayer`] object. + /// 2. Deleting the now unreferenced layer file from disk. + /// + /// The `remote_client` should be this timeline's `self.remote_client`. + /// We make the caller provide it so that they are responsible for handling the case + /// where someone wants to evict the layer but no remote storage is configured. + /// + /// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`. + /// If `Err()` is returned, no eviction was attempted. + /// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`. + /// Meaning of each `result[i]`: + /// - `Some(Err(...))` if layer replacement failed for an unexpected reason + /// - `Some(Ok(true))` if everything went well. + /// - `Some(Ok(false))` if there was an expected reason why the layer could not be replaced, e.g.: + /// - evictee was not yet downloaded + /// - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks) + /// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`. 
+ async fn evict_layer_batch( + &self, + remote_client: &Arc, + layers_to_evict: &[Arc], + cancel: CancellationToken, + ) -> anyhow::Result>>> { + // ensure that the layers have finished uploading + // (don't hold the layer_removal_cs while we do it, we're not removing anything yet) + remote_client .wait_completion() .await .context("wait for layer upload ops to complete")?; + // now lock out layer removal (compaction, gc, timeline deletion) + let layer_removal_guard = self.layer_removal_cs.lock().await; + + // start the batch update + let mut layer_map = self.layers.write().unwrap(); + let mut batch_updates = layer_map.batch_update(); + + let mut results = Vec::with_capacity(layers_to_evict.len()); + + for l in layers_to_evict.iter() { + let res = if cancel.is_cancelled() { + None + } else { + Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates)) + }; + results.push(res); + } + + // commit the updates & release locks + batch_updates.flush(); + drop(layer_map); + drop(layer_removal_guard); + + assert_eq!(results.len(), layers_to_evict.len()); + Ok(results) + } + + fn evict_layer_batch_impl( + &self, + _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + local_layer: &Arc, + batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, + ) -> anyhow::Result { + use super::layer_map::Replacement; + + if local_layer.is_remote_layer() { + return Ok(false); + } + let layer_metadata = LayerFileMetadata::new( local_layer .file_size() @@ -917,11 +985,7 @@ impl Timeline { ), }); - let gc_lock = self.layer_removal_cs.lock().await; - let mut layers = self.layers.write().unwrap(); - let mut updates = layers.batch_update(); - - let replaced = match updates.replace_historic(&local_layer, new_remote_layer)? { + let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? { Replacement::Replaced { .. 
} => { let layer_size = local_layer.file_size(); @@ -944,7 +1008,7 @@ impl Timeline { } Replacement::Unexpected(other) => { error!( - local_layer.ptr=?Arc::as_ptr(&local_layer), + local_layer.ptr=?Arc::as_ptr(local_layer), other.ptr=?Arc::as_ptr(&other), ?other, "failed to replace"); @@ -952,11 +1016,7 @@ impl Timeline { } }; - updates.flush(); - drop(layers); - drop(gc_lock); - - Ok(Some(replaced)) + Ok(replaced) } } From 1b9e5e84aa4daf58b5264ccf71e0ab6d254a47d2 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Wed, 8 Feb 2023 16:48:29 +0100 Subject: [PATCH 006/426] Add new storage hosts for placement group test (#3561) To test the placement group setup --- .github/ansible/staging.us-east-2.hosts.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index b46e729e32..9a1a095282 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -31,6 +31,8 @@ storage: ansible_host: i-01e31cdf7e970586a pageserver-3.us-east-2.aws.neon.build: ansible_host: i-0602a0291365ef7cc + pageserver-99.us-east-2.aws.neon.build: + ansible_host: i-0c39491109bb88824 safekeepers: hosts: @@ -40,3 +42,5 @@ storage: ansible_host: i-0171efc3604a7b907 safekeeper-2.us-east-2.aws.neon.build: ansible_host: i-0de0b03a51676a6ce + safekeeper-99.us-east-2.aws.neon.build: + ansible_host: i-0d61b6a2ea32028d5 From 371493ae32a1eef947c4df6da9d8b102e84fcfd2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 Feb 2023 16:08:01 +0000 Subject: [PATCH 007/426] Bump cryptography from 38.0.3 to 39.0.1 (#3565) --- poetry.lock | 81 ++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/poetry.lock b/poetry.lock index f14c495556..7e80b1e10a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -865,50 +865,47 @@ 
files = [ [[package]] name = "cryptography" -version = "38.0.3" +version = "39.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320"}, - {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c"}, - {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0"}, - {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748"}, - {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146"}, - {file = "cryptography-38.0.3-cp36-abi3-win32.whl", hash = 
"sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0"}, - {file = "cryptography-38.0.3-cp36-abi3-win_amd64.whl", hash = "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220"}, - {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd"}, - {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55"}, - {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249"}, - {file = "cryptography-38.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = 
"sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548"}, - {file = "cryptography-38.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a"}, - {file = "cryptography-38.0.3.tar.gz", hash = "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd"}, + {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"}, + {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"}, + {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"}, + {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"}, + {file = 
"cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"}, + {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"}, + {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"}, + {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"}, + {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"}, + {file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"}, + {file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"}, ] [package.dependencies] cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docs = ["sphinx (>=5.3.0)", 
"sphinx-rtd-theme (>=1.1.1)"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] -pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] +pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"] sdist = ["setuptools-rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] +test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"] +test-randomorder = ["pytest-randomly"] +tox = ["tox"] [[package]] name = "docker" @@ -1241,6 +1238,7 @@ category = "main" optional = false python-versions = "*" files = [ + {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, ] @@ -1718,6 +1716,7 @@ python-versions = ">=3.6" files = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -1751,6 +1750,7 @@ files = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -1762,6 +1762,7 @@ files = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = 
"psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -1794,18 +1795,7 @@ category = "main" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = 
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] @@ -2014,8 +2004,8 @@ files = [ [package.dependencies] pytest = [ - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] [[package]] @@ -2139,6 +2129,13 @@ files = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = 
"PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, From 2040db98ef7301d9b0a4007d3978d34388d40884 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 9 Feb 2023 11:20:10 +0200 Subject: [PATCH 008/426] Add docs for synthetic size calculation (#3328) --------- Co-authored-by: Heikki Linnakangas Co-authored-by: Anastasia Lubennikova --- docs/synthetic-size.md | 335 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 docs/synthetic-size.md diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md new file mode 100644 index 0000000000..8378efc842 --- /dev/null +++ b/docs/synthetic-size.md @@ -0,0 +1,335 @@ +# Synthetic size + +Neon storage has copy-on-write branching, which makes it difficult to +answer the question "how large is my database"? To give one reasonable +answer, we calculate _synthetic size_ for a project. + +The calculation is called "synthetic", because it is based purely on +the user-visible logical size, which is the size that you would see on +a standalone PostgreSQL installation, and the amount of WAL, which is +also the same as what you'd see on a standalone PostgreSQL, for the +same set of updates. + +The synthetic size does *not* depend on the actual physical size +consumed in the storage, or implementation details of the Neon storage +like garbage collection, compaction and compression. 
There is a +strong *correlation* between the physical size and the synthetic size, +but the synthetic size is designed to be independent of the +implementation details, so that any improvements we make in the +storage system simply reduce our COGS. And vice versa: any bugs or bad +implementation where we keep more data than we would need to, do not +change the synthetic size or incur any costs to the user. + +The synthetic size is calculated for the whole project. It is not +straightforward to attribute size to individual branches. See "What is +the size of an individual branch?" for discussion on those +difficulties. + +The synthetic size is designed to: + +- Take into account the copy-on-write nature of the storage. For + example, if you create a branch, it doesn't immediately add anything + to the synthetic size. It starts to affect the synthetic size only + as it diverges from the parent branch. + +- Be independent of any implementation details of the storage, like + garbage collection, remote storage, or compression. + +## Terms & assumptions + +- logical size is the size of a database *at a given point in + time*. It's the total size of all tables in all databases, as you + see with "\l+" in psql for example, plus the Postgres SLRUs and some + small amount of metadata. NOTE that currently, Neon does not include + the SLRUs and metadata in the logical size. See comment to `get_current_logical_size_non_incremental()`. + +- a "point in time" is defined as an LSN value. You can convert a + timestamp to an LSN, but the storage internally works with LSNs. + +- PITR horizon can be set per-branch. + +- PITR horizon can be set as a time interval, e.g. 5 days or hours, or + as amount of WAL, in bytes. If it's given as a time interval, it's + converted to an LSN for the calculation. + +- PITR horizon can be set to 0, if you don't want to retain any history.
+ +## Calculation + +Inputs to the calculation are: +- logical size of the database at different points in time, +- amount of WAL generated, and +- the PITR horizon settings + +The synthetic size is based on an idealistic model of the storage +system, where we pretend that the storage consists of two things: +- snapshots, containing a full snapshot of the database, at a given + point in time, and +- WAL. + +In the simple case that the project contains just one branch (main), +and a fixed PITR horizon, the synthetic size is the sum of: + +- the logical size of the branch *at the beginning of the PITR + horizon*, i.e. at the oldest point that you can still recover to, and +- the size of the WAL covering the PITR horizon. + +The snapshot allows you to recover to the beginning of the PITR +horizon, and the WAL allows you to recover from that point to any +point within the horizon. + +``` + WAL + -----------------------#########> + ^ + snapshot + +Legend: + ##### PITR horizon. This is the region that you can still access + with Point-in-time query and you can still create branches + from. + ----- history that has fallen out of the PITR horizon, and can no + longer be accessed +``` + +NOTE: This is not how the storage system actually works! The actual +implementation is also based on snapshots and WAL, but the snapshots +are taken for individual database pages and ranges of pages rather +than the whole database, and it is much more complicated. This model +is a reasonable approximation, however, to make the synthetic size a +useful proxy for the actual storage consumption. + + +## Example: Data is INSERTed + +For example, let's assume that your database contained 10 GB of data +at the beginning of the PITR horizon, and you have since then inserted +5 GB of additional data into it. The additional insertions of 5 GB of +data consume roughly 5 GB of WAL. 
In that case, the synthetic size is: + +> 10 GB (snapshot) + 5 GB (WAL) = 15 GB + +If you now set the PITR horizon on the project to 0, so that no +historical data is retained, then the beginning PITR horizon would be +at the end of the branch, so the size of the snapshot would be +calculated at the end of the branch, after the insertions. Then the +synthetic size is: + +> 15 GB (snapshot) + 0 GB (WAL) = 15 GB. + +In this case, the synthetic size is the same, regardless of the PITR horizon, +because all the history consists of inserts. The newly inserted data takes +up the same amount of space, whether it's stored as part of the logical +snapshot, or as WAL. (*) + +(*) This is a rough approximation. In reality, the WAL contains +headers and other overhead, and on the other hand, the logical +snapshot includes empty space on pages, so the size of insertions in +WAL can be smaller or greater than the size of the final table after +the insertions. But in most cases, it's in the same ballpark. + +## Example: Data is DELETEd + +Let's look at another example: + +Let's start again with a database that contains 10 GB of data. Then, +you DELETE 5 GB of the data, and run VACUUM to free up the space, so +that the logical size of the database is now only 5 GB. + +Let's assume that the WAL for the deletions and the vacuum take up +100 MB of space. In that case, the synthetic size of the project is: + +> 10 GB (snapshot) + 100 MB (WAL) = 10.1 GB + +This is much larger than the logical size of the database after the +deletions (5 GB). That's because the system still needs to retain the +deleted data, because it's still accessible to queries and branching +in the PITR window. + +If you now set the PITR horizon to 0 or just wait for time to pass so +that the data falls out of the PITR horizon, making the deleted data +inaccessible, the synthetic size shrinks: + +> 5 GB (snapshot) + 0 GB (WAL) = 5 GB + + +# Branching + +Things get more complicated with branching. 
Branches in Neon are +copy-on-write, which is also reflected in the synthetic size. + +When you create a branch, it doesn't immediately change the synthetic +size at all. The branch point is within the PITR horizon, and all the +data needed to recover to that point in time needs to be retained +anyway. + +However, if you make modifications on the branch, the system needs to +keep the WAL of those modifications. The WAL is included in the +synthetic size. + +## Example: branch and INSERT + +Let's assume that you again start with a 10 GB database. +On the main branch, you insert 2 GB of data. Then you create +a branch at that point, and insert another 3 GB of data on the +main branch, and 1 GB of data on the child branch + +``` + child +#####> + | + | WAL + main ---------###############> + ^ + snapshot +``` + +In this case, the synthetic size consists of: +- the snapshot at the beginning of the PITR horizon (10 GB) +- the WAL on the main branch (2 GB + 3 GB = 5 GB) +- the WAL on the child branch (1 GB) + +Total: 16 GB + +# Diverging branches + +If there is only a small amount of changes in the database on the +different branches, as in the previous example, the synthetic size +consists of a snapshot before the branch point, containing all the +shared data, and the WAL on both branches. However, if the branches +diverge a lot, it is more efficient to store a separate snapshot of +branches. + +## Example: diverging branches + +You start with a 10 GB database. You insert 5 GB of data on the main +branch. Then you create a branch, and immediately delete all the data +on the child branch and insert 5 GB of new data to it. Then you do the +same on the main branch. Let's assume +that the PITR horizon requires keeping the last 1 GB of WAL on the +both branches. 
+ +``` + snapshot + v WAL + child +---------##############> + | + | + main -------------+---------##############> + ^ WAL + snapshot +``` + +In this case, the synthetic size consists of: +- snapshot at the beginning of the PITR horizon on the main branch (4 GB) +- WAL on the main branch (1 GB) +- snapshot at the beginning of the PITR horizon on the child branch (4 GB) +- last 1 GB of WAL on the child branch (1 GB) + +Total: 10 GB + +The alternative way to store this would be to take only one snapshot +at the beginning of branch point, and keep all the WAL on both +branches. However, the size with that method would be larger, as it +would require one 10 GB snapshot, and 5 GB + 5 GB of WAL. It depends +on the amount of changes (WAL) on both branches, and the logical size +at the branch point, which method would result in a smaller synthetic +size. On each branch point, the system performs the calculation with +both methods, and uses the method that is cheaper, i.e. the one that +results in a smaller synthetic size. + +One way to think about this is that when you create a branch, it +starts out as a thin branch that only stores the WAL since the branch +point. As you modify it, and the amount of WAL grows, at some point +it becomes cheaper to store a completely new snapshot of the branch +and truncate the WAL. + + +# What is the size of an individual branch? + +Synthetic size is calculated for the whole project, and includes all +branches. There is no such thing as the size of a branch, because it +is not straightforward to attribute the parts of size to individual +branches. + +## Example: attributing size to branches + +(copied from https://github.com/neondatabase/neon/pull/2884#discussion_r1029365278) + +Imagine that you create two branches, A and B, at the same point from +main branch, and do a couple of small updates on both branches. Then +six months pass, and during those six months the data on the main +branch churns over completely multiple times.
The retention period is, +say 1 month. + +``` + +------> A + / +--------------------*-------------------------------> main + \ + +--------> B +``` + +In that situation, the synthetic tenant size would be calculated based +on a "logical snapshot" at the branch point, that is, the logical size +of the database at that point. Plus the WAL on branches A and B. Let's +say that the snapshot size is 10 GB, and the WAL is 1 MB on both +branches A and B. So the total synthetic storage size is 10002 +MB. (Let's ignore the main branch for now, that would be just added to +the sum) + +How would you break that down per branch? I can think of three +different ways to do it, and all of them have their own problems: + +### Subtraction method + +For each branch, calculate how much smaller the total synthetic size +would be, if that branch didn't exist. In other words, how much would +you save if you dropped the branch. With this method, the size of +branches A and B is 1 MB. + +With this method, the 10 GB shared logical snapshot is not included +for A nor B. So the size of all branches is not equal to the total +synthetic size of the tenant. If you drop branch A, you save 1 MB as +you'd expect, but also the size of B suddenly jumps from 1 MB to 10001 +MB, which might feel surprising. + +### Division method + +Divide the common parts evenly across all branches that need +them. With this method, the size of branches A and B would be 5001 MB. + +With this method, the sum of all branches adds up to the total +synthetic size. But it's surprising in other ways: if you drop branch +A, you might think that you save 5001 MB, but in reality you only save +1 MB, and the size of branch B suddenly grows from 5001 to 10001 MB. + +### Addition method + +For each branch, include all the snapshots and WAL that it depends on, +even if some of them are shared by other branches. With this method, +the size of branches A and B would be 10001 MB. 
+ +The surprise with this method is that the sum of all the branches is +larger than the total synthetic size. And if you drop branch A, the +total synthetic size doesn't fall by 10001 MB as you might think. + +# Alternatives + +A sort of cop-out method would be to show the whole tree of branches +graphically, and for each section of WAL or logical snapshot, display +the size of that section. You can then see which branches depend on +which sections, which sections are shared etc. That would be good to +have in the UI anyway. + +Or perhaps calculate per-branch numbers using the subtraction method, +and in addition to that, one more number for "shared size" that +includes all the data that is needed by more than one branch. + +## Which is the right method? + +The bottom line is that it's not straightforward to attribute the +synthetic size to individual branches. There are things we can do, and +all of those methods are pretty straightforward to implement, but they +all have their own problems. What makes sense depends a lot on what +you want to do with the number, what question you are trying to +answer. From f07d6433b6df8e3546daab312ffeb81399c13233 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 9 Feb 2023 13:02:07 +0200 Subject: [PATCH 009/426] fix: one leftover Arc::ptr_eq (#3573) @knizhnik noticed that one instance of `Arc::::ptr_eq` was missed in #3558. Now all `ptr_eq` which remain are in comments. --- pageserver/src/tenant/layer_map.rs | 5 ++++- pageserver/src/tenant/timeline.rs | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 9d8c825220..e446e34f4e 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -731,8 +731,11 @@ where Ok(()) } + /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables. + /// + /// Returns `true` if the two `Arc` point to the same layer, false otherwise. 
#[inline(always)] - fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { + pub fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { // "dyn Trait" objects are "fat pointers" in that they have two components: // - pointer to the object // - pointer to the vtable diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 838df6d884..e1156e7270 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2374,7 +2374,7 @@ impl Timeline { // Only one thread may call this function at a time (for this // timeline). If two threads tried to flush the same frozen // layer to disk at the same time, that would not work. - assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); + assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer)); // release lock on 'layers' } From 7ed9eb4a5601262aa95819a1a9958cb002d17171 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 9 Feb 2023 14:28:20 +0300 Subject: [PATCH 010/426] Add script for safekeeper tenants cleanup (#3452) This script can be used to remove tenant directories on safekeepers for projects which do not longer exist (deleted in the console). To run this script you need to upload it to safekeeper (i.e. with SSH), and run it with python3. Ansible can be used to run this script on multiple safekeepers. 
Fixes https://github.com/neondatabase/cloud/issues/3356 --- scripts/sk_cleanup_tenants/readme.md | 55 ++++++++++ scripts/sk_cleanup_tenants/remote.yaml | 80 +++++++++++++++ scripts/sk_cleanup_tenants/script.py | 133 +++++++++++++++++++++++++ 3 files changed, 268 insertions(+) create mode 100644 scripts/sk_cleanup_tenants/readme.md create mode 100644 scripts/sk_cleanup_tenants/remote.yaml create mode 100644 scripts/sk_cleanup_tenants/script.py diff --git a/scripts/sk_cleanup_tenants/readme.md b/scripts/sk_cleanup_tenants/readme.md new file mode 100644 index 0000000000..f1bb2d540e --- /dev/null +++ b/scripts/sk_cleanup_tenants/readme.md @@ -0,0 +1,55 @@ +# Cleanup script for safekeeper + +This script can be used to remove tenant directories on safekeepers for projects which no longer exist (deleted in the console). + +To run this script you need to upload it to safekeeper (e.g. with SSH), and run it with python3. Ansible can be used to run this script on multiple safekeepers. + +NOTE: Console queries to check that project is deleted are slow and inefficient. +If you want to run this script on safekeeper with many tenants, consider +making PR to console repo to make projects search by tenant_id faster.
+ +## How to run on a single node + +``` +zsh nsh safekeeper-0.us-east-2.aws.neon.build + +ls /storage/safekeeper/data/ | grep -v safekeeper > tenants.txt + +mkdir -p /storage/neon-trash/2023-01-01--cleanup + + export CONSOLE_API_TOKEN= +python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run + +cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run + +cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& tee logs.txt +``` + +## How to use ansible (staging) + +``` +cd ~/neon/.github/ansible + +export AWS_DEFAULT_PROFILE=dev + +ansible-playbook -i staging.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml + +# add --extra-vars "api_token=" to set console api token +``` + +## How to use ansible (prod) + +- Change `endpoint` in `script.py` to "https://console.neon.tech/api" + +``` +cd ~/neon/.github/ansible + +export AWS_DEFAULT_PROFILE=prod + +ansible-playbook -i prod.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml + +# add --extra-vars "api_token=" to set console api token +``` + + +> Heavily inspired with script for pageserver cleanup: https://gist.github.com/problame/bafb6ca6334f0145757238e61380c3f1/9bef1845a8291ebfa1f3a51eb79c01d12498b2b5 \ No newline at end of file diff --git a/scripts/sk_cleanup_tenants/remote.yaml b/scripts/sk_cleanup_tenants/remote.yaml new file mode 100644 index 0000000000..c7eeb8516c --- /dev/null +++ b/scripts/sk_cleanup_tenants/remote.yaml @@ -0,0 +1,80 @@ +- name: Test safekeepers + hosts: safekeepers + gather_facts: False + remote_user: "{{ remote_user }}" + + vars: + script_dir: /storage/ansible_sk_cleanup 
+ tenants_file: "{{ script_dir }}/tenants.txt" + trash_dir: /storage/neon-trash/2023-01-01--changeme + + tasks: + + - name: create script directory + file: + path: "{{ script_dir }}" + state: directory + mode: 0755 + tags: + - safekeeper + + - name: create trash dir + file: + path: "{{ trash_dir }}" + state: directory + mode: 0755 + tags: + - safekeeper + + - name: collect all tenant_ids to tenants.txt + shell: + cmd: ls /storage/safekeeper/data/ | grep -v safekeeper > {{ tenants_file }} + tags: + - safekeeper + + - name: count tenants + shell: + cmd: wc -l {{ tenants_file }} + register: tenants_count + tags: + - safekeeper + + - debug: msg="{{ tenants_count.stdout }}" + + - name: fetch safekeeper_id + shell: + cmd: cat /storage/safekeeper/data/safekeeper.id + register: safekeeper_id + tags: + - safekeeper + + - debug: msg="{{ safekeeper_id.stdout }}" + + - name: copy script.py to safekeeper + copy: + src: script.py + dest: "{{ script_dir }}" + mode: 0755 + tags: + - safekeeper + + - name: Run an async task + shell: + chdir: "{{ script_dir }}" + cmd: "cat tenants.txt | python3 script.py --trash-dir {{ trash_dir }} --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& cat > {{ script_dir }}/run-`date +%Y-%m-%d-%H.%M.%S`.log" + args: + executable: /bin/bash + environment: + CONSOLE_API_TOKEN: "{{ api_token }}" + async: 30000 + poll: 0 + register: bg_async_task + + - name: Check on an async task + async_status: + jid: "{{ bg_async_task.ansible_job_id }}" + become: true + register: job_result + until: job_result.finished + retries: 3000 + delay: 10 diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py new file mode 100644 index 0000000000..4d010d85ea --- /dev/null +++ b/scripts/sk_cleanup_tenants/script.py @@ -0,0 +1,133 @@ +import argparse +import logging +import os +import shutil +import sys +from pathlib import Path + +import requests + +level = logging.INFO +logging.basicConfig( + 
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=level, +) + +parser = argparse.ArgumentParser() +parser.add_argument("--trash-dir", required=True, type=Path) +parser.add_argument("--dry-run", action="store_true") +parser.add_argument("--safekeeper-id", required=True, type=int) +parser.add_argument("--safekeeper-host", required=True, type=str) +args = parser.parse_args() + +access_key = os.getenv("CONSOLE_API_TOKEN") +endpoint: str = "https://console.stage.neon.tech/api" + +trash_dir: Path = args.trash_dir +dry_run: bool = args.dry_run +logging.info(f"dry_run={dry_run}") +sk_id: int = args.safekeeper_id +sk_host: str = args.safekeeper_host + +assert trash_dir.is_dir() + +### + + +def console_get(rel_url): + r = requests.get( + f"{endpoint}{rel_url}", + headers={ + "Authorization": f"Bearer {access_key}", + "Content-Type": "application/json", + "Accept": "application/json", + }, + ) + r.raise_for_status() + return r + + +def tenant_is_deleted_in_console(tenant_id): + r = console_get(f"/v1/admin/projects?search={tenant_id}&show_deleted=true") + r = r.json() + results = r["data"] + assert len(results) == 1, f"unexpected results len: {results}" + r = results[0] + assert r["tenant"] == tenant_id, f"tenant id doesn't match: {r}" + assert r["safekeepers"] is not None, f"safekeepers is None: {r}" + assert any(sk["id"] == sk_id for sk in r["safekeepers"]), f"safekeeper id not found: {r}" + assert "deleted" in r, f"{r}" + return r["deleted"] is True + + +def call_delete_tenant_api(tenant_id): + r = requests.delete(f"http://{sk_host}:7676/v1/tenant/{tenant_id}") + r.raise_for_status() + return r + + +def cleanup_tenant(tenant_id): + + tenant_dir = Path(f"/storage/safekeeper/data/{tenant_id}") + + if not tenant_dir.exists(): + logging.info("tenant directory doesn't exist, assuming it has been cleaned already") + return + + if not tenant_is_deleted_in_console(tenant_id): + logging.info("tenant is not 
deleted in console, skipping") + return + + logging.info("assertions passed") + + if dry_run: + return + + logging.info("deleting tenant") + + tenant_dir_in_trash = trash_dir / tenant_dir.relative_to("/") + tenant_dir_in_trash.parent.mkdir(parents=True, exist_ok=True) + + assert not tenant_dir_in_trash.exists(), f"{tenant_dir_in_trash}" + assert tenant_dir_in_trash.parent.exists(), f"{tenant_dir_in_trash}" + # double-check + assert tenant_dir.exists(), f"{tenant_dir}" + assert tenant_dir.is_dir(), f"{tenant_dir}" + + logging.info(f"copying {tenant_dir} to {tenant_dir_in_trash}") + shutil.copytree(src=tenant_dir, dst=tenant_dir_in_trash, symlinks=False, dirs_exist_ok=False) + + logging.info(f"deleting {tenant_dir}") + call_delete_tenant_api(tenant_id) + + logging.info("tenant is now deleted, checking that it's gone") + assert not tenant_dir.exists(), f"{tenant_dir}" + + +if os.path.exists("script.pid"): + logging.info( + f"script is already running, with pid={Path('script.pid').read_text()}. Terminate it first." 
+ ) + exit(1) + +with open("script.pid", "w", encoding="utf-8") as f: + f.write(str(os.getpid())) + +logging.info(f"started script.py, pid={os.getpid()}") + +for line in sys.stdin: + tenant_id = line.strip() + try: + logging.info(f"start tenant {tenant_id}") + cleanup_tenant(tenant_id) + logging.info(f"done tenant {tenant_id}") + except KeyboardInterrupt: + print("KeyboardInterrupt exception is caught") + break + except: # noqa: E722 + logging.exception(f"failed to clean up tenant {tenant_id}") + +logging.info(f"finished script.py, pid={os.getpid()}") + +os.remove("script.pid") From 446a39e9690fc2b3e2ca290a73673c39cad8ade5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 9 Feb 2023 11:44:30 +0100 Subject: [PATCH 011/426] make LayerAccesStatFullDetails Copy Method to_api_model renamed to as_api_model because of Clippy complaint: https://rust-lang.github.io/rust-clippy/master/index.html#wrong_self_convention --- pageserver/src/tenant/storage_layer.rs | 15 +++++++-------- .../src/tenant/storage_layer/delta_layer.rs | 2 +- .../src/tenant/storage_layer/image_layer.rs | 2 +- .../src/tenant/storage_layer/remote_layer.rs | 4 ++-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e85359af16..3b9312360b 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -103,7 +103,7 @@ struct LayerAccessStatsInner { last_residence_changes: HistoryBufferWithDropCounter, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy)] struct LayerAccessStatFullDetails { when: SystemTime, task_kind: TaskKind, @@ -126,7 +126,7 @@ fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 { } impl LayerAccessStatFullDetails { - fn to_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { + fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails { let Self { when, task_kind, @@ -189,14 +189,13 @@ impl 
LayerAccessStats { task_kind, access_kind, }; - inner - .first_access - .get_or_insert_with(|| this_access.clone()); + inner.first_access.get_or_insert(this_access); inner.count_by_access_kind[access_kind] += 1; inner.task_kind_flag |= task_kind; inner.last_accesses.write(this_access); } - fn to_api_model( + + fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { @@ -217,8 +216,8 @@ impl LayerAccessStats { .iter() .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros .collect(), - first: first_access.as_ref().map(|a| a.to_api_model()), - accesses_history: last_accesses.map(|m| m.to_api_model()), + first: first_access.as_ref().map(|a| a.as_api_model()), + accesses_history: last_accesses.map(|m| m.as_api_model()), residence_events_history: last_residence_changes.clone(), }; match reset { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 9b322faa65..c955995bf2 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -450,7 +450,7 @@ impl PersistentLayer for DeltaLayer { let layer_file_name = self.filename().file_name(); let lsn_range = self.get_lsn_range(); - let access_stats = self.access_stats.to_api_model(reset); + let access_stats = self.access_stats.as_api_model(reset); HistoricLayerInfo::Delta { layer_file_name, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 86c1aee619..8ba901521f 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -269,7 +269,7 @@ impl PersistentLayer for ImageLayer { layer_file_size: Some(self.file_size), lsn_start: lsn_range.start, remote: false, - access_stats: self.access_stats.to_api_model(reset), + access_stats: self.access_stats.as_api_model(reset), } } diff --git 
a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 7391875d0c..51bb4dcc2a 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -171,7 +171,7 @@ impl PersistentLayer for RemoteLayer { lsn_start: lsn_range.start, lsn_end: lsn_range.end, remote: true, - access_stats: self.access_stats.to_api_model(reset), + access_stats: self.access_stats.as_api_model(reset), } } else { HistoricLayerInfo::Image { @@ -179,7 +179,7 @@ impl PersistentLayer for RemoteLayer { layer_file_size: self.layer_metadata.file_size(), lsn_start: lsn_range.start, remote: true, - access_stats: self.access_stats.to_api_model(reset), + access_stats: self.access_stats.as_api_model(reset), } } } From 1fdf01e3bc136fee5a9990086e1bd74434198cde Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 9 Feb 2023 13:55:37 +0200 Subject: [PATCH 012/426] fix: readable Debug for Layers (#3575) #3536 added the custom Debug implementations but it using derived Debug on Key lead to too verbose output. Instead of making `Key`'s `Debug` unconditionally or conditionally do the `Display` variant (for table space'd keys), opted to build a newtype to provide `Debug` for `Range` via `Display` which seemed to work unconditionally. Also orders Key to have: 1. comment, 2. derive, 3. `struct Key`. 
--- pageserver/src/repository.rs | 2 +- pageserver/src/tenant/storage_layer.rs | 11 ++++++++ .../src/tenant/storage_layer/delta_layer.rs | 4 ++- .../src/tenant/storage_layer/filename.rs | 26 +++++++++++++++++-- .../src/tenant/storage_layer/image_layer.rs | 4 ++- 5 files changed, 42 insertions(+), 5 deletions(-) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 092503b7c5..047fa761c3 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -7,11 +7,11 @@ use std::fmt; use std::ops::{AddAssign, Range}; use std::time::Duration; -#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. /// /// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs /// for what we actually store in these fields. +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] pub struct Key { pub field1: u8, pub field2: u32, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3b9312360b..6cf38f8737 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -448,3 +448,14 @@ enum PathOrConf { Path(PathBuf), Conf(&'static PageServerConf), } + +/// Range wrapping newtype, which uses display to render Debug. +/// +/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers. 
+struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range); + +impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}..{}", self.0.start, self.0.end) + } +} diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c955995bf2..4d1e08322d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -194,8 +194,10 @@ pub struct DeltaLayer { impl std::fmt::Debug for DeltaLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use super::RangeDisplayDebug; + f.debug_struct("DeltaLayer") - .field("key_range", &self.key_range) + .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn_range", &self.lsn_range) .field("file_size", &self.file_size) .field("inner", &self.inner) diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index bd3d2c42c1..efd0769886 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -10,12 +10,23 @@ use std::str::FromStr; use utils::lsn::Lsn; // Note: Timeline::load_layer_map() relies on this sort order -#[derive(Debug, PartialEq, Eq, Clone, Hash)] +#[derive(PartialEq, Eq, Clone, Hash)] pub struct DeltaFileName { pub key_range: Range, pub lsn_range: Range, } +impl std::fmt::Debug for DeltaFileName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use super::RangeDisplayDebug; + + f.debug_struct("DeltaFileName") + .field("key_range", &RangeDisplayDebug(&self.key_range)) + .field("lsn_range", &self.lsn_range) + .finish() + } +} + impl PartialOrd for DeltaFileName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -100,12 +111,23 @@ impl fmt::Display for DeltaFileName { } } -#[derive(Debug, PartialEq, Eq, Clone, Hash)] 
+#[derive(PartialEq, Eq, Clone, Hash)] pub struct ImageFileName { pub key_range: Range, pub lsn: Lsn, } +impl std::fmt::Debug for ImageFileName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use super::RangeDisplayDebug; + + f.debug_struct("ImageFileName") + .field("key_range", &RangeDisplayDebug(&self.key_range)) + .field("lsn", &self.lsn) + .finish() + } +} + impl PartialOrd for ImageFileName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 8ba901521f..e48abd38dd 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -119,8 +119,10 @@ pub struct ImageLayer { impl std::fmt::Debug for ImageLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use super::RangeDisplayDebug; + f.debug_struct("ImageLayer") - .field("key_range", &self.key_range) + .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("file_size", &self.file_size) .field("lsn", &self.lsn) .field("inner", &self.inner) From 175a577ad42476a49978d277a7428e8a078dd6ae Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 6 Feb 2023 19:20:25 +0100 Subject: [PATCH 013/426] automatic layer eviction This patch adds a per-timeline periodic task that executes an eviction policy. The eviction policy is configurable per tenant. Two policies exist: - NoEviction (the default one) - LayerAccessThreshold The LayerAccessThreshold policy examines the last access timestamp per layer in the layer map and evicts the layer if that last access is further in the past than a configurable threshold value. This policy kind is evaluated periodically at a configurable period. It logs a summary statistic at `info!()` or `warn!()` level, depending on whether any evictions failed. This feature has no explicit killswitch since it's off by default. 
--- Cargo.lock | 3 + Cargo.toml | 1 + control_plane/Cargo.toml | 1 + control_plane/src/pageserver.rs | 5 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 12 +- pageserver/Cargo.toml | 3 +- pageserver/src/config.rs | 7 + pageserver/src/http/routes.rs | 8 + pageserver/src/task_mgr.rs | 3 + pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 31 +++ pageserver/src/tenant/storage_layer.rs | 73 +++++-- pageserver/src/tenant/timeline.rs | 14 +- .../src/tenant/timeline/eviction_task.rs | 199 ++++++++++++++++++ 15 files changed, 339 insertions(+), 23 deletions(-) create mode 100644 pageserver/src/tenant/timeline/eviction_task.rs diff --git a/Cargo.lock b/Cargo.lock index 6be08d16b1..d526e48198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -917,6 +917,7 @@ dependencies = [ "reqwest", "safekeeper_api", "serde", + "serde_json", "serde_with", "storage_broker", "tar", @@ -2421,6 +2422,7 @@ dependencies = [ "crc32c", "criterion", "crossbeam-utils", + "either", "enum-map", "enumset", "fail", @@ -2484,6 +2486,7 @@ dependencies = [ "enum-map", "postgres_ffi", "serde", + "serde_json", "serde_with", "utils", "workspace_hack", diff --git a/Cargo.toml b/Cargo.toml index 9033671f55..eaa25b423a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" crossbeam-utils = "0.8.5" +either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 0b2f561d39..309887e1fa 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -15,6 +15,7 @@ postgres.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } serde.workspace = true +serde_json.workspace = true serde_with.workspace = true tar.workspace = true thiserror.workspace = true diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 9cebe028e4..c49bd39f09 100644 --- 
a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -419,6 +419,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'trace_read_requests' as bool")?, + eviction_policy: settings + .get("eviction_policy") + .map(|x| serde_json::from_str(x)) + .transpose() + .context("Failed to parse 'eviction_policy' json")?, }) .send()? .error_from_body()?; diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index dafb246632..7709da1072 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -14,5 +14,6 @@ byteorder.workspace = true utils.workspace = true postgres_ffi.workspace = true enum-map.workspace = true +serde_json.workspace = true workspace_hack.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9cdcf3a173..3ac7e31ec2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -155,6 +155,11 @@ pub struct TenantConfigRequest { pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub trace_read_requests: Option, + // We defer the parsing of the eviction_policy field to the request handler. + // Otherwise we'd have to move the types for eviction policy into this package. + // We might do that once the eviction feature has stabilizied. + // For now, this field is not even documented in the openapi_spec.yml. + pub eviction_policy: Option, } impl TenantConfigRequest { @@ -174,6 +179,7 @@ impl TenantConfigRequest { lagging_wal_timeout: None, max_lsn_wal_lag: None, trace_read_requests: None, + eviction_policy: None, } } } @@ -263,11 +269,11 @@ pub struct LayerResidenceEvent { /// #[serde(rename = "timestamp_millis_since_epoch")] #[serde_as(as = "serde_with::TimestampMilliSeconds")] - timestamp: SystemTime, + pub timestamp: SystemTime, /// The new residence status of the layer. 
- status: LayerResidenceStatus, + pub status: LayerResidenceStatus, /// The reason why we had to record this event. - reason: LayerResidenceEventReason, + pub reason: LayerResidenceEventReason, } /// The reason for recording a given [`ResidenceEvent`]. diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index f3ad2c5de6..d2f0b84863 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -23,6 +23,7 @@ const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true crossbeam-utils.workspace = true +either.workspace = true fail.workspace = true futures.workspace = true git-version.workspace = true @@ -51,7 +52,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-postgres.workspace = true tokio-util.workspace = true -toml_edit.workspace = true +toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true url.workspace = true walkdir.workspace = true diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f88895a970..309e5367a4 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -731,6 +731,13 @@ impl PageServerConf { })?); } + if let Some(eviction_policy) = item.get("eviction_policy") { + t_conf.eviction_policy = Some( + toml_edit::de::from_item(eviction_policy.clone()) + .context("parse eviction_policy")?, + ); + } + Ok(t_conf) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 229cf96ee3..6a9232e097 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -797,6 +797,14 @@ async fn update_tenant_config_handler( ); } + if let Some(eviction_policy) = request_data.eviction_policy { + tenant_conf.eviction_policy = Some( + serde_json::from_value(eviction_policy) + .context("parse field `eviction_policy`") + .map_err(ApiError::BadRequest)?, + ); + } + let state = get_state(&request); mgr::set_new_tenant_config(state.conf, tenant_conf, 
tenant_id) .instrument(info_span!("tenant_config", tenant = ?tenant_id)) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index c4f213e755..2734031a09 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -231,6 +231,9 @@ pub enum TaskKind { // Compaction. One per tenant. Compaction, + // Eviction. One per timeline. + Eviction, + // Initial logical size calculation InitialLogicalSizeCalculation, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bc943372f8..23210b98d5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2757,6 +2757,7 @@ pub mod harness { lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), trace_read_requests: Some(tenant_conf.trace_read_requests), + eviction_policy: Some(tenant_conf.eviction_policy), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 087cff2537..fca08dd51a 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -91,6 +91,7 @@ pub struct TenantConf { /// to avoid eager reconnects. 
pub max_lsn_wal_lag: NonZeroU64, pub trace_read_requests: bool, + pub eviction_policy: EvictionPolicy, } /// Same as TenantConf, but this struct preserves the information about @@ -153,6 +154,34 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub trace_read_requests: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub eviction_policy: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum EvictionPolicy { + NoEviction, + LayerAccessThreshold(EvictionPolicyLayerAccessThreshold), +} + +impl EvictionPolicy { + pub fn discriminant_str(&self) -> &'static str { + match self { + EvictionPolicy::NoEviction => "NoEviction", + EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold", + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvictionPolicyLayerAccessThreshold { + #[serde(with = "humantime_serde")] + pub period: Duration, + #[serde(with = "humantime_serde")] + pub threshold: Duration, } impl TenantConfOpt { @@ -189,6 +218,7 @@ impl TenantConfOpt { trace_read_requests: self .trace_read_requests .unwrap_or(global_conf.trace_read_requests), + eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), } } @@ -261,6 +291,7 @@ impl Default for TenantConf { max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) .expect("cannot parse default max walreceiver Lsn wal lag"), trace_read_requests: false, + eviction_policy: EvictionPolicy::NoEviction, } } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6cf38f8737..9198cfd1df 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -13,6 +13,7 @@ use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; +use either::Either; use enum_map::EnumMap; use 
enumset::EnumSet; use pageserver_api::models::LayerAccessKind; @@ -92,7 +93,23 @@ pub enum ValueReconstructResult { } #[derive(Debug)] -pub struct LayerAccessStats(Mutex); +pub struct LayerAccessStats(Mutex); + +/// This struct holds two instances of [`LayerAccessStatsInner`]. +/// Accesses are recorded to both instances. +/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`]. +/// The `for_eviction_policy` is never reset. +#[derive(Debug, Default, Clone)] +struct LayerAccessStatsLocked { + for_scraping_api: LayerAccessStatsInner, + for_eviction_policy: LayerAccessStatsInner, +} + +impl LayerAccessStatsLocked { + fn iter_mut(&mut self) -> impl Iterator { + [&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter() + } +} #[derive(Debug, Default, Clone)] struct LayerAccessStatsInner { @@ -104,10 +121,10 @@ struct LayerAccessStatsInner { } #[derive(Debug, Clone, Copy)] -struct LayerAccessStatFullDetails { - when: SystemTime, - task_kind: TaskKind, - access_kind: LayerAccessKind, +pub(super) struct LayerAccessStatFullDetails { + pub(super) when: SystemTime, + pub(super) task_kind: TaskKind, + pub(super) access_kind: LayerAccessKind, } #[derive(Clone, Copy, strum_macros::EnumString)] @@ -142,13 +159,13 @@ impl LayerAccessStatFullDetails { impl LayerAccessStats { pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default())); + let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); new.record_residence_event(status, LayerResidenceEventReason::LayerLoad); new } pub(crate) fn for_new_layer_file() -> Self { - let new = LayerAccessStats(Mutex::new(LayerAccessStatsInner::default())); + let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); new.record_residence_event( LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, @@ -176,37 +193,43 @@ impl LayerAccessStats { status: 
LayerResidenceStatus, reason: LayerResidenceEventReason, ) { - let mut inner = self.0.lock().unwrap(); - inner - .last_residence_changes - .write(LayerResidenceEvent::new(status, reason)); + let mut locked = self.0.lock().unwrap(); + locked.iter_mut().for_each(|inner| { + inner + .last_residence_changes + .write(LayerResidenceEvent::new(status, reason)) + }); } fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) { - let mut inner = self.0.lock().unwrap(); let this_access = LayerAccessStatFullDetails { when: SystemTime::now(), task_kind, access_kind, }; - inner.first_access.get_or_insert(this_access); - inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= task_kind; - inner.last_accesses.write(this_access); + + let mut locked = self.0.lock().unwrap(); + locked.iter_mut().for_each(|inner| { + inner.first_access.get_or_insert(this_access); + inner.count_by_access_kind[access_kind] += 1; + inner.task_kind_flag |= task_kind; + inner.last_accesses.write(this_access); + }) } fn as_api_model( &self, reset: LayerAccessStatsReset, ) -> pageserver_api::models::LayerAccessStats { - let mut inner = self.0.lock().unwrap(); + let mut locked = self.0.lock().unwrap(); + let inner = &mut locked.for_scraping_api; let LayerAccessStatsInner { first_access, count_by_access_kind, task_kind_flag, last_accesses, last_residence_changes, - } = &*inner; + } = inner; let ret = pageserver_api::models::LayerAccessStats { access_count_by_access_kind: count_by_access_kind .iter() @@ -231,6 +254,20 @@ impl LayerAccessStats { } ret } + + pub(super) fn most_recent_access_or_residence_event( + &self, + ) -> Either { + let locked = self.0.lock().unwrap(); + let inner = &locked.for_eviction_policy; + match inner.last_accesses.recent() { + Some(a) => Either::Left(*a), + None => match inner.last_residence_changes.recent() { + Some(e) => Either::Right(e.clone()), + None => unreachable!("constructors for LayerAccessStats ensure that there's always a residence change 
event"), + } + } + } } /// Supertrait of the [`Layer`] trait that captures the bare minimum interface diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e1156e7270..bcbf8a12b4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,6 @@ //! +mod eviction_task; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context}; @@ -47,7 +48,7 @@ use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; -use crate::tenant::config::TenantConfOpt; +use crate::tenant::config::{EvictionPolicy, TenantConfOpt}; use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; @@ -801,6 +802,7 @@ impl Timeline { pub fn activate(self: &Arc) { self.set_state(TimelineState::Active); self.launch_wal_receiver(); + self.launch_eviction_task(); } pub fn set_state(&self, new_state: TimelineState) { @@ -889,7 +891,10 @@ impl Timeline { } } + /// Evict multiple layers at once, continuing through errors. + /// /// Try to evict the given `layers_to_evict` by + /// /// 1. Replacing the given layer object in the layer map with a corresponding [`RemoteLayer`] object. /// 2. Deleting the now unreferenced layer file from disk. /// @@ -1057,6 +1062,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } + fn get_eviction_policy(&self) -> EvictionPolicy { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .eviction_policy + .unwrap_or(self.conf.default_tenant_conf.eviction_policy) + } + /// Open a Timeline handle. /// /// Loads the metadata for the timeline into memory, but not the layer map. 
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs new file mode 100644 index 0000000000..e3e7ce4c9d --- /dev/null +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -0,0 +1,199 @@ +//! The per-timeline layer eviction task. + +use std::{ + ops::ControlFlow, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use either::Either; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, instrument, warn}; + +use crate::{ + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + tenant::{ + config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, + storage_layer::PersistentLayer, + }, +}; + +use super::Timeline; + +impl Timeline { + pub(super) fn launch_eviction_task(self: &Arc) { + let self_clone = Arc::clone(self); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::Eviction, + Some(self.tenant_id), + Some(self.timeline_id), + &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id), + false, + async move { + self_clone.eviction_task(task_mgr::shutdown_token()).await; + info!("eviction task finishing"); + Ok(()) + }, + ); + } + + #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] + async fn eviction_task(self: Arc, cancel: CancellationToken) { + loop { + let policy = self.get_eviction_policy(); + let cf = self.eviction_iteration(&policy, cancel.clone()).await; + match cf { + ControlFlow::Break(()) => break, + ControlFlow::Continue(sleep_until) => { + tokio::select! 
{ + _ = cancel.cancelled() => { + info!("shutting down"); + break; + } + _ = tokio::time::sleep_until(sleep_until) => { } + } + } + } + } + } + + #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))] + async fn eviction_iteration( + self: &Arc, + policy: &EvictionPolicy, + cancel: CancellationToken, + ) -> ControlFlow<(), Instant> { + debug!("eviction iteration: {policy:?}"); + match policy { + EvictionPolicy::NoEviction => { + // check again in 10 seconds; XXX config watch mechanism + ControlFlow::Continue(Instant::now() + Duration::from_secs(10)) + } + EvictionPolicy::LayerAccessThreshold(p) => { + let start = Instant::now(); + match self.eviction_iteration_threshold(p, cancel).await { + ControlFlow::Break(()) => return ControlFlow::Break(()), + ControlFlow::Continue(()) => (), + } + let elapsed = start.elapsed(); + if elapsed > p.period { + warn!( + configured_period = %humantime::format_duration(p.period), + last_period = %humantime::format_duration(elapsed), + "this eviction period took longer than the configured period" + ); + } + ControlFlow::Continue(start + p.period) + } + } + } + + async fn eviction_iteration_threshold( + self: &Arc, + p: &EvictionPolicyLayerAccessThreshold, + cancel: CancellationToken, + ) -> ControlFlow<()> { + let now = SystemTime::now(); + + #[allow(dead_code)] + #[derive(Debug, Default)] + struct EvictionStats { + not_considered_due_to_clock_skew: usize, + candidates: usize, + evicted: usize, + errors: usize, + not_evictable: usize, + skipped_for_shutdown: usize, + } + let mut stats = EvictionStats::default(); + // Gather layers for eviction. + // NB: all the checks can be invalidated as soon as we release the layer map lock. + // We don't want to hold the layer map lock during eviction. + // So, we just need to deal with this. 
+ let candidates: Vec> = { + let layers = self.layers.read().unwrap(); + let mut candidates = Vec::new(); + for hist_layer in layers.iter_historic_layers() { + if hist_layer.is_remote_layer() { + continue; + } + let last_activity_ts = match hist_layer + .access_stats() + .most_recent_access_or_residence_event() + { + Either::Left(mra) => mra.when, + Either::Right(re) => re.timestamp, + }; + let no_activity_for = match now.duration_since(last_activity_ts) { + Ok(d) => d, + Err(_e) => { + // NB: don't log the error. If there are many layers and the system clock + // is skewed, we'd be flooding the log. + stats.not_considered_due_to_clock_skew += 1; + continue; + } + }; + if no_activity_for > p.threshold { + candidates.push(hist_layer) + } + } + candidates + }; + stats.candidates = candidates.len(); + + let remote_client = match self.remote_client.as_ref() { + None => { + error!( + num_candidates = candidates.len(), + "no remote storage configured, cannot evict layers" + ); + return ControlFlow::Continue(()); + } + Some(c) => c, + }; + + let results = match self + .evict_layer_batch(remote_client, &candidates[..], cancel) + .await + { + Err(pre_err) => { + stats.errors += candidates.len(); + error!("could not do any evictions: {pre_err:#}"); + return ControlFlow::Continue(()); + } + Ok(results) => results, + }; + assert_eq!(results.len(), candidates.len()); + for (l, result) in candidates.iter().zip(results) { + match result { + None => { + stats.skipped_for_shutdown += 1; + } + Some(Ok(true)) => { + debug!("evicted layer {l:?}"); + stats.evicted += 1; + } + Some(Ok(false)) => { + debug!("layer is not evictable: {l:?}"); + stats.not_evictable += 1; + } + Some(Err(e)) => { + // This variant is the case where an unexpected error happened during eviction. + // Expected errors that result in non-eviction are `Some(Ok(false))`. + // So, dump Debug here to gather as much info as possible in this rare case. 
+ warn!("failed to evict layer {l:?}: {e:?}"); + stats.errors += 1; + } + } + } + if stats.not_considered_due_to_clock_skew > 0 || stats.errors > 0 || stats.not_evictable > 0 + { + warn!(stats=?stats, "eviction iteration complete"); + } else { + info!(stats=?stats, "eviction iteration complete"); + } + ControlFlow::Continue(()) + } +} From a4256b325032ccb42e1b4e4b2876295b0dde31ca Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 8 Feb 2023 17:50:22 +0100 Subject: [PATCH 014/426] allow on-demand downloads in walreceiver connection handler Without this patch, basebackup fails if we evict all layers before that. This slipped in as part of commit 01b4b0c2f3731f16f4b9b1cfcb5e7937c76df989 Author: Christian Schwarz Date: Fri Jan 13 17:02:22 2023 +0100 Introduce RequestContext --- .../src/tenant/timeline/walreceiver/connection_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index cd7c7c51d2..64a79b6d1b 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -13,7 +13,7 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, ti use super::TaskStateUpdate; use crate::broker_client::get_broker_client; -use crate::context::RequestContext; +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::WALRECEIVER_RUNTIME; use crate::task_mgr::{self, TaskKind}; use crate::tenant::Timeline; @@ -413,7 +413,7 @@ impl WalreceiverState { let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, - ctx.download_behavior(), + DownloadBehavior::Download, ); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { From 9657459d802202ad42a086ac75c12f06fba84f5b Mon Sep 17 00:00:00 2001 From: 
Dmitry Ivanov Date: Fri, 10 Feb 2023 12:45:38 +0300 Subject: [PATCH 015/426] [proxy] Fix possible unsoundness in the websocket machinery (#3569) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR replaces the ill-advised `unsafe Sync` impl with a de-facto standard way to solve the underlying problem. TLDR: - tokio::task::spawn requires future to be Send - ∀t. (t : Sync) <=> (&t : Send) - ∀t. (t : Send + !Sync) => (&t : !Send) --- Cargo.lock | 1 + Cargo.toml | 3 +- proxy/Cargo.toml | 1 + proxy/src/http/websocket.rs | 203 ++++++++++++++++-------------------- 4 files changed, 93 insertions(+), 115 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d526e48198..67e54d3833 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2901,6 +2901,7 @@ dependencies = [ "serde_json", "sha2", "socket2", + "sync_wrapper", "thiserror", "tls-listener", "tokio", diff --git a/Cargo.toml b/Cargo.toml index eaa25b423a..4e4667f253 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,7 +69,6 @@ once_cell = "1.13" opentelemetry = "0.18.0" opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.10.0" -tracing-opentelemetry = "0.18.0" parking_lot = "0.12" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency @@ -93,6 +92,7 @@ socket2 = "0.4.4" strum = "0.24" strum_macros = "0.24" svg_fmt = "0.4.1" +sync_wrapper = "0.1.2" tar = "0.4" thiserror = "1.0" tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] } @@ -105,6 +105,7 @@ toml = "0.5" toml_edit = { version = "0.17", features = ["easy"] } tonic = {version = "0.8", features = ["tls", "tls-roots"]} tracing = "0.1" +tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2" uuid = { version = "1.2", features = ["v4", "serde"] } diff --git 
a/proxy/Cargo.toml b/proxy/Cargo.toml index 1ff7eebd98..152c83e4a0 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -43,6 +43,7 @@ serde.workspace = true serde_json.workspace = true sha2.workspace = true socket2.workspace = true +sync_wrapper.workspace = true thiserror.workspace = true tls-listener.workspace = true tokio-postgres.workspace = true diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index bedded7567..fb7b1cfb5e 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -1,161 +1,136 @@ +use crate::{ + cancellation::CancelMap, config::ProxyConfig, error::io_error, proxy::handle_ws_client, +}; use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; -use hyper::server::accept; -use hyper::server::conn::AddrIncoming; -use hyper::upgrade::Upgraded; -use hyper::{Body, Request, Response, StatusCode}; -use hyper_tungstenite::{tungstenite, WebSocketStream}; -use hyper_tungstenite::{tungstenite::Message, HyperWebsocket}; +use hyper::{ + server::{accept, conn::AddrIncoming}, + upgrade::Upgraded, + Body, Request, Response, StatusCode, +}; +use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; -use tokio::net::TcpListener; - -use std::convert::Infallible; -use std::future::ready; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{ready, Context, Poll}; +use std::{ + convert::Infallible, + future::ready, + pin::Pin, + sync::Arc, + task::{ready, Context, Poll}, +}; use tls_listener::TlsListener; - -use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; - +use tokio::{ + io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}, + net::TcpListener, +}; use tracing::{error, info, info_span, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; -use crate::cancellation::CancelMap; -use crate::config::ProxyConfig; -use crate::proxy::handle_ws_client; +// TODO: use `std::sync::Exclusive` once it's stabilized. 
+// Tracking issue: https://github.com/rust-lang/rust/issues/98407. +use sync_wrapper::SyncWrapper; pin_project! { - /// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite. - pub struct WebSocketRW { + /// This is a wrapper around a [`WebSocketStream`] that + /// implements [`AsyncRead`] and [`AsyncWrite`]. + pub struct WebSocketRw { #[pin] - stream: WebSocketStream, - chunk: Option, + stream: SyncWrapper>, + bytes: Bytes, } } -// FIXME: explain why this is safe or try to remove `unsafe impl`. -unsafe impl Sync for WebSocketRW {} - -impl WebSocketRW { +impl WebSocketRw { pub fn new(stream: WebSocketStream) -> Self { Self { - stream, - chunk: None, - } - } - - fn has_chunk(&self) -> bool { - if let Some(ref chunk) = self.chunk { - chunk.remaining() > 0 - } else { - false + stream: stream.into(), + bytes: Bytes::new(), } } } -fn ws_err_into(e: tungstenite::Error) -> io::Error { - io::Error::new(io::ErrorKind::Other, e.to_string()) -} - -impl AsyncWrite for WebSocketRW { +impl AsyncWrite for WebSocketRw { fn poll_write( self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &[u8], - ) -> Poll> { - let mut this = self.project(); - match this.stream.as_mut().poll_ready(cx) { - Poll::Ready(Ok(())) => { - if let Err(e) = this - .stream - .as_mut() - .start_send(Message::Binary(buf.to_vec())) - { - Poll::Ready(Err(ws_err_into(e))) - } else { - Poll::Ready(Ok(buf.len())) - } - } - Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))), - Poll::Pending => { - cx.waker().wake_by_ref(); - Poll::Pending - } + ) -> Poll> { + let mut stream = self.project().stream.get_pin_mut(); + + ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; + match stream.as_mut().start_send(Message::Binary(buf.into())) { + Ok(()) => Poll::Ready(Ok(buf.len())), + Err(e) => Poll::Ready(Err(io_error(e))), } } - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().stream.poll_flush(cx).map_err(ws_err_into) + fn poll_flush(self: Pin<&mut 
Self>, cx: &mut Context<'_>) -> Poll> { + let stream = self.project().stream.get_pin_mut(); + stream.poll_flush(cx).map_err(io_error) } - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().stream.poll_close(cx).map_err(ws_err_into) + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let stream = self.project().stream.get_pin_mut(); + stream.poll_close(cx).map_err(io_error) } } -impl AsyncRead for WebSocketRW { +impl AsyncRead for WebSocketRw { fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - if buf.remaining() == 0 { - return Poll::Ready(Ok(())); + if buf.remaining() > 0 { + let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; + let len = std::cmp::min(bytes.len(), buf.remaining()); + buf.put_slice(&bytes[..len]); + self.consume(len); } - let inner_buf = match ready!(self.as_mut().poll_fill_buf(cx)) { - Ok(buf) => buf, - Err(err) => return Poll::Ready(Err(err)), - }; - let len = std::cmp::min(inner_buf.len(), buf.remaining()); - buf.put_slice(&inner_buf[..len]); - - self.consume(len); Poll::Ready(Ok(())) } } -impl AsyncBufRead for WebSocketRW { - fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { +impl AsyncBufRead for WebSocketRw { + fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // Please refer to poll_fill_buf's documentation. + const EOF: Poll> = Poll::Ready(Ok(&[])); + + let mut this = self.project(); loop { - if self.as_mut().has_chunk() { - let buf = self.project().chunk.as_ref().unwrap().chunk(); - return Poll::Ready(Ok(buf)); - } else { - match ready!(self.as_mut().project().stream.poll_next(cx)) { - Some(Ok(message)) => match message { - Message::Text(_) => {} - Message::Binary(chunk) => { - *self.as_mut().project().chunk = Some(Bytes::from(chunk)); - } - Message::Ping(_) => { - // No need to send a reply: tungstenite takes care of this for you. 
- } - Message::Pong(_) => {} - Message::Close(_) => { - // No need to send a reply: tungstenite takes care of this for you. - return Poll::Ready(Ok(&[])); - } - Message::Frame(_) => { - unreachable!(); - } - }, - Some(Err(err)) => return Poll::Ready(Err(ws_err_into(err))), - None => return Poll::Ready(Ok(&[])), - } + if !this.bytes.chunk().is_empty() { + let chunk = (*this.bytes).chunk(); + return Poll::Ready(Ok(chunk)); + } + + let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx)); + match res.transpose().map_err(io_error)? { + Some(message) => match message { + Message::Ping(_) => {} + Message::Pong(_) => {} + Message::Text(text) => { + // We expect to see only binary messages. + let error = "unexpected text message in the websocket"; + warn!(length = text.len(), error); + return Poll::Ready(Err(io_error(error))); + } + Message::Frame(_) => { + // This case is impossible according to Frame's doc. + panic!("unexpected raw frame in the websocket"); + } + Message::Binary(chunk) => { + assert!(this.bytes.is_empty()); + *this.bytes = Bytes::from(chunk); + } + Message::Close(_) => return EOF, + }, + None => return EOF, } } } - fn consume(self: Pin<&mut Self>, amt: usize) { - if amt > 0 { - self.project() - .chunk - .as_mut() - .expect("No chunk present") - .advance(amt); - } + fn consume(self: Pin<&mut Self>, amount: usize) { + self.project().bytes.advance(amount); } } @@ -171,7 +146,7 @@ async fn serve_websocket( config, cancel_map, session_id, - WebSocketRW::new(websocket), + WebSocketRw::new(websocket), hostname, ) .await?; @@ -199,7 +174,7 @@ async fn ws_handler( tokio::spawn(async move { if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await { - error!("error in websocket connection: {:?}", e); + error!("error in websocket connection: {e:?}"); } }); @@ -231,7 +206,7 @@ pub async fn task_main( let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { if let Err(err) = conn { - error!("failed to 
accept TLS connection for websockets: {:?}", err); + error!("failed to accept TLS connection for websockets: {err:?}"); ready(false) } else { ready(true) From 4175cfbdac8fc37451710350f4080dd30fb82a8e Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Mon, 6 Feb 2023 16:56:40 +0100 Subject: [PATCH 016/426] Create folder for file_cache --- Dockerfile.compute-node | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5a3110141c..ddbce42672 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -228,7 +228,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ chown -R postgres:postgres /var/db/postgres && \ chmod 0750 /var/db/postgres/compute && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \ + # create folder for file cache + mkdir -p -m 777 /neon/cache COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl From 948f047f0ac4be9b6415734f3cf843b7a8e357f7 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 10 Feb 2023 12:26:06 +0100 Subject: [PATCH 017/426] Compile pgvector extension --- Dockerfile.compute-node | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index ddbce42672..c0b11f01c7 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -165,6 +165,21 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz & find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control 
+######################################################################################### +# +# Layer "vector-pg-build" +# compile pgvector extension +# +######################################################################################### +FROM build-deps AS vector-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \ + cd pgvector && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -178,6 +193,7 @@ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From f4359b688cf22005e2e1cc6901835380a1ae01e5 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 10 Feb 2023 13:53:47 +0100 Subject: [PATCH 018/426] Backport `cargo fmt` diff from `release` branch into `main` --- compute_tools/src/http/api.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 74d733424d..589a8e1434 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,6 +3,7 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::ComputeNode; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; @@ -10,8 +11,6 @@ use serde_json; use tracing::{error, info}; use 
tracing_utils::http::OtelName; -use crate::compute::ComputeNode; - // Service function to handle all available routes. async fn routes(req: Request, compute: &Arc) -> Response { // From 694150ce4021f343501f49b23eb19851b7138a1d Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 10 Feb 2023 18:46:54 +0300 Subject: [PATCH 019/426] [proxy] Respect the magic `RUST_LOG` env variable Usage: `RUST_LOG=trace proxy ...` --- proxy/src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/proxy/src/main.rs b/proxy/src/main.rs index c96ca2a171..8bf81ee2b7 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -43,6 +43,7 @@ async fn flatten_err( async fn main() -> anyhow::Result<()> { tracing_subscriber::fmt() .with_ansi(atty::is(atty::Stream::Stdout)) + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) .with_target(false) .init(); From f383b4d5401bd8ddb4e96a683af4d72496967d0a Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 10 Feb 2023 16:58:34 +0000 Subject: [PATCH 020/426] Enable TCP_NODELAY for wss connections --- proxy/src/http/websocket.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index fb7b1cfb5e..d4235c2c38 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -202,7 +202,8 @@ pub async fn task_main( } }; - let addr_incoming = AddrIncoming::from_listener(ws_listener)?; + let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; + let _ = addr_incoming.set_nodelay(true); let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { if let Err(err) = conn { From eaff14da5f1318d9867b08c3cb3ede34a33e2ec3 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Sun, 12 Feb 2023 23:32:26 +0300 Subject: [PATCH 021/426] [proxy] Restore INFO as the default tracing level Also move tracing init to its own function. 
--- proxy/src/main.rs | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 8bf81ee2b7..8812f77b62 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -41,13 +41,8 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { - tracing_subscriber::fmt() - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) - .with_target(false) - .init(); - - // initialize sentry if SENTRY_DSN is provided + // First, initialize logging and troubleshooting subsystems. + init_tracing(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("Version: {GIT_VERSION}"); @@ -112,6 +107,21 @@ async fn main() -> anyhow::Result<()> { Ok(()) } +/// Tracing is used for logging and telemetry. +fn init_tracing() { + tracing_subscriber::fmt() + .with_env_filter({ + // This filter will examine the `RUST_LOG` env variable. + use tracing_subscriber::filter::{EnvFilter, LevelFilter}; + EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy() + }) + .with_ansi(atty::is(atty::Stream::Stdout)) + .with_target(false) + .init(); +} + /// ProxyConfig is created at proxy startup, and lives forever. fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> { let tls_config = match ( From e6618f1cc0d9a93e12333bd1876bbff4d47e2e6c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 14 Feb 2023 13:17:34 +0200 Subject: [PATCH 022/426] Update current logical size gauge (#3592) Alternative to #3586. Introduces usage of current_logical_size.current_size as a boundary after which we start to update the metric gauge on ingested wal. Previously any incremented value (ingested wal) would have updated the gauge, but this would have left the metric at zero for timelines which never receive any wal even if size had been calculated. 
Now the gauge is updated right away as the calculation completes, not requiring any wal to be received. --- pageserver/src/tenant/timeline.rs | 44 ++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bcbf8a12b4..5a829e42e5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -292,18 +292,9 @@ impl LogicalSize { // we change the type. match self.initial_logical_size.get() { Some(initial_size) => { - let absolute_size_increment = u64::try_from( - size_increment - .checked_abs() - .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?, - ).expect("casting nonnegative i64 to u64 should not fail"); - - if size_increment < 0 { - initial_size.checked_sub(absolute_size_increment) - } else { - initial_size.checked_add(absolute_size_increment) - }.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) - .map(CurrentLogicalSize::Exact) + initial_size.checked_add_signed(size_increment) + .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) + .map(CurrentLogicalSize::Exact) } None => { let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); @@ -1625,13 +1616,31 @@ impl Timeline { } x @ Err(_) => x.context("Failed to calculate logical size")?, }; + + // we cannot query current_logical_size.current_size() to know the current + // *negative* value, only truncated to u64. + let added = self_clone + .current_logical_size + .size_added_after_initial + .load(AtomicOrdering::Relaxed); + + let sum = calculated_size.saturating_add_signed(added); + + // set the gauge value before it can be set in `update_current_logical_size`. 
+ self_clone.metrics.current_logical_size_gauge.set(sum); + match self_clone .current_logical_size .initial_logical_size .set(calculated_size) { Ok(()) => (), - Err(existing_size) => { + Err(_what_we_just_attempted_to_set) => { + let existing_size = self_clone + .current_logical_size + .initial_logical_size + .get() + .expect("once_cell set was lost, then get failed, impossible."); // This shouldn't happen because the semaphore is initialized with 1. // But if it happens, just complain & report success so there are no further retries. error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing") @@ -1814,10 +1823,15 @@ impl Timeline { // one value while current_logical_size is set to the // other. match logical_size.current_size() { - Ok(new_current_size) => self + Ok(CurrentLogicalSize::Exact(new_current_size)) => self .metrics .current_logical_size_gauge - .set(new_current_size.size()), + .set(new_current_size), + Ok(CurrentLogicalSize::Approximate(_)) => { + // don't update the gauge yet, this allows us not to update the gauge back and + // forth between the initial size calculation task. 
+ } + // this is overflow Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), } } From eb21d9969de9221fb0fedd5c71b72294b552b4fa Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 14 Feb 2023 12:56:03 +0100 Subject: [PATCH 023/426] Add pageserver-3.us-west-2.aws.neon.tech (#3603) --- .github/ansible/prod.us-west-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index ff5d924a91..9cad79b986 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -29,6 +29,8 @@ storage: ansible_host: i-0c834be1dddba8b3f pageserver-2.us-west-2.aws.neon.tech: ansible_host: i-051642d372c0a4f32 + pageserver-3.us-west-2.aws.neon.tech: + ansible_host: i-00c3844beb9ad1c6b safekeepers: hosts: From 86681b92aaeb81a7b7c71fdd3bef4939dc191dac Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Mon, 13 Feb 2023 16:37:09 +0100 Subject: [PATCH 024/426] Enable plls and plcoffee extensions --- Dockerfile.compute-node | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index c0b11f01c7..5c58f4baaa 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -106,7 +106,9 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control ######################################################################################### # From 3569c1bacdaea7b357f927cef32cd657e82af941 Mon Sep 17 
00:00:00 2001 From: Dmitry Ivanov Date: Tue, 14 Feb 2023 16:31:05 +0300 Subject: [PATCH 025/426] [proxy] Fix: don't cache user & dbname in node info cache Upstream proxy erroneously stores user & dbname in compute node info cache entries, thus causing "funny" connection problems if such an entry is reused while connecting to e.g. a different DB on the same compute node. This PR fixes the problem but doesn't eliminate the root cause just yet. I'll revisit this code and make it more type-safe in the upcoming PR. --- proxy/src/auth/backend/link.rs | 2 ++ proxy/src/auth/credentials.rs | 37 ++++++++++-------------------- proxy/src/compute.rs | 12 ++++++++++ proxy/src/console/provider/mock.rs | 13 ++++------- proxy/src/console/provider/neon.rs | 9 ++++---- 5 files changed, 34 insertions(+), 39 deletions(-) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index ef92b1a444..5d0049c957 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -78,6 +78,8 @@ pub(super) async fn handle_user( client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; + // This config should be self-contained, because we won't + // take username or dbname from client's startup message. let mut config = compute::ConnCfg::new(); config .host(&db_info.host) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 66ca8be73e..968104f058 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -32,7 +32,6 @@ impl UserFacingError for ClientCredsParseError {} #[derive(Debug, Clone, PartialEq, Eq)] pub struct ClientCredentials<'a> { pub user: &'a str, - pub dbname: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. pub project: Option>, /// If `True`, we'll use the old cleartext password flow. This is used for @@ -59,7 +58,6 @@ impl<'a> ClientCredentials<'a> { // Some parameters are stored in the startup message. 
let get_param = |key| params.get(key).ok_or(MissingKey(key)); let user = get_param("user")?; - let dbname = get_param("database")?; // Project name might be passed via PG's command-line options. let project_option = params.options_raw().and_then(|mut options| { @@ -100,7 +98,6 @@ impl<'a> ClientCredentials<'a> { info!( user = user, - dbname = dbname, project = project.as_deref(), use_cleartext_password_flow = use_cleartext_password_flow, "credentials" @@ -108,7 +105,6 @@ impl<'a> ClientCredentials<'a> { Ok(Self { user, - dbname, project, use_cleartext_password_flow, }) @@ -131,25 +127,27 @@ mod tests { use ClientCredsParseError::*; #[test] - #[ignore = "TODO: fix how database is handled"] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - // TODO: check that `creds.dbname` is None. let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.project, None); Ok(()) } #[test] - fn parse_missing_project() -> anyhow::Result<()> { - let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); + fn parse_excessive() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("database", "world"), // should be ignored + ("foo", "bar"), // should be ignored + ]); let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.dbname, "world"); assert_eq!(creds.project, None); Ok(()) @@ -157,14 +155,13 @@ mod tests { #[test] fn parse_project_from_sni() -> anyhow::Result<()> { - let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); + let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("foo.localhost"); let common_name = Some("localhost"); let creds = ClientCredentials::parse(&options, sni, common_name, false)?; 
assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("foo")); Ok(()) @@ -174,13 +171,11 @@ mod tests { fn parse_project_from_options() -> anyhow::Result<()> { let options = StartupMessageParams::new([ ("user", "john_doe"), - ("database", "world"), ("options", "-ckey=1 project=bar -c geqo=off"), ]); let creds = ClientCredentials::parse(&options, None, None, false)?; assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("bar")); Ok(()) @@ -188,18 +183,13 @@ mod tests { #[test] fn parse_projects_identical() -> anyhow::Result<()> { - let options = StartupMessageParams::new([ - ("user", "john_doe"), - ("database", "world"), - ("options", "project=baz"), - ]); + let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]); let sni = Some("baz.localhost"); let common_name = Some("localhost"); let creds = ClientCredentials::parse(&options, sni, common_name, false)?; assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("baz")); Ok(()) @@ -207,11 +197,8 @@ mod tests { #[test] fn parse_projects_different() { - let options = StartupMessageParams::new([ - ("user", "john_doe"), - ("database", "world"), - ("options", "project=first"), - ]); + let options = + StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]); let sni = Some("second.localhost"); let common_name = Some("localhost"); @@ -229,7 +216,7 @@ mod tests { #[test] fn parse_inconsistent_sni() { - let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); + let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); let common_name = Some("example.com"); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 0c0cbcde20..3f5eb3caff 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -65,6 +65,18 
@@ impl ConnCfg { /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { + // Only set `user` if it's not present in the config. + // Link auth flow takes username from the console's response. + if let (None, Some(user)) = (self.get_user(), params.get("user")) { + self.user(user); + } + + // Only set `dbname` if it's not present in the config. + // Link auth flow takes dbname from the console's response. + if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { + self.dbname(dbname); + } + if let Some(options) = params.options_raw() { // We must drop all proxy-specific parameters. #[allow(unstable_name_collisions)] diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 301c3be516..eaac9c06d9 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -82,16 +82,11 @@ impl Api { .await } - async fn do_wake_compute( - &self, - creds: &ClientCredentials<'_>, - ) -> Result { + async fn do_wake_compute(&self) -> Result { let mut config = compute::ConnCfg::new(); config .host(self.endpoint.host_str().unwrap_or("localhost")) - .port(self.endpoint.port().unwrap_or(5432)) - .dbname(creds.dbname) - .user(creds.user); + .port(self.endpoint.port().unwrap_or(5432)); let node = NodeInfo { config, @@ -117,9 +112,9 @@ impl super::Api for Api { async fn wake_compute( &self, _extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + _creds: &ClientCredentials<'_>, ) -> Result { - self.do_wake_compute(creds) + self.do_wake_compute() .map_ok(CachedNodeInfo::new_uncached) .await } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 00d3ca8352..4eca025d2d 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -97,12 +97,11 @@ impl Api { Some(x) => x, }; + // Don't set anything but host and port! This config will be cached. 
+ // We'll set username and such later using the startup message. + // TODO: add more type safety (in progress). let mut config = compute::ConnCfg::new(); - config - .host(host) - .port(port) - .dbname(creds.dbname) - .user(creds.user); + config.host(host).port(port); let node = NodeInfo { config, From a5ce2b5330233927169152253548f822cf6d1643 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 14 Feb 2023 16:31:06 +0200 Subject: [PATCH 026/426] Add debug messages around timeline.get_current_logical_size --- pageserver/src/consumption_metrics.rs | 23 +++++++++++++---------- pageserver/src/tenant/timeline.rs | 7 ++++++- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index a730d39339..e4b7b6d809 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -166,17 +166,20 @@ pub async fn collect_metrics_iteration( match timeline.get_current_logical_size(ctx) { // Only send timeline logical size when it is fully calculated. 
- Ok((size, is_exact)) if is_exact => { - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: Some(timeline.timeline_id), - metric: TIMELINE_LOGICAL_SIZE, - }, - size, - )); + Ok((size, is_exact)) => { + if is_exact { + current_metrics.push(( + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: TIMELINE_LOGICAL_SIZE, + }, + size, + )); + } else { + info!("logical_size is not fully calculated for timeline {}, skipping sending value {} ", timeline.timeline_id, size); + } } - Ok((_, _)) => {} Err(err) => { error!( "failed to get current logical size for timeline {}: {err:?}", diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5a829e42e5..7a5a9de2f4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -740,10 +740,15 @@ impl Timeline { let mut is_exact = true; let size = current_size.size(); - if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = + if let (CurrentLogicalSize::Approximate(approx_size), Some(init_lsn)) = (current_size, self.current_logical_size.initial_part_end) { is_exact = false; + info!( + "Current size for timeline {} is approximate {}, initial_part_end lsn: {:?}", + self.timeline_id, approx_size, init_lsn + ); + self.try_spawn_size_init_task(init_lsn, ctx); } From a839860c2ea9a0bbce39a43e9849daef698025ee Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 14 Feb 2023 16:49:42 +0200 Subject: [PATCH 027/426] Add debug messages around sending cached metrics --- pageserver/src/consumption_metrics.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index e4b7b6d809..b078782a86 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -242,6 +242,18 @@ pub async fn collect_metrics_iteration( Some(val) => val != curr_val, None => true, }); + + info!( 
+ "sending only changed metrics, {} values at {}", + current_metrics.len(), + Utc::now() + ); + } else { + info!( + "sending all metrics, including cached ones. {} values at {}", + current_metrics.len(), + Utc::now() + ); } if current_metrics.is_empty() { From a974602f9f4311d6b1e778d31e5c2ea17aacebf8 Mon Sep 17 00:00:00 2001 From: Anna Stepanyan Date: Wed, 15 Feb 2023 15:11:06 +0100 Subject: [PATCH 028/426] fix the logical size term definition (#3609) a size of a *database* cannot be a sum of the sizes of *all databases* indicating that a logical size is calculated for a branch ## Describe your changes ## Issue ticket number and link ## Checklist before requesting a review - [x] i checked the suggested changes - [x] this is not a core feature - [x] this is just a docs update, does not require analytics - [x] this PR does not require a public announcement --- docs/synthetic-size.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 8378efc842..407d7b525a 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -37,7 +37,7 @@ The synthetic size is designed to: ## Terms & assumptions -- logical size is the size of a database *at a given point in +- logical size is the size of a branch *at a given point in time*. It's the total size of all tables in all databases, as you see with "\l+" in psql for example, plus the Postgres SLRUs and some small amount of metadata. NOTE that currently, Neon does not include From 1d9d7c02dbc2b404b555d56353e339de44160229 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 15 Feb 2023 16:25:07 +0300 Subject: [PATCH 029/426] [proxy] Don't forward empty `options` to compute nodes Clients may specify endpoint/project name via `options=project=...`, so we should not only remove `project=` from `options` but also drop `options` entirely, because connection pools don't support it. 
Discussion: https://neondb.slack.com/archives/C033A2WE6BZ/p1676464382670119 --- libs/pq_proto/src/lib.rs | 44 ++++++++++++++++++++------------ proxy/src/compute.rs | 54 ++++++++++++++++++++++++++++++++++------ 2 files changed, 74 insertions(+), 24 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c5e4dbd1f0..b7995c840c 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -75,27 +75,36 @@ impl StartupMessageParams { /// taking into account all escape sequences but leaving them as-is. /// [`None`] means that there's no `options` in [`Self`]. pub fn options_raw(&self) -> Option> { - // See `postgres: pg_split_opts`. - let mut last_was_escape = false; - let iter = self - .get("options")? - .split(move |c: char| { - // We split by non-escaped whitespace symbols. - let should_split = c.is_ascii_whitespace() && !last_was_escape; - last_was_escape = c == '\\' && !last_was_escape; - should_split - }) - .filter(|s| !s.is_empty()); - - Some(iter) + self.get("options").map(Self::parse_options_raw) } /// Split command-line options according to PostgreSQL's logic, /// applying all escape sequences (using owned strings as needed). /// [`None`] means that there's no `options` in [`Self`]. pub fn options_escaped(&self) -> Option>> { + self.get("options").map(Self::parse_options_escaped) + } + + /// Split command-line options according to PostgreSQL's logic, + /// taking into account all escape sequences but leaving them as-is. + pub fn parse_options_raw(input: &str) -> impl Iterator { // See `postgres: pg_split_opts`. - let iter = self.options_raw()?.map(|s| { + let mut last_was_escape = false; + input + .split(move |c: char| { + // We split by non-escaped whitespace symbols. 
+ let should_split = c.is_ascii_whitespace() && !last_was_escape; + last_was_escape = c == '\\' && !last_was_escape; + should_split + }) + .filter(|s| !s.is_empty()) + } + + /// Split command-line options according to PostgreSQL's logic, + /// applying all escape sequences (using owned strings as needed). + pub fn parse_options_escaped(input: &str) -> impl Iterator> { + // See `postgres: pg_split_opts`. + Self::parse_options_raw(input).map(|s| { let mut preserve_next_escape = false; let escape = |c| { // We should remove '\\' unless it's preceded by '\\'. @@ -108,9 +117,12 @@ impl StartupMessageParams { true => Cow::Owned(s.replace(escape, "")), false => Cow::Borrowed(s), } - }); + }) + } - Some(iter) + /// Iterate through key-value pairs in an arbitrary order. + pub fn iter(&self) -> impl Iterator { + self.params.iter().map(|(k, v)| (k.as_str(), v.as_str())) } // This function is mostly useful in tests. diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 3f5eb3caff..2e12d9ee26 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -77,14 +77,9 @@ impl ConnCfg { self.dbname(dbname); } - if let Some(options) = params.options_raw() { - // We must drop all proxy-specific parameters. - #[allow(unstable_name_collisions)] - let options: String = options - .filter(|opt| !opt.starts_with("project=")) - .intersperse(" ") // TODO: use impl from std once it's stabilized - .collect(); - + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. + if let Some(options) = filtered_options(params) { self.options(&options); } @@ -225,3 +220,46 @@ impl ConnCfg { Ok(connection) } } + +/// Retrieve `options` from a startup message, dropping all proxy-specific flags. +fn filtered_options(params: &StartupMessageParams) -> Option { + #[allow(unstable_name_collisions)] + let options: String = params + .options_raw()? 
+ .filter(|opt| !opt.starts_with("project=")) + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + // Don't even bother with empty options. + if options.is_empty() { + return None; + } + + Some(options) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_filtered_options() { + // Empty options is unlikely to be useful anyway. + let params = StartupMessageParams::new([("options", "")]); + assert_eq!(filtered_options(&params), None); + + // It's likely that clients will only use options to specify endpoint/project. + let params = StartupMessageParams::new([("options", "project=foo")]); + assert_eq!(filtered_options(&params), None); + + // Same, because unescaped whitespaces are no-op. + let params = StartupMessageParams::new([("options", " project=foo ")]); + assert_eq!(filtered_options(&params).as_deref(), None); + + let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); + assert_eq!(filtered_options(&params).as_deref(), Some(r"\ \ ")); + + let params = StartupMessageParams::new([("options", "project = foo")]); + assert_eq!(filtered_options(&params).as_deref(), Some("project = foo")); + } +} From 7b182e2605b29a364ab625a2fad7323550c4f5f2 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Thu, 16 Feb 2023 10:33:04 +0200 Subject: [PATCH 030/426] Update settings.md with latest PITR and gc period values (#3618) ## Describe your changes Updates PITR and GC_PERIOD default value doc ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- docs/settings.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/settings.md b/docs/settings.md index 58d32157a3..817f97d8ba 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898' checkpoint_distance = '268435456' # in bytes checkpoint_timeout = '10m' -gc_period = '100 s' +gc_period = '1 hour' gc_horizon = '67108864' max_file_descriptors = '100' @@ -101,7 +101,7 @@ away. #### gc_period -Interval at which garbage collection is triggered. Default is 100 s. +Interval at which garbage collection is triggered. Default is 1 hour. #### image_creation_threshold @@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3. #### pitr_interval -WAL retention duration for PITR branching. Default is 30 days. +WAL retention duration for PITR branching. Default is 7 days. #### walreceiver_connect_timeout From ddbdcdddd7886c72da740b990471dea9ed001413 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 16 Feb 2023 10:53:46 +0200 Subject: [PATCH 031/426] Tenant size calculation: refactor, rewrite, and add SVG (#2817) Refactor the tenant_size_model code. Segment now contains just the minimum amount of information needed to calculate the size. Other information that is useful for building up the segment tree, and for display purposes, is now kept elsewhere. The code in 'main.rs' has a new ScenarioBuilder struct for that. Calculating which Segments are "needed" is now the responsibility of the caller of tenant_size_mode, not part of the calculation itself. So it's up to the caller to make all the decisions with retention periods for each branch. The output of the sizing calculation is now a Vec of SizeResults, rather than a tree. It uses a tree representation internally, when doing the calculation, but it's not exposed to the caller anymore. Refactor the way the recursive calculation is performed. Rewrite the code in size.rs that builds the Segment model. 
Get rid of the intermediate representation with Update structs. Build the Segments directly, with some local HashMaps and Vecs to track branch points to help with that. retention_period is now an input to gather_inputs(), rather than an output. Update pageserver http API: rename /size endpoint to /synthetic_size with following parameters: - /synthetic_size?inputs_only to get debug info; - /synthetic_size?retention_period=0 to override cutoff that is used to calculate the size; pass header -H "Accept: text/html" to get HTML output, otherwise JSON is returned Update python tests and openapi spec. --------- Co-authored-by: Anastasia Lubennikova Co-authored-by: Joonas Koivunen --- Cargo.lock | 2 + libs/tenant_size_model/Cargo.toml | 2 + libs/tenant_size_model/src/calculation.rs | 219 ++++ libs/tenant_size_model/src/lib.rs | 427 +------- libs/tenant_size_model/src/main.rs | 269 ----- libs/tenant_size_model/src/svg.rs | 193 ++++ libs/tenant_size_model/tests/tests.rs | 313 ++++++ pageserver/src/http/openapi_spec.yml | 7 + pageserver/src/http/routes.rs | 109 +- pageserver/src/tenant.rs | 40 +- pageserver/src/tenant/size.rs | 1159 +++++++++------------ pageserver/src/walredo.rs | 3 +- test_runner/fixtures/neon_fixtures.py | 12 +- test_runner/regress/test_tenant_size.py | 180 +++- 14 files changed, 1581 insertions(+), 1354 deletions(-) create mode 100644 libs/tenant_size_model/src/calculation.rs delete mode 100644 libs/tenant_size_model/src/main.rs create mode 100644 libs/tenant_size_model/src/svg.rs create mode 100644 libs/tenant_size_model/tests/tests.rs diff --git a/Cargo.lock b/Cargo.lock index 67e54d3833..98c4dca09b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3809,6 +3809,8 @@ name = "tenant_size_model" version = "0.1.0" dependencies = [ "anyhow", + "serde", + "serde_json", "workspace_hack", ] diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index a5f0160f35..15e78932a8 100644 --- a/libs/tenant_size_model/Cargo.toml +++ 
b/libs/tenant_size_model/Cargo.toml @@ -7,5 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true +serde.workspace = true +serde_json.workspace = true workspace_hack.workspace = true diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs new file mode 100644 index 0000000000..093b053675 --- /dev/null +++ b/libs/tenant_size_model/src/calculation.rs @@ -0,0 +1,219 @@ +use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; + +// +// *-g--*---D---> +// / +// / +// / *---b----*-B---> +// / / +// / / +// -----*--e---*-----f----* C +// E \ +// \ +// *--a---*---A--> +// +// If A and B need to be retained, is it cheaper to store +// snapshot at C+a+b, or snapshots at A and B ? +// +// If D also needs to be retained, which is cheaper: +// +// 1. E+g+e+f+a+b +// 2. D+C+a+b +// 3. D+A+B + +/// [`Segment`] which has had its size calculated. +#[derive(Clone, Debug)] +struct SegmentSize { + method: SegmentMethod, + + // calculated size of this subtree, using this method + accum_size: u64, + + seg_id: usize, + children: Vec<SegmentSize>, +} + +struct SizeAlternatives { + // cheapest alternative if parent is available. + incremental: SegmentSize, + + // cheapest alternative if parent node is not available + non_incremental: Option<SegmentSize>, +} + +impl StorageModel { + pub fn calculate(&self) -> SizeResult { + // Build adjacency list. 'child_list' is indexed by segment id. Each entry + // contains a list of all child segments of the segment.
+ let mut roots: Vec<usize> = Vec::new(); + let mut child_list: Vec<Vec<usize>> = Vec::new(); + child_list.resize(self.segments.len(), Vec::new()); + + for (seg_id, seg) in self.segments.iter().enumerate() { + if let Some(parent_id) = seg.parent { + child_list[parent_id].push(seg_id); + } else { + roots.push(seg_id); + } + } + + let mut segment_results = Vec::new(); + segment_results.resize( + self.segments.len(), + SegmentSizeResult { + method: SegmentMethod::Skipped, + accum_size: 0, + }, + ); + + let mut total_size = 0; + for root in roots { + if let Some(selected) = self.size_here(root, &child_list).non_incremental { + StorageModel::fill_selected_sizes(&selected, &mut segment_results); + total_size += selected.accum_size; + } else { + // Couldn't find any way to get this root. Error? + } + } + + SizeResult { + total_size, + segments: segment_results, + } + } + + fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec<SegmentSizeResult>) { + result[selected.seg_id] = SegmentSizeResult { + method: selected.method, + accum_size: selected.accum_size, + }; + // recurse to children + for child in selected.children.iter() { + StorageModel::fill_selected_sizes(child, result); + } + } + + // + // This is the core of the sizing calculation. + // + // This is a recursive function, that for each Segment calculates the best way + // to reach all the Segments that are marked as needed in this subtree, under two + // different conditions: + // a) when the parent of this segment is available (as a snapshot or through WAL), and + // b) when the parent of this segment is not available. + // + fn size_here(&self, seg_id: usize, child_list: &Vec<Vec<usize>>) -> SizeAlternatives { + let seg = &self.segments[seg_id]; + // First figure out the best way to get each child + let mut children = Vec::new(); + for child_id in &child_list[seg_id] { + children.push(self.size_here(*child_id, child_list)) + } + + // Method 1.
If this node is not needed, we can skip it as long as we + // take snapshots later in each sub-tree + let snapshot_later = if !seg.needed { + let mut snapshot_later = SegmentSize { + seg_id, + method: SegmentMethod::Skipped, + accum_size: 0, + children: Vec::new(), + }; + + let mut possible = true; + for child in children.iter() { + if let Some(non_incremental) = &child.non_incremental { + snapshot_later.accum_size += non_incremental.accum_size; + snapshot_later.children.push(non_incremental.clone()) + } else { + possible = false; + break; + } + } + if possible { + Some(snapshot_later) + } else { + None + } + } else { + None + }; + + // Method 2. Get a snapshot here. This is assumed to be possible, if the 'size' of + // this Segment was given. + let snapshot_here = if !seg.needed || seg.parent.is_none() { + if let Some(snapshot_size) = seg.size { + let mut snapshot_here = SegmentSize { + seg_id, + method: SegmentMethod::SnapshotHere, + accum_size: snapshot_size, + children: Vec::new(), + }; + for child in children.iter() { + snapshot_here.accum_size += child.incremental.accum_size; + snapshot_here.children.push(child.incremental.clone()) + } + Some(snapshot_here) + } else { + None + } + } else { + None + }; + + // Method 3. Use WAL to get here from parent + let wal_here = { + let mut wal_here = SegmentSize { + seg_id, + method: SegmentMethod::Wal, + accum_size: if let Some(parent_id) = seg.parent { + seg.lsn - self.segments[parent_id].lsn + } else { + 0 + }, + children: Vec::new(), + }; + for child in children { + wal_here.accum_size += child.incremental.accum_size; + wal_here.children.push(child.incremental) + } + wal_here + }; + + // If the parent is not available, what's the cheapest method involving + // a snapshot here or later?
+ let mut cheapest_non_incremental: Option = None; + if let Some(snapshot_here) = snapshot_here { + cheapest_non_incremental = Some(snapshot_here); + } + if let Some(snapshot_later) = snapshot_later { + // Use <=, to prefer skipping if the size is equal + if let Some(parent) = &cheapest_non_incremental { + if snapshot_later.accum_size <= parent.accum_size { + cheapest_non_incremental = Some(snapshot_later); + } + } else { + cheapest_non_incremental = Some(snapshot_later); + } + } + + // And what's the cheapest method, if the parent is available? + let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental + { + // Is it cheaper to use a snapshot here or later, anyway? + // Use <, to prefer Wal over snapshot if the cost is the same + if wal_here.accum_size < cheapest_non_incremental.accum_size { + wal_here + } else { + cheapest_non_incremental.clone() + } + } else { + wal_here + }; + + SizeAlternatives { + incremental: cheapest_incremental, + non_incremental: cheapest_non_incremental, + } + } +} diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index b156e1be9d..c151e3b42c 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -1,401 +1,70 @@ -use std::borrow::Cow; -use std::collections::HashMap; +//! Synthetic size calculation -use anyhow::Context; +mod calculation; +pub mod svg; -/// Pricing model or history size builder. +/// StorageModel is the input to the synthetic size calculation. It represents +/// a tree of timelines, with just the information that's needed for the +/// calculation. This doesn't track timeline names or where each timeline +/// begins and ends, for example. Instead, it consists of "points of interest" +/// on the timelines. A point of interest could be the timeline start or end point, +/// the oldest point on a timeline that needs to be retained because of PITR +/// cutoff, or snapshot points named by the user. 
For each such point, and the +/// edge connecting the points (implicit in Segment), we store information about +/// whether we need to be able to recover to the point, and if known, the logical +/// size at the point. /// -/// Maintains knowledge of the branches and their modifications. Generic over the branch name key -/// type. -pub struct Storage { - segments: Vec, - - /// Mapping from the branch name to the index of a segment describing it's latest state. - branches: HashMap, +/// The segments must form a well-formed tree, with no loops. +#[derive(serde::Serialize)] +pub struct StorageModel { + pub segments: Vec, } -/// Snapshot of a branch. -#[derive(Clone, Debug, Eq, PartialEq)] +/// Segment represents one point in the tree of branches, *and* the edge that leads +/// to it (if any). We don't need separate structs for points and edges, because each +/// point can have only one parent. +/// +/// When 'needed' is true, it means that we need to be able to reconstruct +/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only +/// a single point is needed, create two Segments with the same lsn, and mark only +/// the child as needed. +/// +#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub struct Segment { /// Previous segment index into ['Storage::segments`], if any. - parent: Option, + pub parent: Option, - /// Description of how did we get to this state. - /// - /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when - /// modifying a branch directly. - pub op: Cow<'static, str>, + /// LSN at this point + pub lsn: u64, - /// LSN before this state - start_lsn: u64, + /// Logical size at this node, if known. + pub size: Option, - /// LSN at this state - pub end_lsn: u64, - - /// Logical size before this state - start_size: u64, - - /// Logical size at this state. Can be None in the last Segment of a branch. 
- pub end_size: Option, - - /// Indices to [`Storage::segments`] - /// - /// FIXME: this could be an Option - children_after: Vec, - - /// Determined by `retention_period` given to [`Storage::calculate`] + /// If true, the segment from parent to this node is needed by `retention_period` pub needed: bool, } -// -// -// -// -// *-g--*---D---> -// / -// / -// / *---b----*-B---> -// / / -// / / -// -----*--e---*-----f----* C -// E \ -// \ -// *--a---*---A--> -// -// If A and B need to be retained, is it cheaper to store -// snapshot at C+a+b, or snapshots at A and B ? -// -// If D also needs to be retained, which is cheaper: -// -// 1. E+g+e+f+a+b -// 2. D+C+a+b -// 3. D+A+B +/// Result of synthetic size calculation. Returned by StorageModel::calculate() +pub struct SizeResult { + pub total_size: u64, -/// [`Segment`] which has had it's size calculated. -pub struct SegmentSize { - pub seg_id: usize, - - pub method: SegmentMethod, - - this_size: u64, - - pub children: Vec, + // This has same length as the StorageModel::segments vector in the input. + // Each entry in this array corresponds to the entry with same index in + // StorageModel::segments. 
+ pub segments: Vec, } -impl SegmentSize { - fn total(&self) -> u64 { - self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) - } - - pub fn total_children(&self) -> u64 { - if self.method == SnapshotAfter { - self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) - } else { - self.children.iter().fold(0, |acc, x| acc + x.total()) - } - } +#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct SegmentSizeResult { + pub method: SegmentMethod, + // calculated size of this subtree, using this method + pub accum_size: u64, } /// Different methods to retain history from a particular state -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub enum SegmentMethod { - SnapshotAfter, - Wal, - WalNeeded, + SnapshotHere, // A logical snapshot is needed after this segment + Wal, // Keep WAL leading up to this node Skipped, } - -use SegmentMethod::*; - -impl Storage { - /// Creates a new storage with the given default branch name. - pub fn new(initial_branch: K) -> Storage { - let init_segment = Segment { - op: "".into(), - needed: false, - parent: None, - start_lsn: 0, - end_lsn: 0, - start_size: 0, - end_size: Some(0), - children_after: Vec::new(), - }; - - Storage { - segments: vec![init_segment], - branches: HashMap::from([(initial_branch, 0)]), - } - } - - /// Advances the branch with a new point, at given LSN. 
- pub fn insert_point( - &mut self, - branch: &Q, - op: Cow<'static, str>, - lsn: u64, - size: Option, - ) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; - let newseg_id = self.segments.len(); - let lastseg = &mut self.segments[lastseg_id]; - - assert!(lsn > lastseg.end_lsn); - - let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; - - let newseg = Segment { - op, - parent: Some(lastseg_id), - start_lsn: lastseg.end_lsn, - end_lsn: lsn, - start_size, - end_size: size, - children_after: Vec::new(), - needed: false, - }; - lastseg.children_after.push(newseg_id); - - self.segments.push(newseg); - *self.branches.get_mut(branch).expect("read already") = newseg_id; - - Ok(()) - } - - /// Advances the branch with the named operation, by the relative LSN and logical size bytes. 
- pub fn modify_branch( - &mut self, - branch: &Q, - op: Cow<'static, str>, - lsn_bytes: u64, - size_bytes: i64, - ) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; - let newseg_id = self.segments.len(); - let lastseg = &mut self.segments[lastseg_id]; - - let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; - - let newseg = Segment { - op, - parent: Some(lastseg_id), - start_lsn: lastseg.end_lsn, - end_lsn: lastseg.end_lsn + lsn_bytes, - start_size: last_end_size, - end_size: Some((last_end_size as i64 + size_bytes) as u64), - children_after: Vec::new(), - needed: false, - }; - lastseg.children_after.push(newseg_id); - - self.segments.push(newseg); - *self.branches.get_mut(branch).expect("read already") = newseg_id; - Ok(()) - } - - pub fn insert(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "insert".into(), bytes, bytes as i64) - } - - pub fn update(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "update".into(), bytes, 0i64) - } - - pub fn delete(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)) - } - - pub fn branch(&mut self, parent: &Q, name: K) -> anyhow::Result<()> - where - K: std::borrow::Borrow + std::fmt::Debug, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - // Find the right segment - let branchseg_id = *self.branches.get(parent).with_context(|| { - format!( - "should had found the parent {:?} by key. 
in branches {:?}", - parent, self.branches - ) - })?; - - let _branchseg = &mut self.segments[branchseg_id]; - - // Create branch name for it - self.branches.insert(name, branchseg_id); - Ok(()) - } - - pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result { - // Phase 1: Mark all the segments that need to be retained - for (_branch, &last_seg_id) in self.branches.iter() { - let last_seg = &self.segments[last_seg_id]; - let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period); - let mut seg_id = last_seg_id; - loop { - let seg = &mut self.segments[seg_id]; - if seg.end_lsn < cutoff_lsn { - break; - } - seg.needed = true; - if let Some(prev_seg_id) = seg.parent { - seg_id = prev_seg_id; - } else { - break; - } - } - } - - // Phase 2: For each oldest segment in a chain that needs to be retained, - // calculate if we should store snapshot or WAL - self.size_from_snapshot_later(0) - } - - fn size_from_wal(&self, seg_id: usize) -> anyhow::Result { - let seg = &self.segments[seg_id]; - - let this_size = seg.end_lsn - seg.start_lsn; - - let mut children = Vec::new(); - - // try both ways - for &child_id in seg.children_after.iter() { - // try each child both ways - let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id)?; - - let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id)?; - if p1.total() < p2.total() { - p1 - } else { - p2 - } - } else { - p1 - }; - children.push(p); - } - Ok(SegmentSize { - seg_id, - method: if seg.needed { WalNeeded } else { Wal }, - this_size, - children, - }) - } - - fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result { - // If this is needed, then it's time to do the snapshot and continue - // with wal method. 
- let seg = &self.segments[seg_id]; - //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed); - if seg.needed { - let mut children = Vec::new(); - - for &child_id in seg.children_after.iter() { - // try each child both ways - let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id)?; - - let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id)?; - if p1.total() < p2.total() { - p1 - } else { - p2 - } - } else { - p1 - }; - children.push(p); - } - Ok(SegmentSize { - seg_id, - method: WalNeeded, - this_size: seg.start_size, - children, - }) - } else { - // If any of the direct children are "needed", need to be able to reconstruct here - let mut children_needed = false; - for &child in seg.children_after.iter() { - let seg = &self.segments[child]; - if seg.needed { - children_needed = true; - break; - } - } - - let method1 = if !children_needed { - let mut children = Vec::new(); - for child in seg.children_after.iter() { - children.push(self.size_from_snapshot_later(*child)?); - } - Some(SegmentSize { - seg_id, - method: Skipped, - this_size: 0, - children, - }) - } else { - None - }; - - // If this a junction, consider snapshotting here - let method2 = if children_needed || seg.children_after.len() >= 2 { - let mut children = Vec::new(); - for child in seg.children_after.iter() { - children.push(self.size_from_wal(*child)?); - } - let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") }; - Some(SegmentSize { - seg_id, - method: SnapshotAfter, - this_size, - children, - }) - } else { - None - }; - - Ok(match (method1, method2) { - (None, None) => anyhow::bail!( - "neither method was applicable: children_after={}, children_needed={}", - seg.children_after.len(), - children_needed - ), - (Some(method), None) => method, - (None, Some(method)) => method, - (Some(method1), Some(method2)) => { - if method1.total() < method2.total() { - method1 - } else { - method2 
- } - } - }) - } - } - - pub fn into_segments(self) -> Vec { - self.segments - } -} diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs deleted file mode 100644 index e32dd055f4..0000000000 --- a/libs/tenant_size_model/src/main.rs +++ /dev/null @@ -1,269 +0,0 @@ -//! Tenant size model testing ground. -//! -//! Has a number of scenarios and a `main` for invoking these by number, calculating the history -//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios -//! into pngs. - -use tenant_size_model::{Segment, SegmentSize, Storage}; - -// Main branch only. Some updates on it. -fn scenario_1() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Main branch only. Some updates on it. 
-fn scenario_2() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - storage.update("main", 1_000)?; - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Like 2, but more updates on main -fn scenario_3() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Diverged branches -fn scenario_4() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - for _ in 0..8 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -fn scenario_5() -> anyhow::Result<(Vec, SegmentSize)> { - let mut storage = Storage::new("a"); - storage.insert("a", 5000)?; - storage.branch("a", "b")?; - storage.update("b", 4000)?; - storage.update("a", 2000)?; - storage.branch("a", "c")?; - storage.insert("c", 4000)?; - storage.insert("a", 2000)?; - - let size = storage.calculate(5000)?; - - 
Ok((storage.into_segments(), size)) -} - -fn scenario_6() -> anyhow::Result<(Vec, SegmentSize)> { - use std::borrow::Cow; - - const NO_OP: Cow<'static, str> = Cow::Borrowed(""); - - let branches = [ - Some(0x7ff1edab8182025f15ae33482edb590a_u128), - Some(0xb1719e044db05401a05a2ed588a3ad3f), - Some(0xb68d6691c895ad0a70809470020929ef), - ]; - - // compared to other scenarios, this one uses bytes instead of kB - - let mut storage = Storage::new(None); - - storage.branch(&None, branches[0])?; // at 0 - storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064 - storage.branch(&branches[0], branches[1])?; // at 108951064 - storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472 - storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424 - storage.branch(&branches[0], branches[2])?; // at 283415424 - storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616 - storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400 - - let size = storage.calculate(100_000)?; - - Ok((storage.into_segments(), size)) -} - -fn main() { - let args: Vec = std::env::args().collect(); - - let scenario = if args.len() < 2 { "1" } else { &args[1] }; - - let (segments, size) = match scenario { - "1" => scenario_1(), - "2" => scenario_2(), - "3" => scenario_3(), - "4" => scenario_4(), - "5" => scenario_5(), - "6" => scenario_6(), - other => { - eprintln!("invalid scenario {}", other); - std::process::exit(1); - } - } - .unwrap(); - - graphviz_tree(&segments, &size); -} - -fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) { - use tenant_size_model::SegmentMethod::*; - - let seg_id = node.seg_id; - let seg = segments.get(seg_id).unwrap(); - let lsn = seg.end_lsn; - let size = seg.end_size.unwrap_or(0); - let method = node.method; - - println!(" {{"); - println!(" node [width=0.1 height=0.1 shape=oval]"); - - let tenant_size = node.total_children(); - - let penwidth = if 
seg.needed { 6 } else { 3 }; - let x = match method { - SnapshotAfter => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"), - Wal => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), - WalNeeded => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), - Skipped => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"), - }; - - println!(" \"seg{seg_id}\" [{x}]"); - println!(" }}"); - - // Recurse. Much of the data is actually on the edge - for child in node.children.iter() { - let child_id = child.seg_id; - graphviz_recurse(segments, child); - - let edge_color = match child.method { - SnapshotAfter => "gray", - Wal => "black", - WalNeeded => "black", - Skipped => "gray", - }; - - println!(" {{"); - println!(" edge [] "); - print!(" \"seg{seg_id}\" -> \"seg{child_id}\" ["); - print!("color={edge_color}"); - if child.method == WalNeeded { - print!(" penwidth=6"); - } - if child.method == Wal { - print!(" penwidth=3"); - } - - let next = segments.get(child_id).unwrap(); - - if next.op.is_empty() { - print!( - " label=\"{} / {}\"", - next.end_lsn - seg.end_lsn, - (next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128) - ); - } else { - print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn); - } - println!("]"); - println!(" }}"); - } -} - -fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { - println!("digraph G {{"); - println!(" fontname=\"Helvetica,Arial,sans-serif\""); - println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]"); - println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]"); - println!(" graph [center=1 rankdir=LR]"); - println!(" edge [dir=none]"); - - graphviz_recurse(segments, tree); - - println!("}}"); -} - -#[test] -fn scenarios_return_same_size() { - type ScenarioFn 
= fn() -> anyhow::Result<(Vec, SegmentSize)>; - let truths: &[(u32, ScenarioFn, _)] = &[ - (line!(), scenario_1, 8000), - (line!(), scenario_2, 9000), - (line!(), scenario_3, 13000), - (line!(), scenario_4, 16000), - (line!(), scenario_5, 17000), - (line!(), scenario_6, 333_792_000), - ]; - - for (line, scenario, expected) in truths { - let (_, size) = scenario().unwrap(); - assert_eq!(*expected, size.total_children(), "scenario on line {line}"); - } -} diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs new file mode 100644 index 0000000000..f26d3aa79d --- /dev/null +++ b/libs/tenant_size_model/src/svg.rs @@ -0,0 +1,193 @@ +use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; +use std::fmt::Write; + +const SVG_WIDTH: f32 = 500.0; + +struct SvgDraw<'a> { + storage: &'a StorageModel, + branches: &'a [String], + seg_to_branch: &'a [usize], + sizes: &'a [SegmentSizeResult], + + // layout + xscale: f32, + min_lsn: u64, + seg_coordinates: Vec<(f32, f32)>, +} + +fn draw_legend(result: &mut String) -> anyhow::Result<()> { + writeln!( + result, + "" + )?; + writeln!(result, "logical snapshot")?; + writeln!( + result, + "" + )?; + writeln!( + result, + "WAL within retention period" + )?; + writeln!( + result, + "" + )?; + writeln!( + result, + "WAL retained to avoid copy" + )?; + writeln!( + result, + "" + )?; + writeln!(result, "WAL not retained")?; + Ok(()) +} + +pub fn draw_svg( + storage: &StorageModel, + branches: &[String], + seg_to_branch: &[usize], + sizes: &SizeResult, +) -> anyhow::Result { + let mut draw = SvgDraw { + storage, + branches, + seg_to_branch, + sizes: &sizes.segments, + + xscale: 0.0, + min_lsn: 0, + seg_coordinates: Vec::new(), + }; + + let mut result = String::new(); + + writeln!(result, "")?; + + draw.calculate_svg_layout(); + + // Draw the tree + for (seg_id, _seg) in storage.segments.iter().enumerate() { + draw.draw_seg_phase1(seg_id, &mut result)?; + } + + // Draw snapshots + for (seg_id, _seg) 
in storage.segments.iter().enumerate() { + draw.draw_seg_phase2(seg_id, &mut result)?; + } + + draw_legend(&mut result)?; + + write!(result, "")?; + + Ok(result) +} + +impl<'a> SvgDraw<'a> { + fn calculate_svg_layout(&mut self) { + // Find x scale + let segments = &self.storage.segments; + let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min); + let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max); + + // Start with 1 pixel = 1 byte. Double the scale until it fits into the image + let mut xscale = 1.0; + while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH { + xscale *= 2.0; + } + + // Layout the timelines on Y dimension. + // TODO + let mut y = 100.0; + let mut branch_y_coordinates = Vec::new(); + for _branch in self.branches { + branch_y_coordinates.push(y); + y += 40.0; + } + + // Calculate coordinates for each point + let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) + .map(|(seg, branch_id)| { + let x = (seg.lsn - min_lsn) as f32 / xscale; + let y = branch_y_coordinates[*branch_id]; + (x, y) + }) + .collect(); + + self.xscale = xscale; + self.min_lsn = min_lsn; + self.seg_coordinates = seg_coordinates; + } + + /// Draws lines between points + fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> { + let seg = &self.storage.segments[seg_id]; + + let wal_bytes = if let Some(parent_id) = seg.parent { + seg.lsn - self.storage.segments[parent_id].lsn + } else { + 0 + }; + + let style = match self.sizes[seg_id].method { + SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"", + SegmentMethod::Wal if seg.needed && wal_bytes > 0 => { + "stroke-width=\"6\" stroke=\"black\"" + } + SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"", + SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"", + }; + if let Some(parent_id) = seg.parent { + let (x1, y1) = self.seg_coordinates[parent_id]; + let (x2, y2) = self.seg_coordinates[seg_id]; + + writeln!( + result, + "", 
+ )?; + writeln!( + result, + " {wal_bytes} bytes of WAL (seg {seg_id})" + )?; + writeln!(result, "")?; + } else { + // draw a little dash to mark the starting point of this branch + let (x, y) = self.seg_coordinates[seg_id]; + let (x1, y1) = (x, y - 5.0); + let (x2, y2) = (x, y + 5.0); + + writeln!( + result, + "", + )?; + writeln!(result, " (seg {seg_id})")?; + writeln!(result, "")?; + } + + Ok(()) + } + + /// Draw circles where snapshots are taken + fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> { + let seg = &self.storage.segments[seg_id]; + + // draw a snapshot point if it's needed + let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { + writeln!( + result, + "", + )?; + writeln!( + result, + " logical size {}", + seg.size.unwrap() + )?; + write!(result, "")?; + } + + Ok(()) + } +} diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs new file mode 100644 index 0000000000..7660d41c56 --- /dev/null +++ b/libs/tenant_size_model/tests/tests.rs @@ -0,0 +1,313 @@ +//! Tenant size model tests. + +use tenant_size_model::{Segment, SizeResult, StorageModel}; + +use std::collections::HashMap; + +struct ScenarioBuilder { + segments: Vec, + + /// Mapping from the branch name to the index of a segment describing its latest state. + branches: HashMap, +} + +impl ScenarioBuilder { + /// Creates a new storage with the given default branch name. + pub fn new(initial_branch: &str) -> ScenarioBuilder { + let init_segment = Segment { + parent: None, + lsn: 0, + size: Some(0), + needed: false, // determined later + }; + + ScenarioBuilder { + segments: vec![init_segment], + branches: HashMap::from([(initial_branch.into(), 0)]), + } + } + + /// Advances the branch with the named operation, by the relative LSN and logical size bytes. 
+ pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) { + let lastseg_id = *self.branches.get(branch).unwrap(); + let newseg_id = self.segments.len(); + let lastseg = &mut self.segments[lastseg_id]; + + let newseg = Segment { + parent: Some(lastseg_id), + lsn: lastseg.lsn + lsn_bytes, + size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64), + needed: false, + }; + + self.segments.push(newseg); + *self.branches.get_mut(branch).expect("read already") = newseg_id; + } + + pub fn insert(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, bytes as i64); + } + + pub fn update(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, 0i64); + } + + pub fn _delete(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, -(bytes as i64)); + } + + /// Panics if the parent branch cannot be found. + pub fn branch(&mut self, parent: &str, name: &str) { + // Find the right segment + let branchseg_id = *self + .branches + .get(parent) + .expect("should had found the parent by key"); + let _branchseg = &mut self.segments[branchseg_id]; + + // Create branch name for it + self.branches.insert(name.to_string(), branchseg_id); + } + + pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) { + // Phase 1: Mark all the segments that need to be retained + for (_branch, &last_seg_id) in self.branches.iter() { + let last_seg = &self.segments[last_seg_id]; + let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period); + let mut seg_id = last_seg_id; + loop { + let seg = &mut self.segments[seg_id]; + if seg.lsn <= cutoff_lsn { + break; + } + seg.needed = true; + if let Some(prev_seg_id) = seg.parent { + seg_id = prev_seg_id; + } else { + break; + } + } + } + + // Perform the calculation + let storage_model = StorageModel { + segments: self.segments.clone(), + }; + let size_result = storage_model.calculate(); + (storage_model, size_result) + } +} + +// Main branch only. 
Some updates on it. +#[test] +fn scenario_1() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Calculate the synthetic size with retention horizon 1000 + let (_model, result) = scenario.calculate(1000); + + // The end of the branch is at LSN 10000. Need to retain + // a logical snapshot at LSN 9000, plus the WAL between 9000-10000. + // The logical snapshot has size 5000. + assert_eq!(result.total_size, 5000 + 1000); +} + +// Main branch only. Some updates on it. +#[test] +fn scenario_2() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + scenario.update("main", 1_000); + + // + // The history looks like this now: + // + // 10000 11000 + // *----*----*--------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, we need to retain logical snapshot + // at the branch point, size 5000, and the WAL from 10000-11000 on + // both branches. 
+ let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 1000); +} + +// Like 2, but more updates on main +#[test] +fn scenario_3() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // + // The history looks like this now: + // + // 10000 15000 + // *----*----*------------------------------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, it's still cheapest to retain + // - snapshot at branch point (size 5000) + // - WAL on child between 10000-11000 + // - WAL on main between 10000-15000 + // + // This is in total 5000 + 1000 + 5000 + // + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 5000); +} + +// Diverged branches +#[test] +fn scenario_4() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + for _ in 0..8 { + scenario.update("main", 1_000); + } + + // + // The history looks like this now: + // + // 10000 18000 + // *----*----*------------------------------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, it's now cheapest to retain + // separate snapshots on both branches: + // - snapshot on main branch at LSN 17000 (size 5000) + // - WAL on main between 17000-18000 + // - snapshot on child branch at LSN 10000 (size 5000) + // - 
WAL on child between 10000-11000 + // + // This is in total 5000 + 1000 + 5000 + 1000 = 12000 + // + // (If we used the the method from the previous scenario, and + // kept only snapshot at the branch point, we'd need to keep + // all the WAL between 10000-18000 on the main branch, so + // the total size would be 5000 + 1000 + 8000 = 14000. The + // calculation always picks the cheapest alternative) + + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000); +} + +#[test] +fn scenario_5() { + let mut scenario = ScenarioBuilder::new("a"); + scenario.insert("a", 5000); + scenario.branch("a", "b"); + scenario.update("b", 4000); + scenario.update("a", 2000); + scenario.branch("a", "c"); + scenario.insert("c", 4000); + scenario.insert("a", 2000); + + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 17000); +} + +#[test] +fn scenario_6() { + let branches = [ + "7ff1edab8182025f15ae33482edb590a", + "b1719e044db05401a05a2ed588a3ad3f", + "0xb68d6691c895ad0a70809470020929ef", + ]; + + // compared to other scenarios, this one uses bytes instead of kB + + let mut scenario = ScenarioBuilder::new(""); + + scenario.branch("", branches[0]); // at 0 + scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064 + scenario.branch(branches[0], branches[1]); // at 108951064 + scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472 + scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424 + scenario.branch(branches[0], branches[2]); // at 283415424 + scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616 + scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400 + + let (model, result) = scenario.calculate(100_000); + + // FIXME: We previously calculated 333_792_000. But with this PR, we get + // a much lower number. At a quick look at the model output and the + // calculations here, the new result seems correct to me. 
+ eprintln!( + " MODEL: {}", + serde_json::to_string(&model.segments).unwrap() + ); + eprintln!( + "RESULT: {}", + serde_json::to_string(&result.segments).unwrap() + ); + + assert_eq!(result.total_size, 136_236_928); +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index fc271fe83b..e68ceb2dc6 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -437,6 +437,13 @@ paths: type: boolean description: | When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. + - name: retention_period + in: query + required: false + schema: + type: integer + description: | + Override the default retention period (in bytes) used for size calculation. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6a9232e097..7cd7e81fe1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -7,6 +7,7 @@ use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use remote_storage::GenericRemoteStorage; +use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -20,6 +21,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; use crate::tenant::mgr::TenantMapInsertError; +use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; @@ -479,11 +481,19 @@ async fn tenant_status(request: Request) -> Result, ApiErro /// to debug any of the calculations. 
Requires `tenant_id` request parameter, supports /// `inputs_only=true|false` (default false) which supports debugging failure to calculate model /// values. +/// +/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size +/// (only if it is shorter than the real cutoff). +/// +/// Note: we don't update the cached size and prometheus metric here. +/// The retention period might be different, and it's nice to have a method to just calculate it +/// without modifying anything anyway. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let inputs_only: Option = parse_query_param(&request, "inputs_only")?; + let retention_period: Option = parse_query_param(&request, "retention_period")?; + let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = mgr::get_tenant(tenant_id, true) @@ -492,24 +502,29 @@ async fn tenant_size_handler(request: Request) -> Result, A // this can be long operation let inputs = tenant - .gather_size_inputs(&ctx) + .gather_size_inputs(retention_period, &ctx) .await .map_err(ApiError::InternalServerError)?; - let size = if !inputs_only.unwrap_or(false) { - Some( - tenant - .calc_and_update_cached_synthetic_size(&inputs) - .map_err(ApiError::InternalServerError)?, - ) - } else { - None - }; + let mut sizes = None; + if !inputs_only.unwrap_or(false) { + let storage_model = inputs + .calculate_model() + .map_err(ApiError::InternalServerError)?; + let size = storage_model.calculate(); - /// Private response type with the additional "unstable" `inputs` field. - /// - /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is - /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`. 
+ // If request header expects html, return html + if headers["Accept"] == "text/html" { + return synthetic_size_html_response(inputs, storage_model, size); + } + sizes = Some(size); + } else if headers["Accept"] == "text/html" { + return Err(ApiError::BadRequest(anyhow!( + "inputs_only parameter is incompatible with html output request" + ))); + } + + /// The type resides in the pageserver not to expose `ModelInputs`. #[serde_with::serde_as] #[derive(serde::Serialize)] struct TenantHistorySize { @@ -519,6 +534,9 @@ async fn tenant_size_handler(request: Request) -> Result, A /// /// Will be none if `?inputs_only=true` was given. size: Option, + /// Size of each segment used in the model. + /// Will be null if `?inputs_only=true` was given. + segment_sizes: Option>, inputs: crate::tenant::size::ModelInputs, } @@ -526,7 +544,8 @@ async fn tenant_size_handler(request: Request) -> Result, A StatusCode::OK, TenantHistorySize { id: tenant_id, - size, + size: sizes.as_ref().map(|x| x.total_size), + segment_sizes: sizes.map(|x| x.segments), inputs, }, ) @@ -591,6 +610,62 @@ async fn evict_timeline_layer_handler(request: Request) -> Result Result, ApiError> { + let mut timeline_ids: Vec = Vec::new(); + let mut timeline_map: HashMap = HashMap::new(); + for (index, ti) in inputs.timeline_inputs.iter().enumerate() { + timeline_map.insert(ti.timeline_id, index); + timeline_ids.push(ti.timeline_id.to_string()); + } + let seg_to_branch: Vec = inputs + .segments + .iter() + .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .collect(); + + let svg = + tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes) + .map_err(ApiError::InternalServerError)?; + + let mut response = String::new(); + + use std::fmt::Write; + write!(response, "\n\n").unwrap(); + write!(response, "
\n{svg}\n
").unwrap(); + writeln!(response, "Project size: {}", sizes.total_size).unwrap(); + writeln!(response, "
").unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&inputs).unwrap()
+    )
+    .unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&sizes.segments).unwrap()
+    )
+    .unwrap();
+    writeln!(response, "
").unwrap(); + write!(response, "\n\n").unwrap(); + + html_response(StatusCode::OK, response) +} + +pub fn html_response(status: StatusCode, data: String) -> Result, ApiError> { + let response = Response::builder() + .status(status) + .header(hyper::header::CONTENT_TYPE, "text/html") + .body(Body::from(data.as_bytes().to_vec())) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} + // Helper function to standardize the error messages we produce on bad durations // // Intended to be used with anyhow's `with_context`, e.g.: @@ -1019,7 +1094,7 @@ pub fn make_router( .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .get("/v1/tenant/:tenant_id", tenant_status) - .get("/v1/tenant/:tenant_id/size", tenant_size_handler) + .get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler) .put("/v1/tenant/config", update_tenant_config_handler) .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler) .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 23210b98d5..9e9c98ad62 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2418,6 +2418,9 @@ impl Tenant { #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] pub async fn gather_size_inputs( &self, + // `max_retention_period` overrides the cutoff that is used to calculate the size + // (only if it is shorter than the real cutoff). + max_retention_period: Option, ctx: &RequestContext, ) -> anyhow::Result { let logical_sizes_at_once = self @@ -2425,32 +2428,41 @@ impl Tenant { .concurrent_tenant_size_logical_size_queries .inner(); - // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries - // are for testing/experimenting, we tolerate this. + // TODO: Having a single mutex block concurrent reads is not great for performance. 
+ // + // But the only case where we need to run multiple of these at once is when we + // request a size for a tenant manually via API, while another background calculation + // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await + size::gather_inputs( + self, + logical_sizes_at_once, + max_retention_period, + &mut shared_cache, + ctx, + ) + .await } - /// Calculate synthetic tenant size + /// Calculate synthetic tenant size and cache the result. /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { - let inputs = self.gather_size_inputs(ctx).await?; + let inputs = self.gather_size_inputs(None, ctx).await?; - self.calc_and_update_cached_synthetic_size(&inputs) - } - - /// Calculate synthetic size , cache it and set metric value - pub fn calc_and_update_cached_synthetic_size( - &self, - inputs: &size::ModelInputs, - ) -> anyhow::Result { let size = inputs.calculate()?; + self.set_cached_synthetic_size(size); + + Ok(size) + } + + /// Cache given synthetic size and update the metric value + pub fn set_cached_synthetic_size(&self, size: u64) { self.cached_synthetic_tenant_size .store(size, Ordering::Relaxed); @@ -2458,8 +2470,6 @@ impl Tenant { .get_metric_with_label_values(&[&self.tenant_id.to_string()]) .unwrap() .set(size); - - Ok(size) } pub fn get_cached_synthetic_size(&self) -> u64 { diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 2fed4f88b3..2c5efe283b 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -1,8 +1,9 @@ use std::cmp; +use std::collections::hash_map::Entry; use std::collections::{HashMap, 
HashSet}; use std::sync::Arc; -use anyhow::Context; +use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; @@ -10,35 +11,80 @@ use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::Tenant; +use crate::tenant::Timeline; use utils::id::TimelineId; use utils::lsn::Lsn; use tracing::*; +use tenant_size_model::{Segment, StorageModel}; + /// Inputs to the actual tenant sizing model /// /// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to /// be a transferrable format between execution environments and developer. +/// +/// This tracks more information than the actual StorageModel that calculation +/// needs. We will convert this into a StorageModel when it's time to perform +/// the calculation. +/// #[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct ModelInputs { - updates: Vec, - retention_period: u64, + pub segments: Vec, + pub timeline_inputs: Vec, +} - /// Relevant lsns per timeline. - /// - /// This field is not required for deserialization purposes, which is mostly used in tests. The - /// LSNs explain the outcome (updates) but are not needed in size calculation. - #[serde_as(as = "HashMap")] - #[serde(default)] - timeline_inputs: HashMap, +/// A [`Segment`], with some extra information for display purposes +#[serde_with::serde_as] +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct SegmentMeta { + pub segment: Segment, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + pub kind: LsnKind, +} + +impl SegmentMeta { + fn size_needed(&self) -> bool { + match self.kind { + LsnKind::BranchStart => { + // If we don't have a later GcCutoff point on this branch, and + // no ancestor, calculate size for the branch start point. 
+ self.segment.needed && self.segment.parent.is_none() + } + LsnKind::BranchPoint => true, + LsnKind::GcCutOff => true, + LsnKind::BranchEnd => false, + } + } +} + +#[derive( + Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize, +)] +pub enum LsnKind { + /// A timeline starting here + BranchStart, + /// A child timeline branches off from here + BranchPoint, + /// GC cutoff point + GcCutOff, + /// Last record LSN + BranchEnd, } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as /// part of [`ModelInputs`] from the HTTP api, explaining the inputs. #[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] -struct TimelineInputs { +pub struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + + #[serde_as(as = "Option")] + pub ancestor_id: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] ancestor_lsn: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] @@ -49,118 +95,14 @@ struct TimelineInputs { horizon_cutoff: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] pitr_cutoff: Lsn, + + /// Cutoff point based on GC settings #[serde_as(as = "serde_with::DisplayFromStr")] next_gc_cutoff: Lsn, -} -// Adjust BranchFrom sorting so that we always process ancestor -// before descendants. This is needed to correctly calculate size of -// descendant timelines. -// -// Note that we may have multiple BranchFroms at the same LSN, so we -// need to sort them in the tree order. 
-// -// see updates_sort_with_branches_at_same_lsn test below -fn sort_updates_in_tree_order(updates: Vec) -> anyhow::Result> { - let mut sorted_updates = Vec::with_capacity(updates.len()); - let mut known_timelineids = HashSet::new(); - let mut i = 0; - while i < updates.len() { - let curr_upd = &updates[i]; - - if let Command::BranchFrom(parent_id) = curr_upd.command { - let parent_id = match parent_id { - Some(parent_id) if known_timelineids.contains(&parent_id) => { - // we have already processed ancestor - // process this BranchFrom Update normally - known_timelineids.insert(curr_upd.timeline_id); - sorted_updates.push(*curr_upd); - i += 1; - continue; - } - None => { - known_timelineids.insert(curr_upd.timeline_id); - sorted_updates.push(*curr_upd); - i += 1; - continue; - } - Some(parent_id) => parent_id, - }; - - let mut j = i; - - // we have not processed ancestor yet. - // there is a chance that it is at the same Lsn - if !known_timelineids.contains(&parent_id) { - let mut curr_lsn_branchfroms: HashMap> = - HashMap::new(); - - // inspect all branchpoints at the same lsn - while j < updates.len() && updates[j].lsn == curr_upd.lsn { - let lookahead_upd = &updates[j]; - j += 1; - - if let Command::BranchFrom(lookahead_parent_id) = lookahead_upd.command { - match lookahead_parent_id { - Some(lookahead_parent_id) - if !known_timelineids.contains(&lookahead_parent_id) => - { - // we have not processed ancestor yet - // store it for later - let es = - curr_lsn_branchfroms.entry(lookahead_parent_id).or_default(); - es.push((lookahead_upd.timeline_id, j)); - } - _ => { - // we have already processed ancestor - // process this BranchFrom Update normally - known_timelineids.insert(lookahead_upd.timeline_id); - sorted_updates.push(*lookahead_upd); - } - } - } - } - - // process BranchFroms in the tree order - // check that we don't have a cycle if somet entry is orphan - // (this should not happen, but better to be safe) - let mut processed_some_entry = true; - while 
processed_some_entry { - processed_some_entry = false; - - curr_lsn_branchfroms.retain(|parent_id, branchfroms| { - if known_timelineids.contains(parent_id) { - for (timeline_id, j) in branchfroms { - known_timelineids.insert(*timeline_id); - sorted_updates.push(updates[*j - 1]); - } - processed_some_entry = true; - false - } else { - true - } - }); - } - - if !curr_lsn_branchfroms.is_empty() { - // orphans are expected to be rare and transient between tenant reloads - // for example, an broken ancestor without the child branch being broken. - anyhow::bail!( - "orphan branch(es) detected in BranchFroms: {curr_lsn_branchfroms:?}" - ); - } - } - - assert!(j > i); - i = j; - } else { - // not a BranchFrom, keep the same order - sorted_updates.push(*curr_upd); - i += 1; - } - } - - Ok(sorted_updates) + /// Cutoff point calculated from the user-supplied 'max_retention_period' + #[serde_as(as = "Option")] + retention_param_cutoff: Option, } /// Gathers the inputs for the tenant sizing model. @@ -181,257 +123,257 @@ fn sort_updates_in_tree_order(updates: Vec) -> anyhow::Result, + max_retention_period: Option, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, ctx: &RequestContext, ) -> anyhow::Result { - // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to - // our advantage with `?` error handling. - let mut joinset = tokio::task::JoinSet::new(); - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; + // Collect information about all the timelines let timelines = tenant.list_timelines(); if timelines.is_empty() { // perhaps the tenant has just been created, and as such doesn't have any data yet return Ok(ModelInputs { - updates: vec![], - retention_period: 0, - timeline_inputs: HashMap::default(), + segments: vec![], + timeline_inputs: Vec::new(), }); } + // Build a map of branch points. 
+ let mut branchpoints: HashMap> = HashMap::new(); + for timeline in timelines.iter() { + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + branchpoints + .entry(ancestor_id) + .or_default() + .insert(timeline.get_ancestor_lsn()); + } + } + + // These become the final result. + let mut timeline_inputs = Vec::with_capacity(timelines.len()); + let mut segments: Vec = Vec::new(); + + // + // Build Segments representing each timeline. As we do that, also remember + // the branchpoints and branch startpoints in 'branchpoint_segments' and + // 'branchstart_segments' + // + + // BranchPoint segments of each timeline + // (timeline, branchpoint LSN) -> segment_id + let mut branchpoint_segments: HashMap<(TimelineId, Lsn), usize> = HashMap::new(); + + // timeline, Branchpoint seg id, (ancestor, ancestor LSN) + type BranchStartSegment = (TimelineId, usize, Option<(TimelineId, Lsn)>); + let mut branchstart_segments: Vec = Vec::new(); + + for timeline in timelines.iter() { + let timeline_id = timeline.timeline_id; + let last_record_lsn = timeline.get_last_record_lsn(); + let ancestor_lsn = timeline.get_ancestor_lsn(); + + // there's a race between the update (holding tenant.gc_lock) and this read but it + // might not be an issue, because it's not for Timeline::gc + let gc_info = timeline.gc_info.read().unwrap(); + + // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a + // new gc run, which we have no control over. however differently from `Timeline::gc` + // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not + // actually removing files. + let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + + // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
+ let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { + let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); + if next_gc_cutoff < param_cutoff { + next_gc_cutoff = param_cutoff; + } + Some(param_cutoff) + } else { + None + }; + + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // want to query any logical size before initdb_lsn. + let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); + + // Build "interesting LSNs" on this timeline + let mut lsns: Vec<(Lsn, LsnKind)> = gc_info + .retain_lsns + .iter() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + // this assumes there are no other retain_lsns than the branchpoints + .map(|lsn| (lsn, LsnKind::BranchPoint)) + .collect::>(); + + // Add branch points we collected earlier, just in case there were any that were + // not present in retain_lsns. We will remove any duplicates below later. + if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { + lsns.extend( + this_branchpoints + .iter() + .map(|lsn| (*lsn, LsnKind::BranchPoint)), + ) + } + + // Add a point for the GC cutoff + let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + if !branch_start_needed { + lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + } + + lsns.sort_unstable(); + lsns.dedup(); + + // + // Create Segments for the interesting points. + // + + // Timeline start point + let ancestor = timeline + .get_ancestor_timeline_id() + .map(|ancestor_id| (ancestor_id, ancestor_lsn)); + branchstart_segments.push((timeline_id, segments.len(), ancestor)); + segments.push(SegmentMeta { + segment: Segment { + parent: None, // filled in later + lsn: branch_start_lsn.0, + size: None, // filled in later + needed: branch_start_needed, + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::BranchStart, + }); + + // GC cutoff point, and any branch points, i.e. points where + // other timelines branch off from this timeline. 
+ let mut parent = segments.len() - 1; + for (lsn, kind) in lsns { + if kind == LsnKind::BranchPoint { + branchpoint_segments.insert((timeline_id, lsn), segments.len()); + } + segments.push(SegmentMeta { + segment: Segment { + parent: Some(parent), + lsn: lsn.0, + size: None, + needed: lsn > next_gc_cutoff, + }, + timeline_id: timeline.timeline_id, + kind, + }); + parent += 1; + } + + // Current end of the timeline + segments.push(SegmentMeta { + segment: Segment { + parent: Some(parent), + lsn: last_record_lsn.0, + size: None, // Filled in later, if necessary + needed: true, + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::BranchEnd, + }); + + timeline_inputs.push(TimelineInputs { + timeline_id: timeline.timeline_id, + ancestor_id: timeline.get_ancestor_timeline_id(), + ancestor_lsn, + last_record: last_record_lsn, + // this is not used above, because it might not have updated recently enough + latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + horizon_cutoff: gc_info.horizon_cutoff, + pitr_cutoff: gc_info.pitr_cutoff, + next_gc_cutoff, + retention_param_cutoff, + }); + } + + // We now have all segments from the timelines in 'segments'. The timelines + // haven't been linked to each other yet, though. Do that. + for (_timeline_id, seg_id, ancestor) in branchstart_segments { + // Look up the branch point + if let Some(ancestor) = ancestor { + let parent_id = *branchpoint_segments.get(&ancestor).unwrap(); + segments[seg_id].segment.parent = Some(parent_id); + } + } + + // We left the 'size' field empty in all of the Segments so far. + // Now find logical sizes for all of the points that might need or benefit from them. 
+ fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?; + + Ok(ModelInputs { + segments, + timeline_inputs, + }) +} + +/// Augment 'segments' with logical sizes +/// +/// this will probably conflict with on-demand downloaded layers, or at least force them all +/// to be downloaded +/// +async fn fill_logical_sizes( + timelines: &[Arc], + segments: &mut [SegmentMeta], + limit: &Arc, + logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + ctx: &RequestContext, +) -> anyhow::Result<()> { + let timeline_hash: HashMap> = HashMap::from_iter( + timelines + .iter() + .map(|timeline| (timeline.timeline_id, Arc::clone(timeline))), + ); + // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually // require new lsns to be inspected. - let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new(); + let mut sizes_needed = HashMap::<(TimelineId, Lsn), Option>::new(); - let mut updates = Vec::new(); + // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to + // our advantage with `?` error handling. + let mut joinset = tokio::task::JoinSet::new(); - // record the per timeline values useful to debug the model inputs, also used to track - // ancestor_lsn without keeping a hold of Timeline - let mut timeline_inputs = HashMap::with_capacity(timelines.len()); - - // used to determine the `retention_period` for the size model - let mut max_cutoff_distance = None; - - // mapping from (TimelineId, Lsn) => if this branch point has been handled already via - // GcInfo::retain_lsns or if it needs to have its logical_size calculated. 
- let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new(); - - for timeline in timelines { - if !timeline.is_active() { - anyhow::bail!( - "timeline {} is not active, cannot calculate tenant_size now", - timeline.timeline_id - ); + // For each point that would benefit from having a logical size available, + // spawn a Task to fetch it, unless we have it cached already. + for seg in segments.iter() { + if !seg.size_needed() { + continue; } - let last_record_lsn = timeline.get_last_record_lsn(); + let timeline_id = seg.timeline_id; + let lsn = Lsn(seg.segment.lsn); - let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { - // there's a race between the update (holding tenant.gc_lock) and this read but it - // might not be an issue, because it's not for Timeline::gc - let gc_info = timeline.gc_info.read().unwrap(); - - // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a - // new gc run, which we have no control over. however differently from `Timeline::gc` - // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not - // actually removing files. - let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); - - // the minimum where we should find the next_gc_cutoff for our calculations. - // - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we - // want to query any logical size before initdb_lsn. 
- let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn); - - let maybe_cutoff = if next_gc_cutoff > cutoff_minimum { - Some((next_gc_cutoff, LsnKind::GcCutOff)) - } else { - None - }; - - // this assumes there are no other lsns than the branchpoints - let lsns = gc_info - .retain_lsns - .iter() - .inspect(|&&lsn| { - trace!( - timeline_id=%timeline.timeline_id, - "retained lsn: {lsn:?}, is_before_ancestor_lsn={}", - lsn < timeline.get_ancestor_lsn() - ) - }) - .filter(|&&lsn| lsn > timeline.get_ancestor_lsn()) - .copied() - .map(|lsn| (lsn, LsnKind::BranchPoint)) - .chain(maybe_cutoff) - .collect::>(); - - ( - lsns, - gc_info.horizon_cutoff, - gc_info.pitr_cutoff, - next_gc_cutoff, - ) - }; - - // update this to have a retention_period later for the tenant_size_model - // tenant_size_model compares this to the last segments start_lsn - if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) { - match max_cutoff_distance.as_mut() { - Some(max) => { - *max = std::cmp::max(*max, cutoff_distance); - } - _ => { - max_cutoff_distance = Some(cutoff_distance); - } - } - } - - // all timelines branch from something, because it might be impossible to pinpoint - // which is the tenant_size_model's "default" branch. - - let ancestor_lsn = timeline.get_ancestor_lsn(); - - updates.push(Update { - lsn: ancestor_lsn, - command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), - timeline_id: timeline.timeline_id, - }); - - if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() { - // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches - // which are over gc_horizon. for example, a "main" branch which never received any - // updates apart from initdb not have branch points recorded. 
- referenced_branch_froms - .entry((parent_timeline_id, timeline.get_ancestor_lsn())) - .or_default(); - } - - for (lsn, _kind) in &interesting_lsns { - // mark this visited so don't need to re-process this parent - *referenced_branch_froms - .entry((timeline.timeline_id, *lsn)) - .or_default() = true; - - if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { - updates.push(Update { - lsn: *lsn, - timeline_id: timeline.timeline_id, - command: Command::Update(*size), - }); - - needed_cache.insert((timeline.timeline_id, *lsn)); - } else { - let timeline = Arc::clone(&timeline); + if let Entry::Vacant(e) = sizes_needed.entry((timeline_id, lsn)) { + let cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned(); + if cached_size.is_none() { + let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap()); let parallel_size_calcs = Arc::clone(limit); let ctx = ctx.attached_child(); joinset.spawn(calculate_logical_size( parallel_size_calcs, timeline, - *lsn, + lsn, ctx, )); } - } - - timeline_inputs.insert( - timeline.timeline_id, - TimelineInputs { - ancestor_lsn, - last_record: last_record_lsn, - // this is not used above, because it might not have updated recently enough - latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, - }, - ); - } - - // iterate over discovered branch points and make sure we are getting logical sizes at those - // points. - for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() { - if *handled { - continue; - } - - let timeline_id = *timeline_id; - let lsn = *lsn; - - match timeline_inputs.get(&timeline_id) { - Some(inputs) if inputs.ancestor_lsn == lsn => { - // we don't need an update at this branch point which is also point where - // timeline_id branch was branched from. 
- continue; - } - Some(_) => {} - None => { - // we should have this because we have iterated through all of the timelines - anyhow::bail!("missing timeline_input for {timeline_id}") - } - } - - if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) { - updates.push(Update { - lsn, - timeline_id, - command: Command::Update(*size), - }); - - needed_cache.insert((timeline_id, lsn)); - } else { - let timeline = tenant - .get_timeline(timeline_id, false) - .context("find referenced ancestor timeline")?; - let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size( - parallel_size_calcs, - timeline.clone(), - lsn, - ctx.attached_child(), - )); - - if let Some(parent_id) = timeline.get_ancestor_timeline_id() { - // we should not find new ones because we iterated tenants all timelines - anyhow::ensure!( - timeline_inputs.contains_key(&parent_id), - "discovered new timeline {parent_id} (parent of {timeline_id})" - ); - } - }; - } - - // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch - // point. this is needed by the model. - for (timeline_id, inputs) in timeline_inputs.iter() { - let lsn = inputs.last_record; - - if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) { - // this means that the (timeline_id, last_record_lsn) represents a branch point - // we do not want to add EndOfBranch updates for these points because it doesn't fit - // into the current tenant_size_model. 
- continue; - } - - if lsn > inputs.ancestor_lsn { - // all timelines also have an end point if they have made any progress - updates.push(Update { - lsn, - command: Command::EndOfBranch, - timeline_id: *timeline_id, - }); + e.insert(cached_size); } } + // Perform the size lookups let mut have_any_error = false; - while let Some(res) = joinset.join_next().await { // each of these come with Result, JoinError> // because of spawn + spawn_blocking @@ -460,19 +402,13 @@ pub(super) async fn gather_inputs( debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); logical_size_cache.insert((timeline.timeline_id, lsn), size); - needed_cache.insert((timeline.timeline_id, lsn)); - - updates.push(Update { - lsn, - timeline_id: timeline.timeline_id, - command: Command::Update(size), - }); + sizes_needed.insert((timeline.timeline_id, lsn), Some(size)); } } } // prune any keys not needed anymore; we record every used key and added key. - logical_size_cache.retain(|key, _| needed_cache.contains(key)); + logical_size_cache.retain(|key, _| sizes_needed.contains_key(key)); if have_any_error { // we cannot complete this round, because we are missing data. @@ -480,105 +416,47 @@ pub(super) async fn gather_inputs( anyhow::bail!("failed to calculate some logical_sizes"); } - // the data gathered to updates is per lsn, regardless of the branch, so we can use it to - // our advantage, not requiring a sorted container or graph walk. - // - // for branch points, which come as multiple updates at the same LSN, the Command::Update - // is needed before a branch is made out of that branch Command::BranchFrom. this is - // handled by the variant order in `Command`. - // - updates.sort_unstable(); - - // And another sort to handle Command::BranchFrom ordering - // in case when there are multiple branches at the same LSN. 
- let sorted_updates = sort_updates_in_tree_order(updates)?; - - let retention_period = match max_cutoff_distance { - Some(max) => max.0, - None => { - anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0") + // Insert the looked up sizes to the Segments + for seg in segments.iter_mut() { + if !seg.size_needed() { + continue; } - }; - Ok(ModelInputs { - updates: sorted_updates, - retention_period, - timeline_inputs, - }) + let timeline_id = seg.timeline_id; + let lsn = Lsn(seg.segment.lsn); + + if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { + seg.segment.size = Some(*size); + } else { + bail!("could not find size at {} in timeline {}", lsn, timeline_id); + } + } + Ok(()) } impl ModelInputs { + pub fn calculate_model(&self) -> anyhow::Result { + // Convert SegmentMetas into plain Segments + let storage = StorageModel { + segments: self + .segments + .iter() + .map(|seg| seg.segment.clone()) + .collect(), + }; + + Ok(storage) + } + + // calculate total project size pub fn calculate(&self) -> anyhow::Result { - // Option is used for "naming" the branches because it is assumed to be - // impossible to always determine the a one main branch. - let mut storage = tenant_size_model::Storage::>::new(None); + let storage = self.calculate_model()?; + let sizes = storage.calculate(); - for update in &self.updates { - let Update { - lsn, - command: op, - timeline_id, - } = update; - - let Lsn(now) = *lsn; - match op { - Command::Update(sz) => { - storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?; - } - Command::EndOfBranch => { - storage.insert_point(&Some(*timeline_id), "".into(), now, None)?; - } - Command::BranchFrom(parent) => { - // This branch command may fail if it cannot find a parent to branch from. 
- storage.branch(parent, Some(*timeline_id))?; - } - } - } - - Ok(storage.calculate(self.retention_period)?.total_children()) + Ok(sizes.total_size) } } -/// A point of interest in the tree of branches -#[serde_with::serde_as] -#[derive( - Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize, -)] -struct Update { - #[serde_as(as = "serde_with::DisplayFromStr")] - lsn: utils::lsn::Lsn, - command: Command, - #[serde_as(as = "serde_with::DisplayFromStr")] - timeline_id: TimelineId, -} - -#[serde_with::serde_as] -#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)] -#[serde(rename_all = "snake_case")] -enum Command { - Update(u64), - BranchFrom(#[serde_as(as = "Option")] Option), - EndOfBranch, -} - -impl std::fmt::Debug for Command { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3 - // linebreaks - match self { - Self::Update(arg0) => write!(f, "Update({arg0})"), - Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"), - Self::EndOfBranch => write!(f, "EndOfBranch"), - } - } -} - -#[derive(Debug, Clone, Copy)] -enum LsnKind { - BranchPoint, - GcCutOff, -} - /// Newtype around the tuple that carries the timeline at lsn logical size calculation. 
struct TimelineAtLsnSizeResult( Arc, @@ -604,227 +482,230 @@ async fn calculate_logical_size( Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } -#[test] -fn updates_sort() { - use std::str::FromStr; - use utils::id::TimelineId; - use utils::lsn::Lsn; - - let ids = [ - TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(), - TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(), - TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(), - ]; - - // try through all permutations - let ids = [ - [&ids[0], &ids[1], &ids[2]], - [&ids[0], &ids[2], &ids[1]], - [&ids[1], &ids[0], &ids[2]], - [&ids[1], &ids[2], &ids[0]], - [&ids[2], &ids[0], &ids[1]], - [&ids[2], &ids[1], &ids[0]], - ]; - - for ids in ids { - // apply a fixture which uses a permutation of ids - let commands = [ - Update { - lsn: Lsn(0), - command: Command::BranchFrom(None), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/67E7618").unwrap(), - command: Command::Update(43696128), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/67E7618").unwrap(), - command: Command::BranchFrom(Some(*ids[0])), - timeline_id: *ids[1], - }, - Update { - lsn: Lsn::from_str("0/76BE4F0").unwrap(), - command: Command::Update(41844736), - timeline_id: *ids[1], - }, - Update { - lsn: Lsn::from_str("0/10E49380").unwrap(), - command: Command::Update(42164224), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/10E49380").unwrap(), - command: Command::BranchFrom(Some(*ids[0])), - timeline_id: *ids[2], - }, - Update { - lsn: Lsn::from_str("0/11D74910").unwrap(), - command: Command::Update(42172416), - timeline_id: *ids[2], - }, - Update { - lsn: Lsn::from_str("0/12051E98").unwrap(), - command: Command::Update(42196992), - timeline_id: *ids[0], - }, - ]; - - let mut sorted = commands; - - // these must sort in the same order, regardless of how the ids sort - // which is why the timeline_id is the last field - sorted.sort_unstable(); - - assert_eq!(commands, 
sorted, "{:#?} vs. {:#?}", commands, sorted); - } -} - #[test] fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way // it has the stable lsn's // - // timelineinputs have been left out, because those explain the inputs, but don't participate - // in further size calculations. - let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#; - + // The timeline_inputs don't participate in the size calculation, and are here just to explain + // the inputs. 
+ let doc = r#" +{ + "segments": [ + { + "segment": { + "parent": 9, + "lsn": 26033560, + "size": null, + "needed": false + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 0, + "lsn": 35720400, + "size": 25206784, + "needed": false + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 1, + "lsn": 35851472, + "size": null, + "needed": true + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "BranchEnd" + }, + { + "segment": { + "parent": 7, + "lsn": 24566168, + "size": null, + "needed": false + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 3, + "lsn": 25261936, + "size": 26050560, + "needed": false + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 4, + "lsn": 25393008, + "size": null, + "needed": true + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "BranchEnd" + }, + { + "segment": { + "parent": null, + "lsn": 23694408, + "size": null, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 6, + "lsn": 24566168, + "size": 25739264, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchPoint" + }, + { + "segment": { + "parent": 7, + "lsn": 25902488, + "size": 26402816, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 8, + "lsn": 26033560, + "size": 26468352, + "needed": true + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchPoint" + }, + { + "segment": { + "parent": 9, + "lsn": 26033560, + "size": null, + "needed": true + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchEnd" + } + ], + "timeline_inputs": [ + { + 
"timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "ancestor_lsn": "0/18D3D98", + "last_record": "0/2230CD0", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/2210CD0", + "pitr_cutoff": "0/2210CD0", + "next_gc_cutoff": "0/2210CD0", + "retention_param_cutoff": null + }, + { + "timeline_id": "454626700469f0a9914949b9d018e876", + "ancestor_lsn": "0/176D998", + "last_record": "0/1837770", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/1817770", + "pitr_cutoff": "0/1817770", + "next_gc_cutoff": "0/1817770", + "retention_param_cutoff": null + }, + { + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "ancestor_lsn": "0/0", + "last_record": "0/18D3D98", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/18B3D98", + "pitr_cutoff": "0/18B3D98", + "next_gc_cutoff": "0/18B3D98", + "retention_param_cutoff": null + } + ] +} +"#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); - assert_eq!(inputs.calculate().unwrap(), 36_409_872); + assert_eq!(inputs.calculate().unwrap(), 37_851_408); } #[test] -fn updates_sort_with_branches_at_same_lsn() { - use std::str::FromStr; - use Command::{BranchFrom, EndOfBranch}; - - macro_rules! 
lsn { - ($e:expr) => { - Lsn::from_str($e).unwrap() - }; +fn verify_size_for_one_branch() { + let doc = r#" +{ + "segments": [ + { + "segment": { + "parent": null, + "lsn": 0, + "size": null, + "needed": false + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 0, + "lsn": 305547335776, + "size": 220054675456, + "needed": false + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 1, + "lsn": 305614444640, + "size": null, + "needed": true + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "BranchEnd" } + ], + "timeline_inputs": [ + { + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "ancestor_lsn": "0/0", + "last_record": "47/280A5860", + "latest_gc_cutoff": "47/240A5860", + "horizon_cutoff": "47/240A5860", + "pitr_cutoff": "47/240A5860", + "next_gc_cutoff": "47/240A5860", + "retention_param_cutoff": "0/0" + } + ] +}"#; - let ids = [ - TimelineId::from_str("00000000000000000000000000000000").unwrap(), - TimelineId::from_str("11111111111111111111111111111111").unwrap(), - TimelineId::from_str("22222222222222222222222222222222").unwrap(), - TimelineId::from_str("33333333333333333333333333333333").unwrap(), - TimelineId::from_str("44444444444444444444444444444444").unwrap(), - ]; + let model: ModelInputs = serde_json::from_str(doc).unwrap(); - // issue https://github.com/neondatabase/neon/issues/3179 - let commands = vec![ - Update { - lsn: lsn!("0/0"), - command: BranchFrom(None), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: Command::Update(25387008), - timeline_id: ids[0], - }, - // next three are wrongly sorted, because - // ids[1] is branched from before ids[1] exists - // and ids[2] is branched from before ids[2] exists - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/169AD58"), - command: 
BranchFrom(Some(ids[0])), - timeline_id: ids[2], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[2])), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CA85B8"), - command: Command::Update(28925952), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: Command::Update(29024256), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[4], - }, - Update { - lsn: lsn!("0/22DCE70"), - command: Command::Update(32546816), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/230CE70"), - command: EndOfBranch, - timeline_id: ids[3], - }, - ]; + let res = model.calculate_model().unwrap().calculate(); - let expected = vec![ - Update { - lsn: lsn!("0/0"), - command: BranchFrom(None), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: Command::Update(25387008), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[0])), - timeline_id: ids[2], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[2])), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/1CA85B8"), - command: Command::Update(28925952), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: Command::Update(29024256), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[4], - }, - Update { - lsn: lsn!("0/22DCE70"), - command: Command::Update(32546816), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/230CE70"), - command: EndOfBranch, - timeline_id: ids[3], - }, - ]; + println!("calculated synthetic size: {}", res.total_size); + println!("result: {:?}", serde_json::to_string(&res.segments)); - let sorted_commands = sort_updates_in_tree_order(commands).unwrap(); - - assert_eq!(sorted_commands, expected); + use 
utils::lsn::Lsn; + let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap(); + let last_lsn: Lsn = "47/280A5860".parse().unwrap(); + println!( + "latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}", + u64::from(latest_gc_cutoff_lsn), + u64::from(last_lsn) + ); + assert_eq!(res.total_size, 220121784320); } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index c943bf0a27..21c6ede27e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -27,8 +27,7 @@ use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; -use std::os::fd::RawFd; -use std::os::unix::io::AsRawFd; +use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b35252243e..0620ad8a35 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1217,7 +1217,7 @@ class PageserverHttpClient(requests.Session): """ Returns the tenant size, together with the model inputs as the second tuple item. 
""" - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size") + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size") self.verbose_error(res) res = res.json() assert isinstance(res, dict) @@ -1228,6 +1228,16 @@ class PageserverHttpClient(requests.Session): assert type(inputs) is dict return (size, inputs) + def tenant_size_debug(self, tenant_id: TenantId) -> str: + """ + Returns the tenant size debug info, as an HTML string + """ + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size", + headers={"Accept": "text/html"}, + ) + return res.text + def timeline_list( self, tenant_id: TenantId, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index bb3bca8782..8c2996f491 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,13 +1,13 @@ -from typing import Any, List, Tuple +from pathlib import Path +from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.types import Lsn -def test_empty_tenant_size(neon_simple_env: NeonEnv): +def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): env = neon_simple_env (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() @@ -18,6 +18,9 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): main_branch_name = "main" + branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + assert branch_name == main_branch_name + with env.postgres.create_start( main_branch_name, tenant_id=tenant_id, @@ -39,12 +42,44 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) assert size == initial_size, "tenant_size should not be affected by shutdown of compute" - 
expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"] - actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"])) # type: ignore - assert actual_commands == expected_commands + expected_inputs = { + "segments": [ + { + "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, + "timeline_id": f"{main_timeline_id}", + "kind": "BranchStart", + }, + { + "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, + "timeline_id": f"{main_timeline_id}", + "kind": "BranchEnd", + }, + ], + "timeline_inputs": [ + { + "timeline_id": f"{main_timeline_id}", + "ancestor_id": None, + "ancestor_lsn": "0/0", + "last_record": "0/1698CC0", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/0", + "pitr_cutoff": "0/0", + "next_gc_cutoff": "0/0", + "retention_param_cutoff": None, + } + ], + } + expected_inputs = mask_model_inputs(expected_inputs) + actual_inputs = mask_model_inputs(inputs) + + assert expected_inputs == actual_inputs + + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) -def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): +def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): """ Issue found in production. 
Because the ancestor branch was under gc_horizon, the branchpoint was "dangling" and the computation could not be @@ -75,8 +110,12 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): assert size_after_branching > initial_size + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) -def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): + +def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_output_dir: Path): """ More general version of test_branched_empty_timeline_size @@ -128,9 +167,13 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): size_after_writes = http_client.tenant_size(tenant_id) assert size_after_writes > initial_size + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_branch_point_within_horizon(neon_simple_env: NeonEnv): +def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -167,9 +210,13 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv): assert size_before_branching < size_after + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_parent_within_horizon(neon_simple_env: NeonEnv): +def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -179,7 +226,7 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv): """ env = neon_simple_env - gc_horizon = 200_000 + gc_horizon = 5_000 (tenant_id, main_id) = 
env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) http_client = env.pageserver.http_client() @@ -212,9 +259,13 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv): assert size_before_branching < size_after + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_only_heads_within_horizon(neon_simple_env: NeonEnv): +def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small @@ -253,8 +304,14 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv): latest_size = size_now + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) -def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): + +def test_single_branch_get_tenant_size_grows( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): """ Operate on single branch reading the tenants size after each transaction. 
""" @@ -279,7 +336,20 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): collected_responses: List[Tuple[Lsn, int]] = [] + size_debug_file = open(test_output_dir / "size_debug.html", "w") + + def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int): + if current_lsn - initdb_lsn > gc_horizon: + assert ( + size >= prev + ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size" + else: + assert ( + size > prev + ), "tenant_size should grow, because we continue to add WAL to initial snapshot size" + with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) with pg.cursor() as cur: cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") @@ -297,13 +367,19 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - size = http_client.tenant_size(tenant_id) + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) if len(collected_responses) > 0: prev = collected_responses[-1][1] if size == 0: assert prev == 0 else: + # branch start shouldn't be past gc_horizon yet + # thus the size should grow as we insert more data + assert current_lsn - initdb_lsn <= gc_horizon assert size > prev collected_responses.append((current_lsn, size)) @@ -323,9 +399,15 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - size = http_client.tenant_size(tenant_id) + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + prev = collected_responses[-1][1] - assert size > prev, "tenant_size should 
grow with updates" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) while True: @@ -340,9 +422,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): size = http_client.tenant_size(tenant_id) prev = collected_responses[-1][1] - assert ( - size > prev - ), "even though rows have been deleted, the tenant_size should increase" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) with pg.cursor() as cur: @@ -352,7 +434,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): size = http_client.tenant_size(tenant_id) prev = collected_responses[-1][1] - assert size > prev, "dropping table grows tenant_size" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) # this isn't too many lines to forget for a while. observed while @@ -364,24 +448,17 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() + size_debug_file.close() + size_after = http_client.tenant_size(tenant_id) prev = collected_responses[-1][1] assert size_after == prev, "size after restarting pageserver should not have changed" - ps_metrics = parse_metrics(http_client.get_metrics(), "pageserver") - tenant_metric_filter = { - "tenant_id": str(tenant_id), - } - tenant_size_metric = int( - ps_metrics.query_one("pageserver_tenant_synthetic_size", filter=tenant_metric_filter).value - ) - - assert tenant_size_metric == size_after, "API size value should be equal to metric size value" - - -def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder): +def test_get_tenant_size_with_multiple_branches( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): """ Reported size goes up while branches or rows are being added, goes down after removing branches. 
""" @@ -481,6 +558,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder size_after = http_client.tenant_size(tenant_id) assert size_after == size_after_thinning_branch + size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file_before.write(size_debug) + # teardown, delete branches, and the size should be going down http_client.timeline_delete(tenant_id, first_branch_timeline_id) @@ -493,3 +574,38 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder assert size_after_deleting_second < size_after_continuing_on_main assert size_after_deleting_second > size_after_first_branch + + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + + +# Helper for tests that compare timeline_inputs +# We don't want to compare the exact values, because they can be unstable +# and cause flaky tests. So replace the values with useful invariants. 
+def mask_model_inputs(x): + if isinstance(x, dict): + newx = {} + for k, v in x.items(): + if k == "size": + if v is None or v == 0: + # no change + newx[k] = v + elif v < 0: + newx[k] = "<0" + else: + newx[k] = ">0" + elif k.endswith("lsn") or k.endswith("cutoff") or k == "last_record": + if v is None or v == 0 or v == "0/0": + # no change + newx[k] = v + else: + newx[k] = "masked" + else: + newx[k] = mask_model_inputs(v) + return newx + elif isinstance(x, list): + newlist = [mask_model_inputs(v) for v in x] + return newlist + else: + return x From 7991bd3b6921ccdd13f0f38085127bbe282d4f26 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 16 Feb 2023 10:56:42 +0200 Subject: [PATCH 032/426] Fix periodic metric sending: don't reset timer on every iteration (#3617) Previously timer was reset on every collect_metrics_iteration and sending of cached metrics was never triggered. This is a follow-up for a69da4a7. --- pageserver/src/consumption_metrics.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index b078782a86..a6d1ec3632 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -75,7 +75,7 @@ pub async fn collect_metrics( // define client here to reuse it for all requests let client = reqwest::Client::new(); let mut cached_metrics: HashMap = HashMap::new(); - let mut prev_iteration_time: Option = None; + let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); loop { tokio::select! 
{ @@ -86,11 +86,11 @@ pub async fn collect_metrics( _ = ticker.tick() => { // send cached metrics every cached_metric_collection_interval - let send_cached = prev_iteration_time - .map(|x| x.elapsed() >= cached_metric_collection_interval) - .unwrap_or(false); + let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval; - prev_iteration_time = Some(std::time::Instant::now()); + if send_cached { + prev_iteration_time = std::time::Instant::now(); + } collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await; } From 5082d84f5ba099025e6a1ddd51051e6565b16d17 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 14 Feb 2023 15:46:43 +0100 Subject: [PATCH 033/426] Compile pgjwt extension --- Dockerfile.compute-node | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5c58f4baaa..f4479c46cb 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -182,6 +182,20 @@ RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control +######################################################################################### +# +# Layer "pgjwt-pg-build" +# compile pgjwt extension +# +######################################################################################### +FROM build-deps AS pgjwt-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN git clone https://github.com/michelp/pgjwt.git && \ + cd pgjwt && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -196,6 +210,7 
@@ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From f0b41e7750f9f16e524e35a5084ba17465db2112 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 16 Feb 2023 14:25:35 +0200 Subject: [PATCH 034/426] Propose less verbose way to build neon (#3624) Closes https://github.com/neondatabase/neon/issues/3518 and might help https://github.com/neondatabase/neon/issues/3611 and the future build attempts. Propose `-s` flag in the Readme when building via `make` command, to help people to spot build errors easier. --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 29389e7a5d..f8bc1b7736 100644 --- a/README.md +++ b/README.md @@ -83,9 +83,10 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. For a release build, -# use "BUILD_TYPE=release make -j`nproc`" +# use "BUILD_TYPE=release make -j`nproc` -s" +# Remove -s for the verbose build log -make -j`nproc` +make -j`nproc` -s ``` #### Building on OSX @@ -99,9 +100,10 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. 
For a release build, -# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s" +# Remove -s for the verbose build log -make -j`sysctl -n hw.logicalcpu` +make -j`sysctl -n hw.logicalcpu` -s ``` #### Dependency installation notes From 0cf7fd0fb82b082d02dfadd9d6a488a7f799d72f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 16 Feb 2023 15:36:13 +0200 Subject: [PATCH 035/426] Compaction with on-demand download (#3598) Repeatedly (twice) try to download the compaction targeted layers before actual compaction. Adds tests for both L0 compaction downloading layers and image creation downloading layers. Image creation support existed already. Fixes #3591 Co-authored-by: Christian Schwarz --- .../src/tenant/remote_timeline_client.rs | 5 +- pageserver/src/tenant/timeline.rs | 206 +++++++++++++++--- test_runner/regress/test_ondemand_download.py | 168 ++++++++++++++ 3 files changed, 345 insertions(+), 34 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 985b480a76..7049a0bd66 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -571,14 +571,15 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Launch a delete operation in the background. /// + /// The operation does not modify local state but assumes the local files have already been + /// deleted, and is used to mirror those changes to remote. + /// /// Note: This schedules an index file upload before the deletions. The /// deletion won't actually be performed, until any previously scheduled /// upload operations, and the index file upload, have completed /// succesfully. 
- /// pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7a5a9de2f4..e606cacf92 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -613,7 +613,10 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } + /// Outermost timeline compaction operation; downloads needed layers. pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { + const ROUNDS: usize = 2; + let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -622,6 +625,86 @@ impl Timeline { return Ok(()); } + // retry two times to allow first round to find layers which need to be downloaded, then + // download them, then retry compaction + for round in 0..ROUNDS { + // should we error out with the most specific error? + let last_round = round == ROUNDS - 1; + + let res = self.compact_inner(ctx).await; + + // If `create_image_layers' or `compact_level0` scheduled any + // uploads or deletions, but didn't update the index file yet, + // do it now. + // + // This isn't necessary for correctness, the remote state is + // consistent without the uploads and deletions, and we would + // update the index file on next flush iteration too. But it + // could take a while until that happens. + // + // Additionally, only do this on the terminal round before sleeping. 
+ if last_round { + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_index_upload_for_file_changes()?; + } + } + + let rls = match res { + Ok(()) => return Ok(()), + Err(CompactionError::DownloadRequired(rls)) if !last_round => { + // this can be done at most one time before exiting, waiting + rls + } + Err(CompactionError::DownloadRequired(rls)) => { + anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len()) + } + Err(CompactionError::Other(e)) => { + return Err(e); + } + }; + + // this path can be visited in the second round of retrying, if first one found that we + // must first download some remote layers + let total = rls.len(); + + let mut downloads = rls + .into_iter() + .map(|rl| self.download_remote_layer(rl)) + .collect::>(); + + let mut failed = 0; + + let cancelled = task_mgr::shutdown_watcher(); + tokio::pin!(cancelled); + + loop { + tokio::select! { + _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"), + res = downloads.next() => { + match res { + Some(Ok(())) => {}, + Some(Err(e)) => { + warn!("Downloading remote layer for compaction failed: {e:#}"); + failed += 1; + } + None => break, + } + } + } + } + + if failed != 0 { + anyhow::bail!("{failed} out of {total} layers failed to download, retrying later"); + } + + // if everything downloaded fine, lets try again + } + + unreachable!("retry loop exits") + } + + /// Compaction which might need to be retried after downloading remote layers. + async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> { // // High level strategy for compaction / image creation: // @@ -660,7 +743,7 @@ impl Timeline { // Is the timeline being deleted? 
let state = *self.state.borrow(); if state == TimelineState::Stopping { - anyhow::bail!("timeline is Stopping"); + return Err(anyhow::anyhow!("timeline is Stopping").into()); } let target_file_size = self.get_checkpoint_distance(); @@ -680,7 +763,8 @@ impl Timeline { // "enough". let layer_paths_to_upload = self .create_image_layers(&partitioning, lsn, false, ctx) - .await?; + .await + .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -692,18 +776,6 @@ impl Timeline { self.compact_level0(&layer_removal_cs, target_file_size, ctx) .await?; timer.stop_and_record(); - - // If `create_image_layers' or `compact_level0` scheduled any - // uploads or deletions, but didn't update the index file yet, - // do it now. - // - // This isn't necessary for correctness, the remote state is - // consistent without the uploads and deletions, and we would - // update the index file on next flush iteration too. But it - // could take a while until that happens. - if let Some(remote_client) = &self.remote_client { - remote_client.schedule_index_upload_for_file_changes()?; - } } Err(err) => { // no partitioning? 
This is normal, if the timeline was just created @@ -2541,10 +2613,13 @@ impl Timeline { ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 != Lsn(0) - && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold - { - // no repartitioning needed + let distance = lsn.0 - partitioning_guard.1 .0; + if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold { + debug!( + distance, + threshold = self.repartition_threshold, + "no repartitioning needed" + ); return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } @@ -2562,8 +2637,12 @@ impl Timeline { // Is it time to create a new image layer for the given partition? fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { + let threshold = self.get_image_creation_threshold(); + let layers = self.layers.read().unwrap(); + let mut max_deltas = 0; + for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; for (img_range, last_img) in image_coverage { @@ -2585,21 +2664,25 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. 
if img_lsn < lsn { - let threshold = self.get_image_creation_threshold(); let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; - debug!( - "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", - img_range.start, img_range.end, num_deltas, img_lsn, lsn - ); + max_deltas = max_deltas.max(num_deltas); if num_deltas >= threshold { + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); return Ok(true); } } } } + debug!( + max_deltas, + "none of the partitioned ranges had >= {threshold} deltas" + ); Ok(false) } @@ -2712,25 +2795,55 @@ impl Timeline { Ok(layer_paths_to_upload) } } + #[derive(Default)] struct CompactLevel0Phase1Result { new_layers: Vec, deltas_to_compact: Vec>, } +/// Top-level failure to compact. +#[derive(Debug)] +enum CompactionError { + /// L0 compaction requires layers to be downloaded. + /// + /// This should not happen repeatedly, but will be retried once by top-level + /// `Timeline::compact`. + DownloadRequired(Vec>), + /// Compaction cannot be done right now; page reconstruction and so on. + Other(anyhow::Error), +} + +impl From for CompactionError { + fn from(value: anyhow::Error) -> Self { + CompactionError::Other(value) + } +} + impl Timeline { + /// Level0 files first phase of compaction, explained in the [`compact_inner`] comment. + /// + /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are + /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the + /// start of level0 files compaction, the on-demand download should be revisited as well. 
async fn compact_level0_phase1( &self, + _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, target_file_size: u64, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let layers = self.layers.read().unwrap(); let mut level0_deltas = layers.get_level0_deltas()?; drop(layers); // Only compact if enough layers have accumulated. - if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { - return Ok(Default::default()); + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); } // Gather the files to compact in this iteration. @@ -2766,6 +2879,24 @@ impl Timeline { end: deltas_to_compact.last().unwrap().get_lsn_range().end, }; + let remotes = deltas_to_compact + .iter() + .filter(|l| l.is_remote_layer()) + .inspect(|l| info!("compact requires download of {}", l.filename().file_name())) + .map(|l| { + l.clone() + .downcast_remote_layer() + .expect("just checked it is remote layer") + }) + .collect::>(); + + if !remotes.is_empty() { + // caller is holding the lock to layer_removal_cs, and we don't want to download while + // holding that; in future download_remote_layer might take it as well. this is + // regardless of earlier image creation downloading on-demand, while holding the lock. + return Err(CompactionError::DownloadRequired(remotes)); + } + info!( "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", lsn_range.start, @@ -2773,9 +2904,11 @@ impl Timeline { deltas_to_compact.len(), level0_deltas.len() ); + for l in deltas_to_compact.iter() { info!("compact includes {}", l.filename().file_name()); } + // We don't need the original list of layers anymore. Drop it so that // we don't accidentally use it later in the function. 
drop(level0_deltas); @@ -2945,7 +3078,9 @@ impl Timeline { } fail_point!("delta-layer-writer-fail-before-finish", |_| { - anyhow::bail!("failpoint delta-layer-writer-fail-before-finish"); + return Err( + anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(), + ); }); writer.as_mut().unwrap().put_value(key, lsn, value)?; @@ -2964,7 +3099,7 @@ impl Timeline { // Fsync all the layer files and directory using multiple threads to // minimize latency. - par_fsync::par_fsync(&layer_paths)?; + par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?; layer_paths.pop().unwrap(); } @@ -2986,11 +3121,13 @@ impl Timeline { layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, target_file_size: u64, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - } = self.compact_level0_phase1(target_file_size, ctx).await?; + } = self + .compact_level0_phase1(layer_removal_cs, target_file_size, ctx) + .await?; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do @@ -3014,7 +3151,12 @@ impl Timeline { for l in new_layers { let new_delta_path = l.path(); - let metadata = new_delta_path.metadata()?; + let metadata = new_delta_path.metadata().with_context(|| { + format!( + "read file metadata for new created layer {}", + new_delta_path.display() + ) + })?; if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_upload( @@ -3248,7 +3390,7 @@ impl Timeline { let mut layers_to_remove = Vec::new(); - // Scan all on-disk layers in the timeline. + // Scan all layers in the timeline (remote or on-disk). // // Garbage collect the layer if all conditions are satisfied: // 1. 
it is older than cutoff LSN; diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 3551f27cad..f5f8491ada 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -1,7 +1,10 @@ # It's possible to run any regular test with the local fs remote storage via # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... +import time +from collections import defaultdict from pathlib import Path +from typing import Any, DefaultDict, Dict import pytest from fixtures.log_helper import log @@ -10,6 +13,7 @@ from fixtures.neon_fixtures import ( RemoteStorageKind, assert_tenant_status, available_remote_storages, + wait_for_last_flush_lsn, wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, @@ -449,3 +453,167 @@ def test_download_remote_layers_api( pg_old = env.postgres.create_start(branch_name="main") with pg_old.cursor() as cur: assert query_scalar(cur, "select count(*) from testtab") == table_len + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) +def test_compaction_downloads_on_demand_without_image_creation( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + Create a few layers, then evict, then make sure compaction runs successfully. 
+ """ + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_compaction_downloads_on_demand_without_image_creation", + ) + + env = neon_env_builder.init_start() + + conf = { + # Disable background GC & compaction + "gc_period": "0s", + "compaction_period": "0s", + # unused, because manual will be called after each table + "checkpoint_distance": 100 * 1024**2, + # this will be updated later on to allow manual compaction outside of checkpoints + "compaction_threshold": 100, + # repartitioning parameter, not required here + "image_creation_threshold": 100, + # repartitioning parameter, not required here + "compaction_target_size": 128 * 1024**2, + # pitr_interval and gc_horizon are not interesting because we dont run gc + } + + # Override defaults, to create more layers + tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + with env.postgres.create_start("main") as pg: + # no particular reason to create the layers like this, but we are sure + # not to hit the image_creation_threshold here. 
+ with pg.cursor() as cur: + cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + with pg.cursor() as cur: + cur.execute("update a set id = -id") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" + assert len(layers.historic_layers) == 1 + 2, "should have inidb layer and 2 deltas" + + for layer in layers.historic_layers: + log.info(f"pre-compact: {layer}") + pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + + env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"}) + + pageserver_http.timeline_compact(tenant_id, timeline_id) + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + for layer in layers.historic_layers: + log.info(f"post compact: {layer}") + assert len(layers.historic_layers) == 1, "should have compacted to single layer" + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) +def test_compaction_downloads_on_demand_with_image_creation( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted. + + Due to current implementation, this will make image creation on-demand download layers, but we cannot really + directly test for it. 
+ """ + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_compaction_downloads_on_demand", + ) + + env = neon_env_builder.init_start() + + conf = { + # Disable background GC & compaction + "gc_period": "0s", + "compaction_period": "0s", + # repartitioning threshold is this / 10, but it doesn't really seem to matter + "checkpoint_distance": 50 * 1024**2, + "compaction_threshold": 3, + # important: keep this high for the data ingestion + "image_creation_threshold": 100, + # repartitioning parameter, unused + "compaction_target_size": 128 * 1024**2, + # pitr_interval and gc_horizon are not interesting because we dont run gc + } + + # Override defaults, to create more layers + tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + with env.postgres.create_start("main") as pg: + # no particular reason to create the layers like this, but we are sure + # not to hit the image_creation_threshold here. 
+ with pg.cursor() as cur: + cur.execute("create table a (id bigserial primary key, some_value bigint not null)") + cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + for _ in range(0, 2): + for i in range(0, 3): + # create a minimal amount of "delta difficulty" for this table + with pg.cursor() as cur: + cur.execute("update a set some_value = -some_value + %s", (i,)) + + with pg.cursor() as cur: + # vacuuming should aid to reuse keys, though it's not really important + # with image_creation_threshold=1 which we will use on the last compaction + cur.execute("vacuum") + + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + # images should not yet be created, because threshold is too high, + # but these will be reshuffled to L1 layers + pageserver_http.timeline_compact(tenant_id, timeline_id) + + for _ in range(0, 20): + # loop in case flushing is still in progress + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + if not layers.in_memory_layers: + break + time.sleep(0.2) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" + + kinds_before: DefaultDict[str, int] = defaultdict(int) + + for layer in layers.historic_layers: + kinds_before[layer.kind] += 1 + pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + + assert dict(kinds_before) == {"Delta": 4} + + # now having evicted all layers, reconfigure to have lower image creation + # threshold to expose image creation to downloading all of the needed + # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 + # to be less flaky + env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"}) + + 
pageserver_http.timeline_compact(tenant_id, timeline_id) + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + kinds_after: DefaultDict[str, int] = defaultdict(int) + for layer in layers.historic_layers: + kinds_after[layer.kind] += 1 + + assert dict(kinds_after) == {"Delta": 4, "Image": 1} + + +def stringify(conf: Dict[str, Any]) -> Dict[str, str]: + return dict(map(lambda x: (x[0], str(x[1])), conf.items())) From d9ba3c5f5ef3af4f16e8ffe23096310238b2d8c7 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 16 Feb 2023 16:29:08 +0200 Subject: [PATCH 036/426] Revert "Add debug messages around timeline.get_current_logical_size" This reverts commit a5ce2b5330233927169152253548f822cf6d1643. --- pageserver/src/consumption_metrics.rs | 23 ++++++++++------------- pageserver/src/tenant/timeline.rs | 7 +------ 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index a6d1ec3632..3d2a01effb 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -166,20 +166,17 @@ pub async fn collect_metrics_iteration( match timeline.get_current_logical_size(ctx) { // Only send timeline logical size when it is fully calculated. 
- Ok((size, is_exact)) => { - if is_exact { - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: Some(timeline.timeline_id), - metric: TIMELINE_LOGICAL_SIZE, - }, - size, - )); - } else { - info!("logical_size is not fully calculated for timeline {}, skipping sending value {} ", timeline.timeline_id, size); - } + Ok((size, is_exact)) if is_exact => { + current_metrics.push(( + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: TIMELINE_LOGICAL_SIZE, + }, + size, + )); } + Ok((_, _)) => {} Err(err) => { error!( "failed to get current logical size for timeline {}: {err:?}", diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e606cacf92..66afe2cdce 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -812,15 +812,10 @@ impl Timeline { let mut is_exact = true; let size = current_size.size(); - if let (CurrentLogicalSize::Approximate(approx_size), Some(init_lsn)) = + if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = (current_size, self.current_logical_size.initial_part_end) { is_exact = false; - info!( - "Current size for timeline {} is approximate {}, initial_part_end lsn: {:?}", - self.timeline_id, approx_size, init_lsn - ); - self.try_spawn_size_init_task(init_lsn, ctx); } From 6139e8e426c2fdf7d54d517d2a93bb101ada40c6 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 16 Feb 2023 16:29:19 +0200 Subject: [PATCH 037/426] Revert "Add debug messages around sending cached metrics" This reverts commit a839860c2ea9a0bbce39a43e9849daef698025ee. 
--- pageserver/src/consumption_metrics.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 3d2a01effb..d1383b33cb 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -239,18 +239,6 @@ pub async fn collect_metrics_iteration( Some(val) => val != curr_val, None => true, }); - - info!( - "sending only changed metrics, {} values at {}", - current_metrics.len(), - Utc::now() - ); - } else { - info!( - "sending all metrics, including cached ones. {} values at {}", - current_metrics.len(), - Utc::now() - ); } if current_metrics.is_empty() { From 0d3aefb2749145044b99922fbf33730a0754cd50 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 16 Feb 2023 16:24:47 +0200 Subject: [PATCH 038/426] Only use active timelines in synthetic_size calculation --- pageserver/src/tenant/size.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 2c5efe283b..a41889f16d 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -134,7 +134,7 @@ pub(super) async fn gather_inputs( .context("Failed to refresh gc_info before gathering inputs")?; // Collect information about all the timelines - let timelines = tenant.list_timelines(); + let mut timelines = tenant.list_timelines(); if timelines.is_empty() { // perhaps the tenant has just been created, and as such doesn't have any data yet @@ -144,6 +144,13 @@ pub(super) async fn gather_inputs( }); } + // Filter out timelines that are not active + // + // There may be a race when a timeline is dropped, + // but it is unlikely to cause any issues. In the worst case, + // the calculation will error out. + timelines.retain(|t| t.is_active()); + // Build a map of branch points. 
let mut branchpoints: HashMap> = HashMap::new(); for timeline in timelines.iter() { From d9c518b2cc44da2cda378199e1ce27fafe0e4838 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 7 Feb 2023 19:36:30 +0200 Subject: [PATCH 039/426] Refactor use_cleartext_password_flow. It's not a property of the credentials that we receive from the client, so remove it from ClientCredentials. Instead, pass it as an argument directly to 'authenticate' function, where it's actually used. All the rest of the changes is just plumbing to pass it through the call stack to 'authenticate' --- proxy/src/auth/backend.rs | 14 ++++++++++++-- proxy/src/auth/credentials.rs | 22 +++++++--------------- proxy/src/proxy.rs | 18 ++++++++++++------ 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 50afbd2a27..5cd02df87c 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -114,6 +114,7 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { &'a mut self, extra: &'a ConsoleReqExtra<'a>, client: &'a mut stream::PqStream, + use_cleartext_password_flow: bool, ) -> auth::Result>> { use BackendType::*; @@ -158,7 +159,7 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { (node, payload.password) } // This is a hack to allow cleartext password in secure connections (wss). - Console(api, creds) if creds.use_cleartext_password_flow => { + Console(api, creds) if use_cleartext_password_flow => { let payload = fetch_plaintext_password(client).await?; let node = api.wake_compute(extra, creds).await?; @@ -182,16 +183,25 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { } /// Authenticate the client via the requested backend, possibly using credentials. + /// + /// If `use_cleartext_password_flow` is true, we use the old cleartext password + /// flow. It is used for websocket connections, which want to minimize the number + /// of round trips. 
(Plaintext password authentication requires only one round-trip, + /// where SCRAM requires two.) pub async fn authenticate<'a>( &mut self, extra: &'a ConsoleReqExtra<'a>, client: &'a mut stream::PqStream, + use_cleartext_password_flow: bool, ) -> auth::Result> { use BackendType::*; // Handle cases when `project` is missing in `creds`. // TODO: type safety: return `creds` with irrefutable `project`. - if let Some(res) = self.try_password_hack(extra, client).await? { + if let Some(res) = self + .try_password_hack(extra, client, use_cleartext_password_flow) + .await? + { info!("user successfully authenticated (using the password hack)"); return Ok(res); } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 968104f058..2d2f193bec 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -34,9 +34,6 @@ pub struct ClientCredentials<'a> { pub user: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. pub project: Option>, - /// If `True`, we'll use the old cleartext password flow. This is used for - /// websocket connections, which want to minimize the number of round trips. - pub use_cleartext_password_flow: bool, } impl ClientCredentials<'_> { @@ -51,7 +48,6 @@ impl<'a> ClientCredentials<'a> { params: &'a StartupMessageParams, sni: Option<&str>, common_name: Option<&str>, - use_cleartext_password_flow: bool, ) -> Result { use ClientCredsParseError::*; @@ -99,14 +95,12 @@ impl<'a> ClientCredentials<'a> { info!( user = user, project = project.as_deref(), - use_cleartext_password_flow = use_cleartext_password_flow, "credentials" ); Ok(Self { user, project, - use_cleartext_password_flow, }) } } @@ -131,7 +125,7 @@ mod tests { // According to postgresql, only `user` should be required. 
let options = StartupMessageParams::new([("user", "john_doe")]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -146,7 +140,7 @@ mod tests { ("foo", "bar"), // should be ignored ]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -160,7 +154,7 @@ mod tests { let sni = Some("foo.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name, false)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -174,7 +168,7 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -188,7 +182,7 @@ mod tests { let sni = Some("baz.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name, false)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -203,8 +197,7 @@ mod tests { let sni = Some("second.localhost"); let common_name = Some("localhost"); - let err = - ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -221,8 +214,7 @@ mod tests { let sni = Some("project.localhost"); let common_name = 
Some("example.com"); - let err = - ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); match err { InconsistentSni { sni, cn } => { assert_eq!(sni, "project.localhost"); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index a622a35e6d..c1ed79ecb6 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -127,7 +127,7 @@ pub async fn handle_ws_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name, true)) + .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -135,7 +135,7 @@ pub async fn handle_ws_client( let client = Client::new(stream, creds, ¶ms, session_id); cancel_map - .with_session(|session| client.connect_to_db(session)) + .with_session(|session| client.connect_to_db(session, true)) .await } @@ -165,7 +165,7 @@ async fn handle_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name, false)) + .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -173,7 +173,7 @@ async fn handle_client( let client = Client::new(stream, creds, ¶ms, session_id); cancel_map - .with_session(|session| client.connect_to_db(session)) + .with_session(|session| client.connect_to_db(session, false)) .await } @@ -401,7 +401,11 @@ impl<'a, S> Client<'a, S> { impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. 
- async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> { + async fn connect_to_db( + self, + session: cancellation::Session<'_>, + use_cleartext_password_flow: bool, + ) -> anyhow::Result<()> { let Self { mut stream, mut creds, @@ -416,7 +420,9 @@ impl Client<'_, S> { let auth_result = async { // `&mut stream` doesn't let us merge those 2 lines. - let res = creds.authenticate(&extra, &mut stream).await; + let res = creds + .authenticate(&extra, &mut stream, use_cleartext_password_flow) + .await; async { res }.or_else(|e| stream.throw_error(e)).await } .instrument(info_span!("auth")) From edffe0dd9d811aa4a11e267ceeb82837a56e864b Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 15 Feb 2023 15:17:18 +0300 Subject: [PATCH 040/426] Extract password hack & cleartext hack --- proxy/src/auth/backend.rs | 190 +++++++++++++++--------------- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/link.rs | 2 +- proxy/src/auth/credentials.rs | 19 ++- 4 files changed, 103 insertions(+), 110 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 5cd02df87c..25e288ad2e 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,7 +1,6 @@ mod classic; - mod link; -use futures::TryFutureExt; + pub use link::LinkAuthError; use crate::{ @@ -13,6 +12,7 @@ use crate::{ }, stream, url, }; +use futures::TryFutureExt; use std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -105,107 +105,99 @@ impl<'a, T, E> BackendType<'a, Result> { } } -// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc). -// Read more: https://github.com/rust-lang/rust/issues/99190 -// Alleged fix: https://github.com/rust-lang/rust/pull/89056 -impl<'l> BackendType<'l, ClientCredentials<'_>> { - /// Do something special if user didn't provide the `project` parameter. 
- async fn try_password_hack<'a>( - &'a mut self, - extra: &'a ConsoleReqExtra<'a>, - client: &'a mut stream::PqStream, - use_cleartext_password_flow: bool, - ) -> auth::Result>> { - use BackendType::*; +/// Compared to [SCRAM](crate::scram), cleartext password auth saves +/// one round trip and *expensive* computations (>= 4096 HMAC iterations). +/// These properties are benefical for serverless JS workers, so we +/// use this mechanism for websocket connections. +async fn do_cleartext_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("cleartext auth flow override is enabled, proceeding"); + let password = AuthFlow::new(client) + .begin(auth::CleartextPassword) + .await? + .authenticate() + .await?; - // If there's no project so far, that entails that client doesn't - // support SNI or other means of passing the project name. - // We now expect to see a very specific payload in the place of password. - let fetch_magic_payload = |client| async { - warn!("project name not specified, resorting to the password hack auth flow"); - let payload = AuthFlow::new(client) - .begin(auth::PasswordHack) - .await? - .authenticate() - .await?; + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(password); - info!(project = &payload.project, "received missing parameter"); - auth::Result::Ok(payload) - }; + Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} - // If we want to use cleartext password flow, we can read the password - // from the client and pretend that it's a magic payload (PasswordHack hack). - let fetch_plaintext_password = |client| async { - info!("using cleartext password flow"); - let payload = AuthFlow::new(client) - .begin(auth::CleartextPassword) - .await? - .authenticate() - .await?; +/// Workaround for clients which don't provide an endpoint (project) name. 
+/// Very similar to [`do_cleartext`], but there's a specific password format. +async fn do_password_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("project not specified, resorting to the password hack auth flow"); + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; - auth::Result::Ok(auth::password_hack::PasswordHackPayload { - project: String::new(), - password: payload, - }) - }; + info!(project = &payload.project, "received missing parameter"); + creds.project = Some(payload.project.into()); - // TODO: find a proper way to merge those very similar blocks. - let (mut node, password) = match self { - Console(api, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload(client).await?; - creds.project = Some(payload.project.into()); - let node = api.wake_compute(extra, creds).await?; + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(payload.password); - (node, payload.password) - } - // This is a hack to allow cleartext password in secure connections (wss). - Console(api, creds) if use_cleartext_password_flow => { - let payload = fetch_plaintext_password(client).await?; - let node = api.wake_compute(extra, creds).await?; + Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} - (node, payload.password) - } - Postgres(api, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload(client).await?; - creds.project = Some(payload.project.into()); - let node = api.wake_compute(extra, creds).await?; - - (node, payload.password) - } - _ => return Ok(None), - }; - - node.config.password(password); - Ok(Some(AuthSuccess { - reported_auth_ok: false, - value: node, - })) +/// True to its name, this function encapsulates our current auth trade-offs. +/// Here, we choose the appropriate auth flow based on circumstances. 
+async fn auth_quirks( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, + allow_cleartext: bool, +) -> auth::Result> { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the endpoint (project) name. + // We now expect to see a very specific payload in the place of password. + if creds.project.is_none() { + return do_password_hack(api, extra, creds, client).await; } + // Password hack should set the project name. + // TODO: make `creds.project` more type-safe. + assert!(creds.project.is_some()); + + // Perform cleartext auth if we're allowed to do that. + // Currently, we use it for websocket connections (latency). + if allow_cleartext { + return do_cleartext_hack(api, extra, creds, client).await; + } + + // Finally, proceed with the main auth flow (SCRAM-based). + classic::authenticate(api, extra, creds, client).await +} + +impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. - /// - /// If `use_cleartext_password_flow` is true, we use the old cleartext password - /// flow. It is used for websocket connections, which want to minimize the number - /// of round trips. (Plaintext password authentication requires only one round-trip, - /// where SCRAM requires two.) - pub async fn authenticate<'a>( + pub async fn authenticate( &mut self, - extra: &'a ConsoleReqExtra<'a>, - client: &'a mut stream::PqStream, - use_cleartext_password_flow: bool, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, + allow_cleartext: bool, ) -> auth::Result> { use BackendType::*; - // Handle cases when `project` is missing in `creds`. - // TODO: type safety: return `creds` with irrefutable `project`. - if let Some(res) = self - .try_password_hack(extra, client, use_cleartext_password_flow) - .await? 
- { - info!("user successfully authenticated (using the password hack)"); - return Ok(res); - } - let res = match self { Console(api, creds) => { info!( @@ -214,20 +206,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { "performing authentication using the console" ); - assert!(creds.project.is_some()); - classic::handle_user(api.as_ref(), extra, creds, client).await? + let api = api.as_ref(); + auth_quirks(api, extra, creds, client, allow_cleartext).await? } Postgres(api, creds) => { - info!("performing mock authentication using a local postgres instance"); + info!( + user = creds.user, + project = creds.project(), + "performing authentication using a local postgres instance" + ); - assert!(creds.project.is_some()); - classic::handle_user(api.as_ref(), extra, creds, client).await? + let api = api.as_ref(); + auth_quirks(api, extra, creds, client, allow_cleartext).await? } // NOTE: this auth backend doesn't use client credentials. Link(url) => { info!("performing link authentication"); - link::handle_user(url, client) + link::authenticate(url, client) .await? .map(CachedNodeInfo::new_uncached) } @@ -239,9 +235,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { /// When applicable, wake the compute node, gaining its connection info in the process. /// The link auth flow doesn't support this, so we return [`None`] in that case. 
- pub async fn wake_compute<'a>( + pub async fn wake_compute( &self, - extra: &'a ConsoleReqExtra<'a>, + extra: &ConsoleReqExtra<'_>, ) -> Result, console::errors::WakeComputeError> { use BackendType::*; diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index eefef6e9b4..6753e7ed7f 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -9,7 +9,7 @@ use crate::{ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; -pub(super) async fn handle_user( +pub(super) async fn authenticate( api: &impl console::Api, extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials<'_>, diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 5d0049c957..7175a23dc1 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -53,7 +53,7 @@ pub fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -pub(super) async fn handle_user( +pub(super) async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result> { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 2d2f193bec..c556c33197 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -11,12 +11,16 @@ pub enum ClientCredsParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), - #[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)] + #[error( + "Inconsistent project name inferred from \ + SNI ('{}') and project option ('{}').", + .domain, .option, + )] InconsistentProjectNames { domain: String, option: String }, #[error( "SNI ('{}') inconsistently formatted with respect to common name ('{}'). 
\ - SNI should be formatted as '.{}'.", + SNI should be formatted as '.{}'.", .sni, .cn, .cn, )] InconsistentSni { sni: String, cn: String }, @@ -92,16 +96,9 @@ impl<'a> ClientCredentials<'a> { } .transpose()?; - info!( - user = user, - project = project.as_deref(), - "credentials" - ); + info!(user, project = project.as_deref(), "credentials"); - Ok(Self { - user, - project, - }) + Ok(Self { user, project }) } } From a4d5c8085bba69131790b8ac299a9e78893b2427 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 15 Feb 2023 15:39:21 +0300 Subject: [PATCH 041/426] Move hacks to a dedicated module. --- proxy/src/auth/backend.rs | 64 ++++---------------------------- proxy/src/auth/backend/hacks.rs | 66 +++++++++++++++++++++++++++++++++ proxy/src/proxy.rs | 4 +- 3 files changed, 75 insertions(+), 59 deletions(-) create mode 100644 proxy/src/auth/backend/hacks.rs diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 25e288ad2e..42b2304bb8 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,10 +1,11 @@ mod classic; +mod hacks; mod link; pub use link::LinkAuthError; use crate::{ - auth::{self, AuthFlow, ClientCredentials}, + auth::{self, ClientCredentials}, console::{ self, provider::{CachedNodeInfo, ConsoleReqExtra}, @@ -15,7 +16,7 @@ use crate::{ use futures::TryFutureExt; use std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::info; /// A product of successful authentication. pub struct AuthSuccess { @@ -105,59 +106,6 @@ impl<'a, T, E> BackendType<'a, Result> { } } -/// Compared to [SCRAM](crate::scram), cleartext password auth saves -/// one round trip and *expensive* computations (>= 4096 HMAC iterations). -/// These properties are benefical for serverless JS workers, so we -/// use this mechanism for websocket connections. 
-async fn do_cleartext_hack( - api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, - creds: &mut ClientCredentials<'_>, - client: &mut stream::PqStream, -) -> auth::Result> { - warn!("cleartext auth flow override is enabled, proceeding"); - let password = AuthFlow::new(client) - .begin(auth::CleartextPassword) - .await? - .authenticate() - .await?; - - let mut node = api.wake_compute(extra, creds).await?; - node.config.password(password); - - Ok(AuthSuccess { - reported_auth_ok: false, - value: node, - }) -} - -/// Workaround for clients which don't provide an endpoint (project) name. -/// Very similar to [`do_cleartext`], but there's a specific password format. -async fn do_password_hack( - api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, - creds: &mut ClientCredentials<'_>, - client: &mut stream::PqStream, -) -> auth::Result> { - warn!("project not specified, resorting to the password hack auth flow"); - let payload = AuthFlow::new(client) - .begin(auth::PasswordHack) - .await? - .authenticate() - .await?; - - info!(project = &payload.project, "received missing parameter"); - creds.project = Some(payload.project.into()); - - let mut node = api.wake_compute(extra, creds).await?; - node.config.password(payload.password); - - Ok(AuthSuccess { - reported_auth_ok: false, - value: node, - }) -} - /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. async fn auth_quirks( @@ -171,7 +119,8 @@ async fn auth_quirks( // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. if creds.project.is_none() { - return do_password_hack(api, extra, creds, client).await; + // Password will be checked by the compute node later. + return hacks::password_hack(api, extra, creds, client).await; } // Password hack should set the project name. 
@@ -181,7 +130,8 @@ async fn auth_quirks( // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). if allow_cleartext { - return do_cleartext_hack(api, extra, creds, client).await; + // Password will be checked by the compute node later. + return hacks::cleartext_hack(api, extra, creds, client).await; } // Finally, proceed with the main auth flow (SCRAM-based). diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs new file mode 100644 index 0000000000..f710581cb2 --- /dev/null +++ b/proxy/src/auth/backend/hacks.rs @@ -0,0 +1,66 @@ +use super::AuthSuccess; +use crate::{ + auth::{self, AuthFlow, ClientCredentials}, + console::{ + self, + provider::{CachedNodeInfo, ConsoleReqExtra}, + }, + stream, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, warn}; + +/// Compared to [SCRAM](crate::scram), cleartext password auth saves +/// one round trip and *expensive* computations (>= 4096 HMAC iterations). +/// These properties are benefical for serverless JS workers, so we +/// use this mechanism for websocket connections. +pub async fn cleartext_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("cleartext auth flow override is enabled, proceeding"); + let password = AuthFlow::new(client) + .begin(auth::CleartextPassword) + .await? + .authenticate() + .await?; + + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(password); + + // Report tentative success; compute node will check the password anyway. + Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} + +/// Workaround for clients which don't provide an endpoint (project) name. +/// Very similar to [`cleartext_hack`], but there's a specific password format. 
+pub async fn password_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("project not specified, resorting to the password hack auth flow"); + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; + + info!(project = &payload.project, "received missing parameter"); + creds.project = Some(payload.project.into()); + + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(payload.password); + + // Report tentative success; compute node will check the password anyway. + Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index c1ed79ecb6..0dc48f1212 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -404,7 +404,7 @@ impl Client<'_, S> { async fn connect_to_db( self, session: cancellation::Session<'_>, - use_cleartext_password_flow: bool, + allow_cleartext: bool, ) -> anyhow::Result<()> { let Self { mut stream, @@ -421,7 +421,7 @@ impl Client<'_, S> { let auth_result = async { // `&mut stream` doesn't let us merge those 2 lines. 
let res = creds - .authenticate(&extra, &mut stream, use_cleartext_password_flow) + .authenticate(&extra, &mut stream, allow_cleartext) .await; async { res }.or_else(|e| stream.throw_error(e)).await } From a1b062123ba921d7ef48e3620eada6b9fc0c288b Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 16 Feb 2023 21:28:53 +0100 Subject: [PATCH 042/426] Do not deploy storage to old account (#3630) It's gone --- .github/ansible/production.hosts.yaml | 40 ---------- .../production.neon-storage-broker.yaml | 56 -------------- .github/workflows/deploy-prod.yml | 77 ------------------- 3 files changed, 173 deletions(-) delete mode 100644 .github/ansible/production.hosts.yaml delete mode 100644 .github/helm-values/production.neon-storage-broker.yaml diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml deleted file mode 100644 index ecb847bd61..0000000000 --- a/.github/ansible/production.hosts.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -storage: - vars: - console_mgmt_base_url: http://console-release.local - bucket_name: zenith-storage-oregon - bucket_region: us-west-2 - broker_endpoint: http://storage-broker.prod.local:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events - metric_collection_interval: 10min - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: prod-1/wal - hostname_suffix: ".local" - remote_user: admin - sentry_environment: production - - children: - pageservers: - hosts: - zenith-1-ps-2: - console_region_id: aws-us-west-2 - zenith-1-ps-3: - console_region_id: aws-us-west-2 - zenith-1-ps-4: - console_region_id: aws-us-west-2 - zenith-1-ps-5: - console_region_id: aws-us-west-2 - - safekeepers: - hosts: - zenith-1-sk-1: - console_region_id: aws-us-west-2 - zenith-1-sk-2: - console_region_id: aws-us-west-2 - 
zenith-1-sk-4: - console_region_id: aws-us-west-2 diff --git a/.github/helm-values/production.neon-storage-broker.yaml b/.github/helm-values/production.neon-storage-broker.yaml deleted file mode 100644 index aa64081da3..0000000000 --- a/.github/helm-values/production.neon-storage-broker.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Helm chart values for neon-storage-broker -podLabels: - neon_env: production - neon_service: storage-broker - -# Use L4 LB -service: - # service.annotations -- Annotations to add to the service - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet - # assign service to this name at external-dns - external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local - # service.type -- Service type - type: LoadBalancer - # service.port -- broker listen port - port: 50051 - -ingress: - enabled: false - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-storage-broker.fullname\" . 
}}" - labels: - helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} - app.kubernetes.io/name: neon-storage-broker - app.kubernetes.io/instance: neon-storage-broker - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-storage-broker" - endpoints: - - port: broker - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" - -settings: - sentryEnvironment: "production" diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index f4ce7e9afa..540d187274 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -165,80 +165,3 @@ jobs: - name: Deploy storage-broker run: helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - # Deploy to old account below - - deploy: - runs-on: prod - container: - image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - options: --user root --privileged - if: inputs.deployStorage && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Redeploy - run: | - export DOCKER_TAG=${{ inputs.dockerTag }} - cd "$(pwd)/.github/ansible" - - ./get_binaries.sh - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key 
ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied - - name: Cleanup ansible folder - run: rm -rf ~/.ansible - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, gen3, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - if: inputs.deployStorageBroker && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Store kubeconfig file - run: | - echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Add neon helm chart - run: helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - - name: Cleanup helm folder - run: rm -rf ~/.cache From 526f8b76aa3055635206f1656364957dd506fae9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 17 Feb 2023 08:29:52 +0000 Subject: [PATCH 043/426] Bump werkzeug from 2.1.2 to 2.2.3 (#3631) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Describe your changes ``` $ poetry add werkzeug@latest "moto[server]@latest" Using version ^2.2.3 for werkzeug Using version ^4.1.2 for moto Updating dependencies Resolving dependencies... (1.6s) Writing lock file Package operations: 0 installs, 2 updates, 1 removal • Removing pytz (2022.1) • Updating werkzeug (2.1.2 -> 2.2.3) • Updating moto (3.1.18 -> 4.1.2) ``` Resolves: - https://github.com/neondatabase/neon/security/dependabot/14 - https://github.com/neondatabase/neon/security/dependabot/13 `@dependabot` failed to create a PR for some reason (I guess because it also needed to handle `moto` dependency) ## Issue ticket number and link N/A ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [x] If it is a core feature, I have added thorough tests. - [x] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [x] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- poetry.lock | 78 +++++++++++++++++++++----------------------------- pyproject.toml | 4 +-- 2 files changed, 35 insertions(+), 47 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7e80b1e10a..bc2c56d74c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1238,7 +1238,6 @@ category = "main" optional = false python-versions = "*" files = [ - {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, ] @@ -1309,66 +1308,63 @@ files = [ [[package]] name = "moto" -version = "3.1.18" -description = "A library that allows your python tests to easily mock out the boto library" +version = "4.1.2" +description = "" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, - {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, + {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, + {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, ] [package.dependencies] aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" botocore = ">=1.12.201" -cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""} +cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} -flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""} +flask = 
{version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} -idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -MarkupSafe = "!=2.0.0a1" openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} -pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.9.0" +responses = ">=0.13.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} -werkzeug = ">=0.5,<2.2.0" +werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] apigatewayv2 
= ["PyYAML (>=5.1)"] appsync = ["graphql-core"] awslambda = ["docker (>=2.5.1)"] batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] ds = ["sshpubkeys (>=3.1.0)"] dynamodb = ["docker (>=2.5.1)"] -dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] +eks = ["sshpubkeys (>=3.1.0)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -ssm = ["PyYAML (>=5.1)", "dataclasses"] +server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +ssm = ["PyYAML (>=5.1)"] xray = 
["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] @@ -1716,7 +1712,6 @@ python-versions = ">=3.6" files = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -1750,7 +1745,6 @@ files = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = 
"psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -1762,7 +1756,6 @@ files = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -1795,7 +1788,18 @@ category = "main" optional = false python-versions = "*" files = [ + {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, + {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, + {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, + {file = "pyasn1-0.4.8-py2.7.egg", hash = 
"sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, + {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, + {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, + {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, + {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, + {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, + {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, + {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] @@ -2004,8 +2008,8 @@ files = [ [package.dependencies] pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, + {version = ">=5.0", markers = "python_version < \"3.10\""}, ] [[package]] @@ -2082,18 +2086,6 @@ cryptography = ["cryptography (>=3.4.0)"] pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] -[[package]] -name = "pytz" -version = "2022.1" -description = "World timezone definitions, modern and historical" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, - {file = "pytz-2022.1.tar.gz", hash = 
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, -] - [[package]] name = "pywin32" version = "301" @@ -2129,13 +2121,6 @@ files = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = 
"PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, @@ -2449,16 +2434,19 @@ test = ["websockets"] [[package]] name = "werkzeug" -version = "2.1.2" +version = "2.2.3" description = "The comprehensive WSGI web application library." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"}, - {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"}, + {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"}, + {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"}, ] +[package.dependencies] +MarkupSafe = ">=2.1.1" + [package.extras] watchdog = ["watchdog"] @@ -2655,4 +2643,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3" +content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd" diff --git a/pyproject.toml b/pyproject.toml index d3d3948b9a..415f7f1ae7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,12 +19,12 @@ types-requests = "^2.28.5" types-psycopg2 = "^2.9.18" boto3 = "^1.26.16" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {version = "^3.0.0", extras = ["server"]} +moto = {extras = ["server"], version = "^4.1.2"} backoff = "^1.11.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "2.1.2" +Werkzeug = "^2.2.3" pytest-order = "^1.0.1" allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" From 501702b27c3d74c2d1b0e7f38c01770302abce32 Mon Sep 17 00:00:00 2001 From: Joonas 
Koivunen Date: Fri, 17 Feb 2023 13:34:26 +0200 Subject: [PATCH 044/426] fix: flaky test_compaction_downloads_on_demand_with_image_creation (#3629) fix is to stop postgres before the final checkpoint to ensure no inmemory layer gets created. Fixes #3627. --- test_runner/regress/test_ondemand_download.py | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index f5f8491ada..eab9c41c57 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -555,32 +555,38 @@ def test_compaction_downloads_on_demand_with_image_creation( env.initial_tenant = tenant_id pageserver_http = env.pageserver.http_client() - with env.postgres.create_start("main") as pg: - # no particular reason to create the layers like this, but we are sure - # not to hit the image_creation_threshold here. - with pg.cursor() as cur: - cur.execute("create table a (id bigserial primary key, some_value bigint not null)") - cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pg = env.postgres.create_start("main") - for _ in range(0, 2): - for i in range(0, 3): - # create a minimal amount of "delta difficulty" for this table - with pg.cursor() as cur: - cur.execute("update a set some_value = -some_value + %s", (i,)) + # no particular reason to create the layers like this, but we are sure + # not to hit the image_creation_threshold here. 
+ with pg.cursor() as cur: + cur.execute("create table a (id bigserial primary key, some_value bigint not null)") + cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - with pg.cursor() as cur: - # vacuuming should aid to reuse keys, though it's not really important - # with image_creation_threshold=1 which we will use on the last compaction - cur.execute("vacuum") + for i in range(0, 2): + for j in range(0, 3): + # create a minimal amount of "delta difficulty" for this table + with pg.cursor() as cur: + cur.execute("update a set some_value = -some_value + %s", (j,)) - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + with pg.cursor() as cur: + # vacuuming should aid to reuse keys, though it's not really important + # with image_creation_threshold=1 which we will use on the last compaction + cur.execute("vacuum") - # images should not yet be created, because threshold is too high, - # but these will be reshuffled to L1 layers - pageserver_http.timeline_compact(tenant_id, timeline_id) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + if i == 1 and j == 2: + # last iteration; stop before checkpoint to avoid leaving an inmemory layer + pg.stop_and_destroy() + + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + # images should not yet be created, because threshold is too high, + # but these will be reshuffled to L1 layers + pageserver_http.timeline_compact(tenant_id, timeline_id) for _ in range(0, 20): # loop in case flushing is still in progress From ae3eff1ad2d4fa2865439e2d26de98746803ff1f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 17 Feb 2023 13:56:00 +0200 Subject: [PATCH 045/426] Tracing panic hook (#3475) Fixes #3468. 
This does change how the panics look, and most importantly, makes sure they are not interleaved with other messages. Adds a `GET /v1/panic` endpoint for panic testing (useful for sentry dedup and this hook testing). The panics are now logged within a new error level span called `panic` which separates it from other error level events. The panic info is unpacked into span fields: - thread=mgmt request worker - location="pageserver/src/http/routes.rs:898:9" Co-authored-by: Christian Schwarz --- pageserver/src/bin/pageserver.rs | 54 ++++++++++++++++++++++++++++++-- pageserver/src/http/routes.rs | 12 +++++++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 50eefa8c77..c499fd8d74 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -88,6 +88,13 @@ fn main() -> anyhow::Result<()> { } }; + // Initialize logging, which must be initialized before the custom panic hook is installed. + logging::init(conf.log_format)?; + + // disable the default rust panic hook by using `set_hook`. sentry will install it's own on top + // of this, always processing the panic before we log it. + std::panic::set_hook(Box::new(tracing_panic_hook)); + // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry( Some(GIT_VERSION.into()), @@ -210,9 +217,6 @@ fn start_pageserver( launch_ts: &'static LaunchTimestamp, conf: &'static PageServerConf, ) -> anyhow::Result<()> { - // Initialize logging - logging::init(conf.log_format)?; - // Print version and launch timestamp to the log, // and expose them as prometheus metrics. // A changed version string indicates changed software. @@ -495,6 +499,50 @@ fn cli() -> Command { ) } +/// Named symbol for our panic hook, which logs the panic.
+fn tracing_panic_hook(info: &std::panic::PanicInfo) { + // following rust 1.66.1 std implementation: + // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 + let location = info.location(); + + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = std::thread::current(); + let thread = thread.name().unwrap_or(""); + let backtrace = std::backtrace::Backtrace::capture(); + + struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); + + impl std::fmt::Display for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column()) + } + } + + let _entered = if let Some(location) = location { + tracing::error_span!("panic", %thread, location = %PrettyLocation(location)) + } else { + // very unlikely to hit here, but the guarantees of std could change + tracing::error_span!("panic", %thread) + } + .entered(); + + if backtrace.status() == std::backtrace::BacktraceStatus::Captured { + // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really + // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to + // string, maybe even to a TLS one but tracing already does that. 
+ tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}"); + } else { + tracing::error!("{msg}"); + } +} + #[test] fn verify_cli() { cli().debug_assert(); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7cd7e81fe1..71273159b7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1029,6 +1029,17 @@ async fn active_timeline_of_active_tenant( .map_err(ApiError::NotFound) } +async fn always_panic_handler(req: Request) -> Result, ApiError> { + // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). + // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. + // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. + let query = req.uri().query(); + let _ = std::panic::catch_unwind(|| { + panic!("unconditional panic for testing panic hook integration; request query: {query:?}") + }); + json_response(StatusCode::NO_CONTENT, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1147,5 +1158,6 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", evict_timeline_layer_handler, ) + .get("/v1/panic", always_panic_handler) .any(handler_404)) } From 8e6b27bf7c5a78bd716c343f1113228d927399fb Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 17 Feb 2023 14:15:29 +0200 Subject: [PATCH 046/426] fix: avoid busy loop on replacement failure (#3613) Add an AtomicBool per RemoteLayer, use it to mark together with closed semaphore that remotelayer is unusable until restart or ignore+load. 
https://github.com/neondatabase/neon/issues/3533#issuecomment-1431481554 --- pageserver/src/tenant/layer_map.rs | 6 ++ .../src/tenant/storage_layer/remote_layer.rs | 13 ++++ pageserver/src/tenant/timeline.rs | 43 +++++++++++-- test_runner/fixtures/neon_fixtures.py | 2 + test_runner/regress/test_ondemand_download.py | 62 +++++++++++++++++++ 5 files changed, 121 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index e446e34f4e..8d7d9c6f8f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -154,6 +154,12 @@ where expected: &Arc, new: Arc, ) -> anyhow::Result>> { + fail::fail_point!("layermap-replace-notfound", |_| Ok( + // this is not what happens if an L0 layer was not found a anyhow error but perhaps + // that should be changed. this is good enough to show a replacement failure. + Replacement::NotFound + )); + self.layer_map.replace_historic_noflush(expected, new) } diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 51bb4dcc2a..8465a99339 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -49,6 +49,17 @@ pub struct RemoteLayer { access_stats: LayerAccessStats, pub(crate) ongoing_download: Arc, + + /// Has `LayerMap::replace` failed for this (true) or not (false). + /// + /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`. + /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load) + /// unprocessable, because a LayerMap::replace failed. + /// + /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids + /// a possible fast loop between `Timeline::get_reconstruct_data` and + /// `Timeline::download_remote_layer`, which also logs. 
+ pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool, } impl std::fmt::Debug for RemoteLayer { @@ -207,6 +218,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } @@ -228,6 +240,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 66afe2cdce..683c2cd2d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3624,14 +3624,26 @@ impl Timeline { &self, remote_layer: Arc, ) -> anyhow::Result<()> { + use std::sync::atomic::Ordering::Relaxed; + let permit = match Arc::clone(&remote_layer.ongoing_download) .acquire_owned() .await { Ok(permit) => permit, Err(_closed) => { - info!("download of layer has already finished"); - return Ok(()); + if remote_layer.download_replacement_failure.load(Relaxed) { + // this path will be hit often, in case there are upper retries. however + // hitting this error will prevent a busy loop between get_reconstruct_data and + // download, so an error is prefered. + // + // TODO: we really should poison the timeline, but panicking is not yet + // supported. Related: https://github.com/neondatabase/neon/issues/3621 + anyhow::bail!("an earlier download succeeded but LayerMap::replace failed") + } else { + info!("download of layer has already finished"); + return Ok(()); + } } }; @@ -3667,8 +3679,8 @@ impl Timeline { { use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - match updates.replace_historic(&l, new_layer) { - Ok(Replacement::Replaced { .. 
}) => { /* expected */ } + let failure = match updates.replace_historic(&l, new_layer) { + Ok(Replacement::Replaced { .. }) => false, Ok(Replacement::NotFound) => { // TODO: the downloaded file should probably be removed, otherwise // it will be added to the layermap on next load? we should @@ -3676,6 +3688,7 @@ impl Timeline { // // See: https://github.com/neondatabase/neon/issues/3533 error!("replacing downloaded layer into layermap failed because layer was not found"); + true } Ok(Replacement::RemovalBuffered) => { unreachable!("current implementation does not remove anything") @@ -3694,12 +3707,32 @@ impl Timeline { ?other, "replacing downloaded layer into layermap failed because another layer was found instead of expected" ); + true } Err(e) => { // this is a precondition failure, the layer filename derived // attributes didn't match up, which doesn't seem likely. - error!("replacing downloaded layer into layermap failed: {e:#?}") + error!("replacing downloaded layer into layermap failed: {e:#?}"); + true } + }; + + if failure { + // mark the remote layer permanently failed; the timeline is most + // likely unusable after this. sadly we cannot just poison the layermap + // lock with panic, because that would create an issue with shutdown. + // + // this does not change the retry semantics on failed downloads. + // + // use of Relaxed is valid because closing of the semaphore gives + // happens-before and wakes up any waiters; we write this value before + // and any waiters (or would be waiters) will load it after closing + // semaphore. 
+ // + // See: https://github.com/neondatabase/neon/issues/3533 + remote_layer + .download_replacement_failure + .store(true, Relaxed); } } updates.flush(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0620ad8a35..ca5288fa0a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1288,6 +1288,7 @@ class PageserverHttpClient(requests.Session): timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, + **kwargs, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: @@ -1298,6 +1299,7 @@ class PageserverHttpClient(requests.Session): res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params=params, + **kwargs, ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index eab9c41c57..09657470b6 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -10,6 +10,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + PageserverApiException, RemoteStorageKind, assert_tenant_status, available_remote_storages, @@ -18,6 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, wait_until, + wait_until_tenant_state, ) from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -623,3 +625,63 @@ def test_compaction_downloads_on_demand_with_image_creation( def stringify(conf: Dict[str, Any]) -> Dict[str, str]: return dict(map(lambda x: (x[0], str(x[1])), conf.items())) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ondemand_download_failure_to_replace( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + 
""" + Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking. + + See: https://github.com/neondatabase/neon/issues/3533 + """ + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_failure_to_replace", + ) + + # disable gc and compaction via default tenant config because config is lost while detaching + # so that compaction will not be the one to download the layer but the http handler is + neon_env_builder.pageserver_config_override = ( + """tenant_config={gc_period = "0s", compaction_period = "0s"}""" + ) + + env = neon_env_builder.init_start() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) + + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + # remove layers so that they will be redownloaded + pageserver_http.tenant_detach(tenant_id) + pageserver_http.tenant_attach(tenant_id) + + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) + pageserver_http.configure_failpoints(("layermap-replace-notfound", "return")) + + # requesting details with non-incremental size should trigger a download of the only layer + # this will need to be adjusted if an index for logical sizes is ever implemented + with pytest.raises(PageserverApiException): + # error message is not useful + pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2) + + actual_message = ( + ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found" + ) + assert env.pageserver.log_contains(actual_message) is not None + env.pageserver.allowed_errors.append(actual_message) + + env.pageserver.allowed_errors.append( + ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info" + ) + # this might get to run and attempt 
on-demand, but not always + env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'") + + # if the above returned, then we didn't have a livelock, and all is well From 6f9af0aa8c74adc6f5d39c8645ed0b48f84c7313 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 26 Jan 2023 17:00:58 +0200 Subject: [PATCH 047/426] [proxy] Enable OpenTelemetry tracing. This commit sets up OpenTelemetry tracing and exporter, so that tracing spans can be exported as OpenTelemetry traces as well. All outgoing HTTP requests will be traced. A separate (child) span is created for each outgoing HTTP request, and the tracing context is also propagated to the server in the HTTP headers. If tracing is enabled in the control plane and compute node too, you can now get an end-to-end distributed trace of what happens when a new connection is established, starting from the handshake with the client, creating the 'start_compute' operation in the control plane, starting the compute node, all the way down to fetching the base backup and the availability checks in compute_ctl. 
Co-authored-by: Dmitry Ivanov --- Cargo.lock | 64 ++++++++++++++++++++++++++++++ Cargo.toml | 2 + proxy/Cargo.toml | 5 +++ proxy/src/console/provider.rs | 23 +++++++---- proxy/src/console/provider/neon.rs | 5 +-- proxy/src/http.rs | 41 ++++++++++++++----- proxy/src/logging.rs | 46 +++++++++++++++++++++ proxy/src/main.rs | 21 ++-------- proxy/src/metrics.rs | 17 ++++---- workspace_hack/Cargo.toml | 2 +- 10 files changed, 176 insertions(+), 50 deletions(-) create mode 100644 proxy/src/logging.rs diff --git a/Cargo.lock b/Cargo.lock index 98c4dca09b..d154b4eaea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2110,6 +2110,16 @@ version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +[[package]] +name = "mime_guess" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2884,6 +2894,7 @@ dependencies = [ "md5", "metrics", "once_cell", + "opentelemetry", "parking_lot", "pin-project-lite", "pq_proto", @@ -2892,6 +2903,8 @@ dependencies = [ "rcgen", "regex", "reqwest", + "reqwest-middleware", + "reqwest-tracing", "routerify", "rstest", "rustls", @@ -2909,7 +2922,9 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "utils", "uuid", @@ -3079,6 +3094,7 @@ dependencies = [ "js-sys", "log", "mime", + "mime_guess", "once_cell", "percent-encoding", "pin-project-lite", @@ -3098,6 +3114,36 @@ dependencies = [ "winreg", ] +[[package]] +name = "reqwest-middleware" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894" +dependencies = [ + "anyhow", + "async-trait", + 
"http", + "reqwest", + "serde", + "task-local-extensions", + "thiserror", +] + +[[package]] +name = "reqwest-tracing" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce" +dependencies = [ + "async-trait", + "opentelemetry", + "reqwest", + "reqwest-middleware", + "task-local-extensions", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "ring" version = "0.16.20" @@ -3790,6 +3836,15 @@ dependencies = [ "xattr", ] +[[package]] +name = "task-local-extensions" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4" +dependencies = [ + "tokio", +] + [[package]] name = "tempfile" version = "3.3.0" @@ -4360,6 +4415,15 @@ dependencies = [ "libc", ] +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.10" diff --git a/Cargo.toml b/Cargo.toml index 4e4667f253..99a3f56026 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -76,6 +76,8 @@ prost = "0.11" rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } +reqwest-middleware = "0.2.0" routerify = "3" rpds = "0.12.0" rustls = "0.20" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 152c83e4a0..96a62d2c49 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -28,6 +28,7 @@ itertools.workspace = true md5.workspace = true metrics.workspace = true once_cell.workspace = true +opentelemetry.workspace = true parking_lot.workspace = true pin-project-lite.workspace = true pq_proto.workspace = true @@ -35,6 +36,8 @@ prometheus.workspace = true 
rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } +reqwest-middleware.workspace = true +reqwest-tracing.workspace = true routerify.workspace = true rustls-pemfile.workspace = true rustls.workspace = true @@ -49,7 +52,9 @@ tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true tokio.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true tracing.workspace = true url.workspace = true utils.workspace = true diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7621aba19b..80cd94d483 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -11,8 +11,10 @@ use async_trait::async_trait; use std::sync::Arc; pub mod errors { - use crate::error::{io_error, UserFacingError}; - use reqwest::StatusCode as HttpStatusCode; + use crate::{ + error::{io_error, UserFacingError}, + http, + }; use thiserror::Error; /// A go-to error message which doesn't leak any detail. @@ -24,7 +26,7 @@ pub mod errors { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {}: {}", .status, .text)] Console { - status: HttpStatusCode, + status: http::StatusCode, text: Box, }, @@ -35,7 +37,7 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn http_status_code(&self) -> Option { + pub fn http_status_code(&self) -> Option { use ApiError::*; match self { Console { status, .. } => Some(*status), @@ -51,15 +53,15 @@ pub mod errors { // To minimize risks, only select errors are forwarded to users. // Ask @neondatabase/control-plane for review before adding more. Console { status, .. } => match *status { - HttpStatusCode::NOT_FOUND => { + http::StatusCode::NOT_FOUND => { // Status 404: failed to get a project-related resource. 
format!("{REQUEST_FAILED}: endpoint cannot be found") } - HttpStatusCode::NOT_ACCEPTABLE => { + http::StatusCode::NOT_ACCEPTABLE => { // Status 406: endpoint is disabled (we don't allow connections). format!("{REQUEST_FAILED}: endpoint is disabled") } - HttpStatusCode::LOCKED => { + http::StatusCode::LOCKED => { // Status 423: project might be in maintenance mode (or bad state). format!("{REQUEST_FAILED}: endpoint is temporary unavailable") } @@ -70,13 +72,18 @@ pub mod errors { } } - // Helps eliminate graceless `.map_err` calls without introducing another ctor. impl From for ApiError { fn from(e: reqwest::Error) -> Self { io_error(e).into() } } + impl From for ApiError { + fn from(e: reqwest_middleware::Error) -> Self { + io_error(e).into() + } + } + #[derive(Debug, Error)] pub enum GetAuthInfoError { // We shouldn't include the actual secret here. diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 4eca025d2d..3644db17f7 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,7 +8,6 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; -use reqwest::StatusCode as HttpStatusCode; use tracing::{error, info, info_span, warn, Instrument}; #[derive(Clone)] @@ -52,7 +51,7 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(HttpStatusCode::NOT_FOUND) => return Ok(None), + Some(http::StatusCode::NOT_FOUND) => return Ok(None), _otherwise => return Err(e.into()), }, }; @@ -154,7 +153,7 @@ impl super::Api for Api { /// Parse http response body, taking status code into account. 
async fn parse_body serde::Deserialize<'a>>( - response: reqwest::Response, + response: http::Response, ) -> Result { let status = response.status(); if status.is_success() { diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e847edc8bd..a544157800 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,7 +1,24 @@ +//! HTTP client and server impls. +//! Other modules should use stuff from this module instead of +//! directly relying on deps like `reqwest` (think loose coupling). + pub mod server; pub mod websocket; +pub use reqwest::{Request, Response, StatusCode}; +pub use reqwest_middleware::{ClientWithMiddleware, Error}; + use crate::url::ApiUrl; +use reqwest_middleware::RequestBuilder; + +/// This is the preferred way to create new http clients, +/// because it takes care of observability (OpenTelemetry). +/// We deliberately don't want to replace this with a public static. +pub fn new_client() -> ClientWithMiddleware { + reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build() +} /// Thin convenience wrapper for an API provided by an http endpoint. #[derive(Debug, Clone)] @@ -9,13 +26,17 @@ pub struct Endpoint { /// API's base URL. endpoint: ApiUrl, /// Connection manager with built-in pooling. - client: reqwest::Client, + client: ClientWithMiddleware, } impl Endpoint { /// Construct a new HTTP endpoint wrapper. - pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { - Self { endpoint, client } + /// Http client is not constructed under the hood so that it can be shared. + pub fn new(endpoint: ApiUrl, client: impl Into) -> Self { + Self { + endpoint, + client: client.into(), + } } #[inline(always)] @@ -23,19 +44,16 @@ impl Endpoint { &self.endpoint } - /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. 
- pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + pub fn get(&self, path: &str) -> RequestBuilder { let mut url = self.endpoint.clone(); url.path_segments_mut().push(path); self.client.get(url.into_inner()) } /// Execute a [request](reqwest::Request). - pub async fn execute( - &self, - request: reqwest::Request, - ) -> Result { + pub async fn execute(&self, request: Request) -> Result { self.client.execute(request).await } } @@ -43,11 +61,12 @@ impl Endpoint { #[cfg(test)] mod tests { use super::*; + use reqwest::Client; #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; - let endpoint = Endpoint::new(url, reqwest::Client::new()); + let endpoint = Endpoint::new(url, Client::new()); // Validate that this pattern makes sense. let req = endpoint @@ -66,7 +85,7 @@ mod tests { #[test] fn uuid_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; - let endpoint = Endpoint::new(url, reqwest::Client::new()); + let endpoint = Endpoint::new(url, Client::new()); let req = endpoint .get("frobnicate") diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs new file mode 100644 index 0000000000..2baf824fc3 --- /dev/null +++ b/proxy/src/logging.rs @@ -0,0 +1,46 @@ +use tracing_opentelemetry::OpenTelemetryLayer; +use tracing_subscriber::{ + filter::{EnvFilter, LevelFilter}, + prelude::*, +}; + +/// Initialize logging and OpenTelemetry tracing and exporter. +/// +/// Logging can be configured using `RUST_LOG` environment variable. +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the +/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
+/// See +pub async fn init() -> anyhow::Result { + let env_filter = EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy(); + + let fmt_layer = tracing_subscriber::fmt::layer() + .with_ansi(atty::is(atty::Stream::Stderr)) + .with_writer(std::io::stderr) + .with_target(false); + + let otlp_layer = tracing_utils::init_tracing("proxy") + .await + .map(OpenTelemetryLayer::new); + + tracing_subscriber::registry() + .with(env_filter) + .with(otlp_layer) + .with(fmt_layer) + .try_init()?; + + Ok(LoggingGuard) +} + +pub struct LoggingGuard; + +impl Drop for LoggingGuard { + fn drop(&mut self) { + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing_utils::shutdown_tracing(); + } +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 8812f77b62..54f49b5a3c 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -12,6 +12,7 @@ mod config; mod console; mod error; mod http; +mod logging; mod metrics; mod parse; mod proxy; @@ -41,8 +42,7 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { - // First, initialize logging and troubleshooting subsystems. - init_tracing(); + let _logging_guard = logging::init().await?; let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("Version: {GIT_VERSION}"); @@ -107,21 +107,6 @@ async fn main() -> anyhow::Result<()> { Ok(()) } -/// Tracing is used for logging and telemetry. -fn init_tracing() { - tracing_subscriber::fmt() - .with_env_filter({ - // This filter will examine the `RUST_LOG` env variable. - use tracing_subscriber::filter::{EnvFilter, LevelFilter}; - EnvFilter::builder() - .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy() - }) - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_target(false) - .init(); -} - /// ProxyConfig is created at proxy startup, and lives forever. 
fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> { let tls_config = match ( @@ -161,7 +146,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> })); let url = args.get_one::("auth-endpoint").unwrap().parse()?; - let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + let endpoint = http::Endpoint::new(url, http::new_client()); let api = console::provider::neon::Api::new(endpoint, caches); auth::BackendType::Console(Cow::Owned(api), ()) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index d9aa4aec8c..83b28288ee 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,12 +1,11 @@ -//! //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. -//! +use crate::http; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; use std::{collections::HashMap, time::Duration}; -use tracing::{debug, error, log::info, trace}; +use tracing::{debug, error, info, instrument, trace}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -40,15 +39,14 @@ pub async fn collect_metrics( metric_collection_endpoint ); - // define client here to reuse it for all requests - let client = reqwest::Client::new(); + let http_client = http::new_client(); let mut cached_metrics: HashMap)> = HashMap::new(); loop { tokio::select! 
{ _ = ticker.tick() => { - match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await + match collect_metrics_iteration(&http_client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await { Err(e) => { error!("Failed to send consumption metrics: {} ", e); @@ -60,7 +58,7 @@ pub async fn collect_metrics( } } -pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { +fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { let mut current_metrics: Vec<(Ids, (u64, DateTime))> = Vec::new(); let metrics = prometheus::default_registry().gather(); @@ -99,8 +97,9 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { current_metrics } -pub async fn collect_metrics_iteration( - client: &reqwest::Client, +#[instrument(skip_all)] +async fn collect_metrics_iteration( + client: &http::ClientWithMiddleware, cached_metrics: &mut HashMap)>, metric_collection_endpoint: &reqwest::Url, hostname: String, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 30a6d3a92b..68138b3df4 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -38,7 +38,7 @@ prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] } ring = { version = "0.16", features = ["std"] } rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } From 956b6f17ca35f002d1dcb74a7c803db798d43c94 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 17 Feb 2023 13:16:30 +0300 Subject: [PATCH 048/426] [proxy] Handle some unix signals. 
On the surface, this doesn't add much, but there are some benefits: * We can do graceful shutdowns and thus record more code coverage data. * We now have a foundation for the more interesting behaviors, e.g. "stop accepting new connections after SIGTERM but keep serving the existing ones". * We give the otel machinery a chance to flush trace events before finally shutting down. --- proxy/Cargo.toml | 2 +- proxy/src/config.rs | 1 + proxy/src/console/mgmt.rs | 20 +++++---- proxy/src/http/websocket.rs | 2 +- proxy/src/logging.rs | 1 + proxy/src/main.rs | 58 ++++++++++++++++++--------- proxy/src/metrics.rs | 48 ++++++++++------------ test_runner/fixtures/neon_fixtures.py | 10 +++-- workspace_hack/Cargo.toml | 2 +- 9 files changed, 79 insertions(+), 65 deletions(-) diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 96a62d2c49..030a5f1d6e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -51,7 +51,7 @@ thiserror.workspace = true tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["signal"] } tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5e285f3625..600db7f8ec 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -8,6 +8,7 @@ pub struct ProxyConfig { pub metric_collection: Option, } +#[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 51a117d3b7..c00c06fbb7 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -5,10 +5,7 @@ use crate::{ use anyhow::Context; use once_cell::sync::Lazy; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{ - net::{TcpListener, TcpStream}, - thread, -}; +use std::{net::TcpStream, thread}; use tracing::{error, info, info_span}; use utils::{ 
postgres_backend::{self, AuthType, PostgresBackend}, @@ -34,23 +31,24 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N CPLANE_WAITERS.notify(psql_session_id, msg) } -/// Console management API listener thread. +/// Console management API listener task. /// It spawns console response handlers needed for the link auth. -pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { +pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> { scopeguard::defer! { info!("mgmt has shut down"); } - listener - .set_nonblocking(false) - .context("failed to set listener to blocking")?; - loop { - let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?; + let (socket, peer_addr) = listener.accept().await?; info!("accepted connection from {peer_addr}"); + + let socket = socket.into_std()?; socket .set_nodelay(true) .context("failed to set client socket option")?; + socket + .set_nonblocking(false) + .context("failed to set client socket option")?; // TODO: replace with async tasks. thread::spawn(move || { diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index d4235c2c38..1757652a90 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -186,8 +186,8 @@ async fn ws_handler( } pub async fn task_main( - ws_listener: TcpListener, config: &'static ProxyConfig, + ws_listener: TcpListener, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 2baf824fc3..0c8c2858b9 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -41,6 +41,7 @@ impl Drop for LoggingGuard { fn drop(&mut self) { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. 
+ tracing::info!("shutting down the tracing machinery"); tracing_utils::shutdown_tracing(); } } diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 54f49b5a3c..c319cb9cfc 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -28,7 +28,7 @@ use config::ProxyConfig; use futures::FutureExt; use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; -use tracing::{info, info_span, Instrument}; +use tracing::{info, warn}; use utils::{project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -60,16 +60,17 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = args.get_one::("mgmt").unwrap().parse()?; info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?; + let mgmt_listener = TcpListener::bind(mgmt_address).await?; let proxy_address: SocketAddr = args.get_one::("proxy").unwrap().parse()?; info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; let mut tasks = vec![ + tokio::spawn(handle_signals()), tokio::spawn(http::server::task_main(http_listener)), tokio::spawn(proxy::task_main(config, proxy_listener)), - tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)), + tokio::spawn(console::mgmt::task_main(mgmt_listener)), ]; if let Some(wss_address) = args.get_one::("wss") { @@ -78,35 +79,52 @@ async fn main() -> anyhow::Result<()> { let wss_listener = TcpListener::bind(wss_address).await?; tasks.push(tokio::spawn(http::websocket::task_main( - wss_listener, config, + wss_listener, ))); } - // TODO: refactor. - if let Some(metric_collection) = &config.metric_collection { - let hostname = hostname::get()? 
- .into_string() - .map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?; - - tasks.push(tokio::spawn( - metrics::collect_metrics( - &metric_collection.endpoint, - metric_collection.interval, - hostname, - ) - .instrument(info_span!("collect_metrics")), - )); + if let Some(metrics_config) = &config.metric_collection { + tasks.push(tokio::spawn(metrics::task_main(metrics_config))); } - // This will block until all tasks have completed. - // Furthermore, the first one to fail will cancel the rest. + // This combinator will block until either all tasks complete or + // one of them finishes with an error (others will be cancelled). let tasks = tasks.into_iter().map(flatten_err); let _: Vec<()> = futures::future::try_join_all(tasks).await?; Ok(()) } +/// Handle unix signals appropriately. +async fn handle_signals() -> anyhow::Result<()> { + use tokio::signal::unix::{signal, SignalKind}; + + let mut hangup = signal(SignalKind::hangup())?; + let mut interrupt = signal(SignalKind::interrupt())?; + let mut terminate = signal(SignalKind::terminate())?; + + loop { + tokio::select! { + // Hangup is commonly used for config reload. + _ = hangup.recv() => { + warn!("received SIGHUP; config reload is not supported"); + } + // Shut down the whole application. + _ = interrupt.recv() => { + warn!("received SIGINT, exiting immediately"); + bail!("interrupted"); + } + // TODO: Don't accept new proxy connections. + // TODO: Shut down once all exisiting connections have been closed. + _ = terminate.recv() => { + warn!("received SIGTERM, exiting immediately"); + bail!("terminated"); + } + } + } +} + /// ProxyConfig is created at proxy startup, and lives forever. fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> { let tls_config = match ( diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 83b28288ee..8bbae9638b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,10 +1,10 @@ //! 
Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. -use crate::http; +use crate::{config::MetricCollectionConfig, http}; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; use tracing::{debug, error, info, instrument, trace}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -23,37 +23,31 @@ pub struct Ids { pub endpoint_id: String, } -pub async fn collect_metrics( - metric_collection_endpoint: &reqwest::Url, - metric_collection_interval: Duration, - hostname: String, -) -> anyhow::Result<()> { +pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> { + info!("metrics collector config: {config:?}"); scopeguard::defer! { - info!("collect_metrics has shut down"); + info!("metrics collector has shut down"); } - let mut ticker = tokio::time::interval(metric_collection_interval); - - info!( - "starting collect_metrics. metric_collection_endpoint: {}", - metric_collection_endpoint - ); - let http_client = http::new_client(); let mut cached_metrics: HashMap)> = HashMap::new(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + let mut ticker = tokio::time::interval(config.interval); loop { - tokio::select! 
{ - _ = ticker.tick() => { + ticker.tick().await; - match collect_metrics_iteration(&http_client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await - { - Err(e) => { - error!("Failed to send consumption metrics: {} ", e); - }, - Ok(_) => { trace!("collect_metrics_iteration completed successfully") }, - } - } + let res = collect_metrics_iteration( + &http_client, + &mut cached_metrics, + &config.endpoint, + &hostname, + ) + .await; + + match res { + Err(e) => error!("failed to send consumption metrics: {e} "), + Ok(_) => trace!("periodic metrics collection completed successfully"), } } } @@ -102,7 +96,7 @@ async fn collect_metrics_iteration( client: &http::ClientWithMiddleware, cached_metrics: &mut HashMap)>, metric_collection_endpoint: &reqwest::Url, - hostname: String, + hostname: &str, ) -> anyhow::Result<()> { info!( "starting collect_metrics_iteration. metric_collection_endpoint: {}", @@ -133,7 +127,7 @@ async fn collect_metrics_iteration( stop_time: *curr_time, }, metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname.clone()), + idempotency_key: idempotency_key(hostname.to_owned()), value, extra: Ids { endpoint_id: curr_key.endpoint_id.clone(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ca5288fa0a..59d616eb6f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2529,15 +2529,17 @@ class NeonProxy(PgProtocol): tb: Optional[TracebackType], ): if self._popen is not None: - # NOTE the process will die when we're done with tests anyway, because - # it's a child process. This is mostly to clean up in between different tests. 
- self._popen.kill() + self._popen.terminate() + try: + self._popen.wait(timeout=5) + except subprocess.TimeoutExpired: + log.warn("failed to gracefully terminate proxy; killing") + self._popen.kill() @staticmethod async def activate_link_auth( local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True ): - pg_user = "proxy" if create_user: diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 68138b3df4..c0cf3c5611 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -45,7 +45,7 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } From d90cd36bcc014fef63e35964a42ffc4ed3e18424 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 17 Feb 2023 13:32:45 +0300 Subject: [PATCH 049/426] [proxy] Improve tracing spans here and there. --- proxy/src/auth/backend.rs | 1 + proxy/src/proxy.rs | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 42b2304bb8..b8599adaeb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -140,6 +140,7 @@ async fn auth_quirks( impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. 
+ #[tracing::instrument(fields(allow_cleartext), skip_all)] pub async fn authenticate( &mut self, extra: &ConsoleReqExtra<'_>, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 0dc48f1212..9642047812 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,7 +17,7 @@ use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{error, info, warn}; /// Number of times we should retry the `/proxy_wake_compute` http request. const NUM_RETRIES_WAKE_COMPUTE: usize = 1; @@ -91,13 +91,13 @@ pub async fn task_main( .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. error!("per-client task finished with an error: {e:#}"); - }) - .instrument(info_span!("client", session = format_args!("{session_id}"))), + }), ); } } // TODO(tech debt): unite this with its twin below. +#[tracing::instrument(fields(session_id), skip_all)] pub async fn handle_ws_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -139,6 +139,7 @@ pub async fn handle_ws_client( .await } +#[tracing::instrument(fields(session_id), skip_all)] async fn handle_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -423,9 +424,9 @@ impl Client<'_, S> { let res = creds .authenticate(&extra, &mut stream, allow_cleartext) .await; + async { res }.or_else(|e| stream.throw_error(e)).await } - .instrument(info_span!("auth")) .await?; let AuthSuccess { From b242b0ad6787f23ee008527fdf690ccc2012694f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 17 Feb 2023 15:56:56 +0200 Subject: [PATCH 050/426] Fix flaky tests (#3616) ## Describe your changes test_on_demand_download is flaky because not waiting until created image layer is transferred to S3. test_tenants_with_remote_storage just leaves garbage at the end of overwritten file. 
Right solution for test_on_demand_download is to add some API call to wait completion of synchronization with S3 (not just based on last record LSN). But right now it is solved using sleep. ## Issue ticket number and link #3209 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- test_runner/regress/test_ondemand_download.py | 3 +++ test_runner/regress/test_tenants_with_remote_storage.py | 1 + 2 files changed, 4 insertions(+) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 09657470b6..5ee94de32d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -218,6 +218,9 @@ def test_ondemand_download_timetravel( log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" + # Wait until generated image layers are uploaded to S3 + time.sleep(3) + env.pageserver.stop() # remove all the layer files diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6da6a4d446..769bc10280 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -280,6 +280,7 @@ def test_tenant_upgrades_index_json_from_v0( timeline_file.seek(0) json.dump(v0_index_part, timeline_file) + timeline_file.truncate(timeline_file.tell()) env.pageserver.start() pageserver_http = env.pageserver.http_client() From 40799d8ae760e188b9e0770ee94075476bdfeb21 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 17 Feb 2023 16:49:40 +0200 Subject: [PATCH 051/426] Add debug messages 
to catch abnormal consumption metric values --- pageserver/src/consumption_metrics.rs | 8 +++++++- proxy/src/metrics.rs | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index d1383b33cb..8916d4f1c9 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size"; const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size"; #[serde_as] -#[derive(Serialize)] +#[derive(Serialize, Debug)] struct Ids { #[serde_as(as = "DisplayFromStr")] tenant_id: TenantId, @@ -287,6 +287,12 @@ pub async fn collect_metrics_iteration( } } else { error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk_to_send.iter() { + // Report if the metric value is suspiciously large + if metric.value > (1u64 << 40) { + error!("potentially abnormal metric value: {:?}", metric); + } + } } } Err(err) => { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 8bbae9638b..3b28346872 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -18,7 +18,7 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. 
/// -#[derive(Eq, Hash, PartialEq, Serialize)] +#[derive(Eq, Hash, PartialEq, Serialize, Debug)] pub struct Ids { pub endpoint_id: String, } @@ -183,6 +183,12 @@ async fn collect_metrics_iteration( } } else { error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk.iter() { + // Report if the metric value is suspiciously large + if metric.value > (1u64 << 40) { + error!("potentially abnormal metric value: {:?}", metric); + } + } } } Ok(()) From 53128d56d9b3692873416e5d618091508d56e504 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 17 Feb 2023 16:57:37 +0200 Subject: [PATCH 052/426] Fix make clean: Use correct paths in neon-pg-ext-clean --- Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 92a4532684..e04a82c7c9 100644 --- a/Makefile +++ b/Makefile @@ -136,9 +136,15 @@ neon-pg-ext-%: postgres-% .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ From 8d28a24b26aeb5d7dc4ed3f13930971b6b381e49 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 17 Feb 2023 17:32:01 +0100 Subject: [PATCH 
053/426] staging: enable automatic layer eviction at 20m threshold + period (#3636) What it says on the tin. Part of #2476 --- .github/ansible/staging.eu-west-1.hosts.yaml | 5 +++++ .github/ansible/staging.us-east-2.hosts.yaml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index f28dc8e07b..d19026b3f7 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,6 +8,11 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events metric_collection_interval: 10min + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "20m" + threshold: "20m" remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 9a1a095282..a8b9d41a65 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,6 +8,11 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events metric_collection_interval: 10min + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "20m" + threshold: "20m" remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" From 564fa11244ef23889f1402af64c8cb7745153200 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 17 Feb 2023 18:18:23 +0000 Subject: [PATCH 054/426] Update Postgres extensions (#3615) - Update postgis from 3.3.1 from 3.3.2 - Update plv8 from 3.1.4 to 3.1.5 - Update h3-pg from 4.0.1 to 4.1.2 (and underlying h3 from 4.0.1 to 4.1.0) --- Dockerfile.compute-node | 62 +++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/Dockerfile.compute-node 
b/Dockerfile.compute-node index f4479c46cb..95d6237ecf 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -50,15 +50,15 @@ RUN apt update && \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \ - tar zxvf SFCGAL-v1.3.10.tar.gz && \ - cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ +# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 +RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ + mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ + cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ + mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . 
&& \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ @@ -83,29 +83,14 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils + apt install -y ninja-build python3-dev libncurses5 binutils clang -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. -RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \ + mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ + find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -126,20 +111,17 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2 && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ + mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ + mkdir build && cd build && \ cmake .. -DCMAKE_BUILD_TYPE=Release && \ make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/h3 make install && \ cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \ + mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -155,9 +137,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 FROM build-deps AS unit-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ - tar xvzf 7.7.tar.gz && \ - cd postgresql-unit-7.7 && \ +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. @@ -176,8 +157,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz & FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \ - cd pgvector && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \ + mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control @@ -191,8 +172,9 @@ RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \ FROM build-deps AS pgjwt-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN git clone https://github.com/michelp/pgjwt.git && \ - cd pgjwt && \ +# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 +RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ + mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control From 2153d2e00a917b675b454ae8d222e2946900e4d1 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Fri, 17 Feb 2023 14:14:41 -0800 Subject: [PATCH 055/426] Run compute_ctl in a cgroup in VMs (#3577) --- .dockerignore | 1 + .github/workflows/build_and_test.yml | 25 ++++++-------- Dockerfile.vm-compute-node | 32 ++++++++++++++++++ compute_tools/src/bin/compute_ctl.rs | 3 -- compute_tools/src/informant.rs | 50 ---------------------------- compute_tools/src/lib.rs | 1 - vm-cgconfig.conf | 12 +++++++ 7 files changed, 56 insertions(+), 68 deletions(-) create mode 100644 Dockerfile.vm-compute-node delete mode 100644 compute_tools/src/informant.rs create mode 100644 vm-cgconfig.conf diff --git a/.dockerignore b/.dockerignore index d256b21af1..a6e11805e9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,3 +21,4 @@ !workspace_hack/ !neon_local/ !scripts/ninstall.sh +!vm-cgconfig.conf diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 27b7f54856..d16d221cc4 100644 --- 
a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -611,34 +611,31 @@ jobs: run: shell: sh -eu {0} env: - VM_INFORMANT_VERSION: 0.1.1 + VM_BUILDER_VERSION: v0.4.6 steps: - - name: Downloading latest vm-builder + - name: Checkout + uses: actions/checkout@v1 + with: + fetch-depth: 0 + + - name: Downloading vm-builder run: | - curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder + curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - name: Pulling compute-node image run: | docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - - name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }} + - name: Building VM compute-node rootfs run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant - chmod +x vm-informant - - - name: Adding VM informant to compute-node image - run: | - ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}) - docker cp vm-informant $ID:/bin/vm-informant - docker commit $ID temp-vm-compute-node - docker rm -f $ID + docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node . 
- name: Build vm image run: | # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images - ./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - name: Pushing vm-compute-node image run: | diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node new file mode 100644 index 0000000000..af3bfb3590 --- /dev/null +++ b/Dockerfile.vm-compute-node @@ -0,0 +1,32 @@ +# Note: this file *mostly* just builds on Dockerfile.compute-node + +ARG SRC_IMAGE +ARG VM_INFORMANT_VERSION=v0.1.6 + +# Pull VM informant and set up inittab +FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant + +RUN set -e \ + && rm -f /etc/inittab \ + && touch /etc/inittab + +ADD vm-cgconfig.conf /etc/cgconfig.conf +RUN set -e \ + && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ + && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab + +# Combine, starting from non-VM compute node image. 
+FROM $SRC_IMAGE as base + +# Temporarily set user back to root so we can run apt update and adduser +USER root +RUN apt update && \ + apt install --no-install-recommends -y \ + cgroup-tools +RUN adduser vm-informant --disabled-password --no-create-home +USER postgres + +COPY --from=informant /etc/inittab /etc/inittab +COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant + +ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"] diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2c42662020..49cf1cd347 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -44,7 +44,6 @@ use tracing::{error, info}; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; -use compute_tools::informant::spawn_vm_informant_if_present; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -141,8 +140,6 @@ fn main() -> Result<()> { // requests, while configuration is still in progress. 
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); - // Also spawn the thread responsible for handling the VM informant -- if it's present - let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant"); // Start Postgres let mut delay_exit = false; diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs deleted file mode 100644 index 8a6e3ab43a..0000000000 --- a/compute_tools/src/informant.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::path::Path; -use std::process; -use std::thread; -use std::time::Duration; -use tracing::{info, warn}; - -use anyhow::{Context, Result}; - -const VM_INFORMANT_PATH: &str = "/bin/vm-informant"; -const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000; - -/// Launch a thread to start the VM informant if it's present (and restart, on failure) -pub fn spawn_vm_informant_if_present() -> Result>> { - let exists = Path::new(VM_INFORMANT_PATH) - .try_exists() - .context("could not check if path exists")?; - - if !exists { - return Ok(None); - } - - Ok(Some( - thread::Builder::new() - .name("run-vm-informant".into()) - .spawn(move || run_informant())?, - )) -} - -fn run_informant() -> ! { - let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS); - - info!("starting VM informant"); - - loop { - let mut cmd = process::Command::new(VM_INFORMANT_PATH); - // Block on subprocess: - let result = cmd.status(); - - match result { - Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"), - Ok(status) if !status.success() => { - warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying") - } - Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). 
Retrying"), - } - - // Wait before retrying - thread::sleep(restart_wait); - } -} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a71b92f91a..aee6b53e6a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -8,7 +8,6 @@ pub mod http; #[macro_use] pub mod logger; pub mod compute; -pub mod informant; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/vm-cgconfig.conf b/vm-cgconfig.conf new file mode 100644 index 0000000000..a2e201708e --- /dev/null +++ b/vm-cgconfig.conf @@ -0,0 +1,12 @@ +# Configuration for cgroups in VM compute nodes +group neon-postgres { + perm { + admin { + uid = vm-informant; + } + task { + gid = users; + } + } + memory {} +} From af210c8b42da3b57f4750d41cba0e7b4047aa92a Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 20 Feb 2023 13:23:13 +0200 Subject: [PATCH 056/426] Allow running do_gc in non testing env (#3639) ## Describe your changes Since the current default gc period is set to 1 hour, whenever there is an immediate need to reduce PITR and run gc, the user has to wait 1 hour for PITR change to take effect By enabling this API the user can configure PITR and immediately call the do_gc API to trigger gc ## Issue ticket number and link #3590 ## Checklist before requesting a review - [X] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- pageserver/src/http/routes.rs | 7 +++---- pageserver/src/tenant/mgr.rs | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 71273159b7..d2d9f24efb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,7 +14,7 @@ use utils::http::request::{get_request_param, must_get_query_param, parse_query_ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineInfo, + TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -40,7 +40,7 @@ use utils::{ // Imports only used for testing APIs #[cfg(feature = "testing")] -use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +use super::models::ConfigureFailpointsRequest; struct State { conf: &'static PageServerConf, @@ -925,7 +925,6 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. 
-#[cfg(feature = "testing")] async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -1124,7 +1123,7 @@ pub fn make_router( ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", - testing_api!("run timeline GC", timeline_gc_handler), + timeline_gc_handler, ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a74dfdea04..a44cb02b4d 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -540,13 +540,11 @@ where } } -#[cfg(feature = "testing")] use { crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, utils::http::error::ApiError, }; -#[cfg(feature = "testing")] pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, From 8f557477c612a181a22075946e89e180879d1d35 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 20 Feb 2023 17:51:27 +0300 Subject: [PATCH 057/426] Add new safekeeper to ap-southeast-1 prod (#3645) --- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 7c6d1db6d7..71fced23c2 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -36,3 +36,5 @@ storage: ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 + safekeeper-3.ap-southeast-1.aws.neon.tech: + ansible_host: i-05481f3bc88cfc2d4 From d5d690c0442d812d61a9d65dbf41411b987c3674 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Mon, 20 Feb 2023 16:05:21 +0100 Subject: [PATCH 058/426] Use fqdn for staging console management API (#3642) `console-staging.local` is legacy manual CNAME to 
`neon-internal-api.aws.neon.build` in r53 We could use `neon-internal-api.aws.neon.build` name directly --- .github/ansible/staging.eu-west-1.hosts.yaml | 4 ++-- .github/ansible/staging.us-east-2.hosts.yaml | 4 ++-- .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml | 4 ++-- .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml | 2 +- .../dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 4 ++-- .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index d19026b3f7..b537795704 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-dev-storage-eu-west-1 bucket_region: eu-west-1 - console_mgmt_base_url: http://console-staging.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.build broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min tenant_config: eviction_policy: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index a8b9d41a65..cd8f832af0 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 - console_mgmt_base_url: http://console-staging.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.build broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - 
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min tenant_config: eviction_policy: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index c49b8d2009..ecf57554d9 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.eu-west-1.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 157ae66ed1..91ddd07eae 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -10,7 +10,7 @@ settings: uri: "https://console.stage.neon.tech/psql_session/" domain: "pg.neon.build" sentryEnvironment: "staging" - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy-link pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 99b67d75c1..6ec18ff388 100644 --- 
a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.cloud.stage.neon.tech" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 764bb25b64..9b250fce6e 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.us-east-2.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods From e363911c85c85faa515c8905b08e60c1c431c5e8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 20 Feb 2023 16:18:13 +0100 Subject: [PATCH 059/426] timeline: propagate span to download_remote_layer (#3644) fixes #3643 refs #3604 --- pageserver/src/tenant/timeline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 683c2cd2d3..f2b0a98509 100644 --- 
a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3758,7 +3758,7 @@ impl Timeline { drop(permit); Ok(()) - }, + }.in_current_span(), ); receiver.await.context("download task cancelled")? From ee1eda99212636285f92745140c625c152f6b04a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 20 Feb 2023 16:29:39 +0100 Subject: [PATCH 060/426] eviction: remove EvictionStats::not_considered_due_to_clock_skew Rationale: see the block comment added in this patch. fixes #3641 --- .../src/tenant/timeline/eviction_task.rs | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index e3e7ce4c9d..fc84517cc2 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -100,7 +100,6 @@ impl Timeline { #[allow(dead_code)] #[derive(Debug, Default)] struct EvictionStats { - not_considered_due_to_clock_skew: usize, candidates: usize, evicted: usize, errors: usize, @@ -129,9 +128,21 @@ impl Timeline { let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { - // NB: don't log the error. If there are many layers and the system clock - // is skewed, we'd be flooding the log. - stats.not_considered_due_to_clock_skew += 1; + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. 
+ // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. continue; } }; @@ -188,8 +199,7 @@ impl Timeline { } } } - if stats.not_considered_due_to_clock_skew > 0 || stats.errors > 0 || stats.not_evictable > 0 - { + if stats.errors > 0 || stats.not_evictable > 0 { warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); From 485b2696749a93d67ca58a3d54b9b659766903d1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 20 Feb 2023 16:35:23 +0100 Subject: [PATCH 061/426] eviction: tone down logs to debug!() level if there were no evictions fixes #3647 --- pageserver/src/tenant/timeline/eviction_task.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index fc84517cc2..fe7e7a1654 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -199,7 +199,9 @@ impl Timeline { } } } - if stats.errors > 0 || stats.not_evictable > 0 { + if stats.candidates == stats.not_evictable { + debug!(stats=?stats, "eviction iteration complete"); + } else if stats.errors > 0 || stats.not_evictable > 0 { warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); From e3d75879c0af3e820c6bd9a74e19a9411d8afb52 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Mon, 20 Feb 2023 18:11:06 +0100 Subject: [PATCH 062/426] Use fqdn to access console management API on production (#3651) console-release.local is legacy manual CNAME to neon-internal-api.aws.neon.tech in r53 
We could use neon-internal-api.aws.neon.tech name directly This already was deployed to staging in https://github.com/neondatabase/neon/pull/3642 --- .github/ansible/prod.ap-southeast-1.hosts.yaml | 4 ++-- .github/ansible/prod.eu-central-1.hosts.yaml | 4 ++-- .github/ansible/prod.us-east-2.hosts.yaml | 4 ++-- .github/ansible/prod.us-west-2.hosts.yaml | 4 ++-- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 4 ++-- .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 4 ++-- .../helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 4 ++-- .../prod-us-west-2-eta.neon-proxy-scram-legacy.yaml | 4 ++-- .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 4 ++-- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 71fced23c2..13b44f4052 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 83d4f6f37d..2236dcbc06 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 - 
console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 7f7601cd39..56bece3e77 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 9cad79b986..f03e2d9435 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-us-west-2 bucket_region: us-west-2 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: 
http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index a640d468b3..389da35463 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index c9430877de..7e16ac2d3d 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods 
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 677df6a5be..05e41e7a97 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index 3a5cde4b01..e67a3e4461 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.cloud.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 919a0d503c..5dc23b282e 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ 
b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods From bc7d3c647601d56332121403f1138d16317d2d98 Mon Sep 17 00:00:00 2001 From: Keanu Ashwell Date: Tue, 21 Feb 2023 05:51:54 +1000 Subject: [PATCH 063/426] docs: add dependency requirements for arch based systems (#3588) This pull request adds information on building neon on Arch based system such as Artix, Manjaro, Antergos, etc. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index f8bc1b7736..819693f1f3 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ protobuf-devel ``` +* On Arch based systems, these packages are needed: +```bash +pacman -S base-devel readline zlib libseccomp openssl clang \ +postgresql-libs cmake postgresql protobuf +``` 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` From d7d3f451f0fe6a037c8c4add276da8dcb2ad11f8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 10:03:55 +0200 Subject: [PATCH 064/426] Use tracing panic hook in all binaries (#3634) Enables tracing panic hook in addition to pageserver introduced in #3475: - proxy - safekeeper - storage_broker For proxy, a drop guard which resets the original std panic hook was added on the first commit. 
Other binaries don't need it so they never reset anything by `disarm`ing the drop guard. The aim of the change is to make sure all panics a) have span information b) are logged similar to other messages, not interleaved with other messages as happens right now. Interleaving happens right now because std prints panics to stderr, and other logging happens in stdout. If this was handled gracefully by some utility, the log message splitter would treat panics as belonging to the previous message because it expects a message to start with a timestamp. Cc: #3468 --- libs/utils/src/logging.rs | 112 +++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 50 +--------- proxy/src/main.rs | 1 + safekeeper/src/bin/safekeeper.rs | 5 + storage_broker/src/bin/storage_broker.rs | 10 +- 5 files changed, 128 insertions(+), 50 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 02684d3d16..f770622a60 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -45,3 +45,115 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { Ok(()) } + +/// Disable the default rust panic hook by using `set_hook`. +/// +/// For neon binaries, the assumption is that tracing is configured before with [`init`], after +/// that sentry is configured (if needed). sentry will install it's own on top of this, always +/// processing the panic before we log it. +/// +/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr). +/// If the assumptions about the initialization order are not held, use +/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be +/// lost. +#[must_use] +pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard { + std::panic::set_hook(Box::new(tracing_panic_hook)); + TracingPanicHookGuard::new() +} + +/// Drop guard which restores the std panic hook on drop. 
+/// +/// Tracing should not be used when it's not configured, but we cannot really latch on to any +/// imaginary lifetime of tracing. +pub struct TracingPanicHookGuard { + act: bool, +} + +impl TracingPanicHookGuard { + fn new() -> Self { + TracingPanicHookGuard { act: true } + } + + /// Make this hook guard not do anything when dropped. + pub fn forget(&mut self) { + self.act = false; + } +} + +impl Drop for TracingPanicHookGuard { + fn drop(&mut self) { + if self.act { + let _ = std::panic::take_hook(); + } + } +} + +/// Named symbol for our panic hook, which logs the panic. +fn tracing_panic_hook(info: &std::panic::PanicInfo) { + // following rust 1.66.1 std implementation: + // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 + let location = info.location(); + + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = std::thread::current(); + let thread = thread.name().unwrap_or(""); + let backtrace = std::backtrace::Backtrace::capture(); + + let _entered = if let Some(location) = location { + tracing::error_span!("panic", %thread, location = %PrettyLocation(location)) + } else { + // very unlikely to hit here, but the guarantees of std could change + tracing::error_span!("panic", %thread) + } + .entered(); + + if backtrace.status() == std::backtrace::BacktraceStatus::Captured { + // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really + // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to + // string, maybe even to a TLS one but tracing already does that. + tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}"); + } else { + tracing::error!("{msg}"); + } + + // ensure that we log something on the panic if this hook is left after tracing has been + // unconfigured. 
worst case when teardown is racing the panic is to log the panic twice. + tracing::dispatcher::get_default(|d| { + if let Some(_none) = d.downcast_ref::() { + let location = location.map(PrettyLocation); + log_panic_to_stderr(thread, msg, location, &backtrace); + } + }); +} + +#[cold] +fn log_panic_to_stderr( + thread: &str, + msg: &str, + location: Option>, + backtrace: &std::backtrace::Backtrace, +) { + eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"); +} + +struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); + +impl std::fmt::Display for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column()) + } +} + +impl std::fmt::Debug for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c499fd8d74..01a2c85d74 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -91,9 +91,9 @@ fn main() -> anyhow::Result<()> { // Initialize logging, which must be initialized before the custom panic hook is installed. logging::init(conf.log_format)?; - // disable the default rust panic hook by using `set_hook`. sentry will install it's own on top - // of this, always processing the panic before we log it. - std::panic::set_hook(Box::new(tracing_panic_hook)); + // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. + // disarming this hook on pageserver, because we never tear down tracing. + logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry( @@ -499,50 +499,6 @@ fn cli() -> Command { ) } -/// Named symbol for our panic hook, which logs the panic. 
-fn tracing_panic_hook(info: &std::panic::PanicInfo) { - // following rust 1.66.1 std implementation: - // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 - let location = info.location(); - - let msg = match info.payload().downcast_ref::<&'static str>() { - Some(s) => *s, - None => match info.payload().downcast_ref::() { - Some(s) => &s[..], - None => "Box", - }, - }; - - let thread = std::thread::current(); - let thread = thread.name().unwrap_or(""); - let backtrace = std::backtrace::Backtrace::capture(); - - struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); - - impl std::fmt::Display for PrettyLocation<'_, '_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column()) - } - } - - let _entered = if let Some(location) = location { - tracing::error_span!("panic", %thread, location = %PrettyLocation(location)) - } else { - // very unlikely to hit here, but the guarantees of std could change - tracing::error_span!("panic", %thread) - } - .entered(); - - if backtrace.status() == std::backtrace::BacktraceStatus::Captured { - // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really - // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to - // string, maybe even to a TLS one but tracing already does that. 
- tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}"); - } else { - tracing::error!("{msg}"); - } -} - #[test] fn verify_cli() { cli().debug_assert(); diff --git a/proxy/src/main.rs b/proxy/src/main.rs index c319cb9cfc..85478da3bc 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -43,6 +43,7 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { let _logging_guard = logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("Version: {GIT_VERSION}"); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 1a068412c8..683050e9cd 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -126,7 +126,12 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + // important to keep the order of: + // 1. init logging + // 2. tracing panic hook + // 3. sentry logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::replace_panic_hook_with_tracing_panic_hook().forget(); info!("version: {GIT_VERSION}"); let args_workdir = &args.datadir; diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index c73206b7dc..1a0d261184 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -424,12 +424,16 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { - // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - let args = Args::parse(); + // important to keep the order of: + // 1. init logging + // 2. tracing panic hook + // 3. 
sentry logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("version: {GIT_VERSION}"); ::metrics::set_build_info_metric(GIT_VERSION); From 5c5b03ce08f930480592aeb0a473a82279590efb Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Mon, 20 Feb 2023 17:22:52 +0100 Subject: [PATCH 065/426] Compile xml2 extension --- Dockerfile.compute-node | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 95d6237ecf..1d6c2f354f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -11,7 +11,7 @@ FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev + libicu-dev libxslt1-dev ######################################################################################### # @@ -23,7 +23,8 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \ + --with-libxml --with-libxslt && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers @@ -34,7 +35,8 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> 
/usr/local/pgsql/share/extension/earthdistance.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control ######################################################################################### # @@ -255,6 +257,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libicu67, locales for collations (including ICU) # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS +# libxml2, libxslt1.1 for xml2 RUN apt update && \ apt install --no-install-recommends -y \ locales \ @@ -266,6 +269,8 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 \ libsfcgal1 \ + libxml2 \ + libxslt1.1 \ gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 From 7de373210df708d874d36bf335c669b12f685da9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 13:02:19 +0200 Subject: [PATCH 066/426] Warn when background tasks exceed their configured period (#3654) Fixes #3648. --- pageserver/src/tenant/tasks.rs | 64 +++++++++++++------ .../src/tenant/timeline/eviction_task.rs | 9 +-- test_runner/fixtures/neon_fixtures.py | 1 + 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b126545ee4..db269a1745 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; @@ -60,26 +60,32 @@ async fn compaction_loop(tenant_id: TenantId) { let tenant = tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; - let mut sleep_duration = tenant.get_compaction_period(); - if sleep_duration == Duration::ZERO { + let started_at = Instant::now(); + + let period = tenant.get_compaction_period(); + let sleep_duration = if period == Duration::ZERO { info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); + Duration::from_secs(10) } else { // Run compaction if let Err(e) = tenant.compaction_iteration(&ctx).await { - sleep_duration = wait_duration; - error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); + error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period } - } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "compaction"); // Sleep tokio::select! { @@ -122,23 +128,26 @@ async fn gc_loop(tenant_id: TenantId) { }, }; - let gc_period = tenant.get_gc_period(); + let started_at = Instant::now(); + + let period = tenant.get_gc_period(); let gc_horizon = tenant.get_gc_horizon(); - let mut sleep_duration = gc_period; - if sleep_duration == Duration::ZERO { + let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. 
- sleep_duration = Duration::from_secs(10); + Duration::from_secs(10) } else { // Run gc - if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await - { - sleep_duration = wait_duration; - error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); - } + let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await; + if let Err(e) = res { + error!("Gc failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period } - } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "gc"); // Sleep tokio::select! { @@ -197,3 +206,18 @@ async fn wait_for_active_tenant( } } } + +pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { + // Duration::ZERO will happen because it's the "disable [bgtask]" value. + if elapsed >= period && period != Duration::ZERO { + // humantime does no significant digits clamping whereas Duration's debug is a bit more + // intelligent. however it makes sense to keep the "configuration format" for period, even + // though there's no way to output the actual config value. 
+ warn!( + ?elapsed, + period = %humantime::format_duration(period), + task, + "task iteration took longer than the configured period" + ); + } +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index fe7e7a1654..0dd169363e 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -44,6 +44,7 @@ impl Timeline { loop { let policy = self.get_eviction_policy(); let cf = self.eviction_iteration(&policy, cancel.clone()).await; + match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { @@ -78,13 +79,7 @@ impl Timeline { ControlFlow::Continue(()) => (), } let elapsed = start.elapsed(); - if elapsed > p.period { - warn!( - configured_period = %humantime::format_duration(p.period), - last_period = %humantime::format_duration(elapsed), - "this eviction period took longer than the configured period" - ); - } + crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); ControlFlow::Continue(start + p.period) } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 59d616eb6f..63196609cc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2079,6 +2079,7 @@ class NeonPageserver(PgProtocol): ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress + ".*task iteration took longer than the configured period.*", ] def start( From b220ba6cd1973cdbe4d886904f2c217c6df72c77 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 13:42:11 +0200 Subject: [PATCH 067/426] add 
random init delay for background tasks (#3655) Fixes #3649. --- pageserver/src/tenant/tasks.rs | 66 +++++++++++++++++-- .../src/tenant/timeline/eviction_task.rs | 13 ++++ 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index db269a1745..e9ce52d1ab 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -11,6 +11,7 @@ use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantId; @@ -53,12 +54,14 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let cancel = task_mgr::shutdown_token(); let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); + let mut first = true; loop { trace!("waking up"); let tenant = tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request"); return; }, @@ -68,9 +71,19 @@ async fn compaction_loop(tenant_id: TenantId) { }, }; + let period = tenant.get_compaction_period(); + + // TODO: we shouldn't need to await to find tenant and this could be moved outside of + // loop + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; + } + } + let started_at = Instant::now(); - let period = tenant.get_compaction_period(); let sleep_duration = if period == Duration::ZERO { info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. @@ -89,7 +102,7 @@ async fn compaction_loop(tenant_id: TenantId) { // Sleep tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request during idling"); break; }, @@ -111,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let cancel = task_mgr::shutdown_token(); // GC might require downloading, to find the cutoff LSN that corresponds to the // cutoff specified as time. let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; loop { trace!("waking up"); let tenant = tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request"); return; }, @@ -128,9 +143,17 @@ async fn gc_loop(tenant_id: TenantId) { }, }; + let period = tenant.get_gc_period(); + + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; + } + } + let started_at = Instant::now(); - let period = tenant.get_gc_period(); let gc_horizon = tenant.get_gc_horizon(); let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { info!("automatic GC is disabled"); @@ -151,7 +174,7 @@ async fn gc_loop(tenant_id: TenantId) { // Sleep tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request during idling"); break; }, @@ -207,6 +230,37 @@ async fn wait_for_active_tenant( } } +#[derive(thiserror::Error, Debug)] +#[error("cancelled")] +pub(crate) struct Cancelled; + +/// Provide a random delay for background task initialization. +/// +/// This delay prevents a thundering herd of background tasks and will likely keep them running on +/// different periods for more stable load. 
+pub(crate) async fn random_init_delay( + period: Duration, + cancel: &CancellationToken, +) -> Result<(), Cancelled> { + use rand::Rng; + + let d = { + let mut rng = rand::thread_rng(); + + // gen_range asserts that the range cannot be empty, which it could be because period can + // be set to zero to disable gc or compaction, so lets set it to be at least 10s. + let period = std::cmp::max(period, Duration::from_secs(10)); + + // semi-ok default as the source of jitter + rng.gen_range(Duration::ZERO..=period) + }; + + tokio::select! { + _ = cancel.cancelled() => Err(Cancelled), + _ = tokio::time::sleep(d) => Ok(()), + } +} + pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { // Duration::ZERO will happen because it's the "disable [bgtask]" value. if elapsed >= period && period != Duration::ZERO { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 0dd169363e..2aad0ef0f3 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -41,6 +41,19 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { + use crate::tenant::tasks::random_init_delay; + { + let policy = self.get_eviction_policy(); + let period = match policy { + EvictionPolicy::LayerAccessThreshold(lat) => lat.period, + EvictionPolicy::NoEviction => Duration::from_secs(10), + }; + if random_init_delay(period, &cancel).await.is_err() { + info!("shutting down"); + return; + } + } + loop { let policy = self.get_eviction_policy(); let cf = self.eviction_iteration(&policy, cancel.clone()).await; From c0de7f5cd832d7d784c8ce47bb71310c1a13f47d Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 21 Feb 2023 17:31:23 +0100 Subject: [PATCH 068/426] Build `pg_jsonschema` and `pg_graphql` extensions (#3535) ## Describe your changes 
Layer for building pg extensions written in Rust. It required forking: * `cargo-pgx` (in order not to hit an ABI mismatch error: `cargo-pgx` hardcodes the ABI, see tcdi/pgx#1032) * `pg_jsonschema` (to use forked `cargo-pgx` version) * `pgx-contrib-spiext` (to use forked `cargo-pgx`) * `pg_graphql` (to use forked `cargo-pgx` and `pgx-contrib-spiext` version) Before the patch: ``` postgres=# create extension pg_jsonschema; 2023-02-02 17:45:23.120 UTC [35] ERROR: incompatible library "/usr/local/lib/pg_jsonschema.so": ABI mismatch 2023-02-02 17:45:23.120 UTC [35] DETAIL: Server has ABI "Neon Postgres", library has "PostgreSQL". 2023-02-02 17:45:23.120 UTC [35] STATEMENT: create extension pg_jsonschema; ERROR: incompatible library "/usr/local/lib/pg_jsonschema.so": ABI mismatch DETAIL: Server has ABI "Neon Postgres", library has "PostgreSQL". ``` After the patch: ``` postgres=# create extension pg_jsonschema; CREATE EXTENSION postgres=# select json_matches_schema('{"type": "object"}', '{}'); json_matches_schema --------------------- t postgres=# create extension pg_graphql; CREATE EXTENSION postgres=# create table book(id int primary key, title text); CREATE TABLE postgres=# insert into book(id, title) values (1, 'book 1'); INSERT 0 1 postgres=# select graphql.resolve($$ query { bookCollection { edges { node { id } } } } $$); resolve ---------------------------------------------------------------- {"data": {"bookCollection": {"edges": [{"node": {"id": 1}}]}}} (1 row) ``` ## Issue ticket number and link Closes #3429, #3096 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [x] If it is a core feature, I have added thorough tests. - [x] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [x] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
`pg_jsonschema` extension will be available for our customers --- Dockerfile.compute-node | 59 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 1d6c2f354f..2f34f6dc15 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,3 +1,4 @@ +ARG PG_VERSION ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com ARG IMAGE=rust ARG TAG=pinned @@ -180,6 +181,62 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control +######################################################################################### +# +# Layer "rust extensions" +# This layer is used to build `pgx` deps +# +######################################################################################### +FROM build-deps AS rust-extensions-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN apt-get update && \ + apt-get install -y curl libclang-dev cmake && \ + useradd -ms /bin/bash nonroot -b /home + +ENV HOME=/home/nonroot +ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +USER nonroot +WORKDIR /home/nonroot +ARG PG_VERSION + +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ + rm rustup-init && \ + cargo install --git https://github.com/vadim2404/pgx --branch neon_abi_v0.6.1 --locked cargo-pgx && \ + /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + +######################################################################################### +# +# Layer "pg-jsonschema-pg-build" +# Compile "pg_jsonschema" extension +# 
+######################################################################################### + +FROM rust-extensions-build AS pg-jsonschema-pg-build + +RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \ + cd pg_jsonschema && \ + cargo pgx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control + +######################################################################################### +# +# Layer "pg-graphql-pg-build" +# Compile "pg_graphql" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-graphql-pg-build + +RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \ + cd pg_graphql && \ + cargo pgx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -195,6 +252,8 @@ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From fe462de85bb1991db4661e8fb4f4eb666a02b318 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 19:31:53 +0200 Subject: [PATCH 069/426] fix: log download failed error (#3661) Fixes #3659 --- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 1 + test_runner/regress/test_tenants.py | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 
e9ce52d1ab..20d1d2bfb6 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -74,7 +74,7 @@ async fn compaction_loop(tenant_id: TenantId) { let period = tenant.get_compaction_period(); // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop + // loop, #3501. There are also additional "allowed_errors" in tests. if first { first = false; if random_init_delay(period, &cancel).await.is_err() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f2b0a98509..8bc02cd10a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3745,6 +3745,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. + info!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index e56bb1b469..9e75396799 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,6 +250,10 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) + # this is until #3501 + env.pageserver.allowed_errors.append( + ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" + ) client = env.pageserver.http_client() From 5d001b1e5ab1789e55b086e943d32b7a08c65287 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 20:20:13 +0200 Subject: [PATCH 070/426] chore: ignore all compaction inactive tenant errors (#3665) these are happening in tests because of #3655 but they sure took some time to appear. 
makes the `Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant` into a globally allowed error, because it has been seen failing on different test cases. --- test_runner/fixtures/neon_fixtures.py | 2 ++ test_runner/regress/test_tenants.py | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 63196609cc..73f224039e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2080,6 +2080,8 @@ class NeonPageserver(PgProtocol): ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", + # this is until #3501 + ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", ] def start( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9e75396799..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,10 +250,6 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) - # this is until #3501 - env.pageserver.allowed_errors.append( - ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" - ) client = env.pageserver.http_client() From 225add041fe6bed6ceea24e11bb4dab9c2314919 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:09:31 +0200 Subject: [PATCH 071/426] calculate_logical_size: no longer use spawn_blocking (#3664) Calculation of logical size is now async because of layer downloads, so we shouldn't use spawn_blocking for it. 
Use of `spawn_blocking` exhausted resources which are needed by `tokio::io::copy` when copying from a stream to a file which led to a deadlock. Fixes: #3657 --- pageserver/src/tenant/timeline.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8bc02cd10a..176eb61ff3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1770,15 +1770,9 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); - tokio::task::spawn_blocking(move || { - // Run in a separate thread since this can do a lot of - // synchronous file IO without .await inbetween - // if there are no RemoteLayers that would require downloading. - let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) - }) - .await - .context("Failed to spawn calculation result task")? + self_calculation + .calculate_logical_size(init_lsn, cancel, &ctx) + .await }; let timeline_state_cancellation = async { loop { @@ -1811,7 +1805,7 @@ impl Timeline { tokio::pin!(calculation); loop { tokio::select! { - res = &mut calculation => { return res } + res = &mut calculation => { return res } reason = timeline_state_cancellation => { debug!(reason = reason, "cancelling calculation"); cancel.cancel(); From b8b8c19fb41df2db8addc6be0e56c29b29ce84a3 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:14:08 +0200 Subject: [PATCH 072/426] fix: hold permit until GetObject eof (#3663) previously we applied the ratelimiting only up to receiving the headers from s3, or somewhere near it. the commit adds an adapter which carries the permit until the AsyncRead has been disposed. fixes #3662.
--- Cargo.lock | 1 + libs/remote_storage/Cargo.toml | 2 +- libs/remote_storage/src/s3_bucket.rs | 45 +++++++++++++++++++++++----- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d154b4eaea..dab3d12263 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3054,6 +3054,7 @@ dependencies = [ "hyper", "metrics", "once_cell", + "pin-project-lite", "serde", "serde_json", "tempfile", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4382fbac32..15812e8439 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -21,7 +21,7 @@ toml_edit.workspace = true tracing.workspace = true metrics.workspace = true utils.workspace = true - +pin-project-lite.workspace = true workspace_hack.workspace = true [dev-dependencies] diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 18a2c5dedd..93f5e0596e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -20,7 +20,10 @@ use aws_sdk_s3::{ }; use aws_smithy_http::body::SdkBody; use hyper::Body; -use tokio::{io, sync::Semaphore}; +use tokio::{ + io::{self, AsyncRead}, + sync::Semaphore, +}; use tokio_util::io::ReaderStream; use tracing::debug; @@ -102,7 +105,7 @@ pub struct S3Bucket { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. 
- concurrency_limiter: Semaphore, + concurrency_limiter: Arc, } #[derive(Default)] @@ -162,7 +165,7 @@ impl S3Bucket { client, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, - concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), + concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) } @@ -194,9 +197,10 @@ impl S3Bucket { } async fn download_object(&self, request: GetObjectRequest) -> Result { - let _guard = self + let permit = self .concurrency_limiter - .acquire() + .clone() + .acquire_owned() .await .context("Concurrency limiter semaphore got closed during S3 download") .map_err(DownloadError::Other)?; @@ -217,9 +221,10 @@ impl S3Bucket { let metadata = object_output.metadata().cloned().map(StorageMetadata); Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new( + download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new( + permit, object_output.body.into_async_read(), - )), + ))), }) } Err(SdkError::ServiceError { @@ -240,6 +245,32 @@ impl S3Bucket { } } +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. 
+ struct RatelimitedAsyncRead { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl RatelimitedAsyncRead { + fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + RatelimitedAsyncRead { permit, inner } + } +} + +impl AsyncRead for RatelimitedAsyncRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut io::ReadBuf<'_>, + ) -> std::task::Poll> { + let this = self.project(); + this.inner.poll_read(cx, buf) + } +} + #[async_trait::async_trait] impl RemoteStorage for S3Bucket { async fn list(&self) -> anyhow::Result> { From 2caece207715714fc2ef2b803bf92e8ca50ea7db Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 21 Feb 2023 21:11:52 +0100 Subject: [PATCH 073/426] Add -v to ansible invocations (#3670) To get more debug output on failures --- .github/workflows/deploy-dev.yml | 2 +- .github/workflows/deploy-prod.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index 409517bf63..b080a29f7c 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -67,7 +67,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - name: Cleanup ansible folder diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index 
540d187274..6096ac8ab9 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -68,7 +68,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-proxy-prod-new: From 95018672fa05fd14b975a6ab0a516dce7e89d21b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 22 Feb 2023 12:55:41 +0300 Subject: [PATCH 074/426] Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671) We migrated all timelines to `safekeeper-3.ap-southeast-1.aws.neon.tech`, now old instance can be removed. 
--- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 13b44f4052..8ccb67b04a 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -32,8 +32,6 @@ storage: hosts: safekeeper-0.ap-southeast-1.aws.neon.tech: ansible_host: i-0d6f1dc5161eef894 - safekeeper-1.ap-southeast-1.aws.neon.tech: - ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 safekeeper-3.ap-southeast-1.aws.neon.tech: From 965b4f4ae253143c26b9ced84aa836df264d0fdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Wed, 22 Feb 2023 16:50:07 +0200 Subject: [PATCH 075/426] Change the staging neon-proxy-scram update strategy to RollingUpdate (#3678) ## Describe your changes When we deploy the proxy with the default Recreate strategy, there's always some downtime and existing connections will be shut down. Change the strategy to RollingUpdate and delay the kill signal by one week. AWS Network Loadbalancer keeps the existing connections alive for as long as the pods are alive, but will direct new connections to new pods. ## Issue ticket number and link https://github.com/neondatabase/neon/issues/3333 --- .../dev-eu-west-1-zeta.neon-proxy-scram.yaml | 15 +++++++++++++++ .../dev-us-east-2-beta.neon-proxy-scram.yaml | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index ecf57554d9..ad712c4745 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -1,6 +1,21 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. 
+deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + image: repository: neondatabase/neon diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 9b250fce6e..a091be1016 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -1,6 +1,21 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + image: repository: neondatabase/neon From 412e0aa9858f4b406c08488ffa0c14eebfd941ec Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 22 Feb 2023 18:28:01 +0200 Subject: [PATCH 076/426] Skip largest N holes during compaction (#3597) ## Describe your changes This is yet another attempt to address problem with storage size ballooning #2948 Previous PR #3348 tries to address this problem by maintaining list of holes for each layer. The problem with this approach is that we have to load all layer on pageserver start. Lazy loading of layers is not possible any more. This PR tries to collect information of N largest holes on compaction time and exclude this holes from produced layers. It can cause generation of larger number of layers (up to 2 times) and producing small layers. 
But it requires minimal changes in code and doesn't affect storage format. For graphical explanation please see thread: https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451 ## Issue ticket number and link #2948 #3348 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- pageserver/src/tenant/timeline.rs | 71 ++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 176eb61ff3..d46ac26e7d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,6 +19,7 @@ use tracing::*; use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; +use std::collections::BinaryHeap; use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; @@ -82,6 +83,25 @@ enum FlushLoopState { Exited, } +/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Hole { + key_range: Range, + coverage_size: usize, +} + +impl Ord for Hole { + fn cmp(&self, other: &Self) -> Ordering { + other.coverage_size.cmp(&self.coverage_size) // inverse order + } +} + +impl PartialOrd for Hole { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -2941,6 +2961,47 @@ impl Timeline { }, )?; + // Determine N largest holes where N is number of compacted layers. 
+ let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); // Is'n it better to hold original layers lock till here? + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + for (next_key, _next_lsn, _size) in itertools::process_results( + deltas_to_compact.iter().map(|l| l.key_iter(ctx)), + |iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0), + )? { + if let Some(prev_key) = prev { + // just first fast filter + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. + let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } + } + } + } + prev = Some(next_key.next()); + } + drop(layers); + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + let mut next_hole = 0; // index of next hole in holes vector + // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. // @@ -3035,14 +3096,22 @@ impl Timeline { } if writer.is_some() { let written_size = writer.as_mut().unwrap().size(); - // check if key cause layer overflow... 
+ let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... if is_dup_layer || dup_end_lsn.is_valid() || written_size + key_values_total_size > target_file_size + || contains_hole { // ... if so, flush previous layer and prepare to write new one new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); writer = None; + + if contains_hole { + // skip hole + next_hole += 1; + } } } // Remember size of key value because at next iteration we will access next item From b0311cfdeb84d10dfb4ac24dc209b212129324da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Wed, 22 Feb 2023 20:15:37 +0200 Subject: [PATCH 077/426] Change the production neon-proxy-scram update strategy to RollingUpdate (#3683) ## Describe your changes The same change in production as was done in staging by https://github.com/neondatabase/neon/pull/3678 ## Issue ticket number and link https://github.com/neondatabase/neon/issues/3333 --- ...-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 16 ++++++++++++++++ ...prod-eu-central-1-gamma.neon-proxy-scram.yaml | 16 ++++++++++++++++ .../prod-us-east-2-delta.neon-proxy-scram.yaml | 16 ++++++++++++++++ .../prod-us-west-2-eta.neon-proxy-scram.yaml | 16 ++++++++++++++++ 4 files changed, 64 insertions(+) diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 389da35463..8d65e94d00 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. 
+deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 7e16ac2d3d..f806b37482 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 05e41e7a97..38719f64e7 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. 
+deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 5dc23b282e..d5a7d6d575 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon From a8d7360881cc00e4884da97068b763d0dfde156b Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 22 Feb 2023 14:12:09 +0100 Subject: [PATCH 078/426] Compile `hypopg` extension --- Dockerfile.compute-node | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 2f34f6dc15..514d5c28bd 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -181,6 +181,21 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control 
+######################################################################################### +# +# Layer "hypopg-pg-build" +# compile hypopg extension +# +######################################################################################### +FROM build-deps AS hypopg-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \ + mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control + ######################################################################################### # # Layer "rust extensions" @@ -254,6 +269,7 @@ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From f3ad6359114aec85fef2cc078acf3ab69e99d20c Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 22 Feb 2023 16:30:13 +0100 Subject: [PATCH 079/426] Compile `pgrouting` extension --- Dockerfile.compute-node | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 514d5c28bd..497e23a250 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -60,10 +60,11 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / +ENV PATH "/usr/local/pgsql/bin:$PATH" + RUN 
wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ ./autogen.sh && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ cd extensions/postgis && \ @@ -77,6 +78,15 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control +RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ + mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ + mkdir build && \ + cd build && \ + cmake .. && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control + ######################################################################################### # # Layer "plv8-build" From eb403da8143a7b764291abf1354fc3e586835a43 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 22 Feb 2023 21:19:05 +0200 Subject: [PATCH 080/426] Use debug level for successful GET http requests (#3681) We started rather frequently scrap some apis for metadata. This includes layer eviction tester, I believe console does that too. 
It should eliminate these logs: https://neonprod.grafana.net/goto/rr_ace1Vz?orgId=1 (Note the rate around 2k messages per minute) --- libs/utils/src/http/endpoint.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 483ff15c55..9c300de7a7 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -4,13 +4,14 @@ use anyhow::{anyhow, Context}; use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::http::HeaderValue; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; +use hyper::{Method, StatusCode}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; use tokio::task::JoinError; -use tracing::info; +use tracing; use std::future::Future; use std::net::TcpListener; @@ -27,7 +28,14 @@ static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { }); async fn logger(res: Response, info: RequestInfo) -> Result, ApiError> { - info!("{} {} {}", info.method(), info.uri().path(), res.status(),); + // cannot factor out the Level to avoid the repetition + // because tracing can only work with const Level + // which is not the case here + if info.method() == Method::GET && res.status() == StatusCode::OK { + tracing::debug!("{} {} {}", info.method(), info.uri().path(), res.status()); + } else { + tracing::info!("{} {} {}", info.method(), info.uri().path(), res.status()); + } Ok(res) } @@ -203,7 +211,7 @@ pub fn serve_thread_main( where S: Future + Send + Sync, { - info!("Starting an HTTP endpoint at {}", listener.local_addr()?); + tracing::info!("Starting an HTTP endpoint at {}", listener.local_addr()?); // Create a Service from the router above to handle incoming requests. 
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap(); From 093570af20c7e619e53866ada789f83bb13b3eb5 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 22 Feb 2023 15:57:52 +0100 Subject: [PATCH 081/426] Compile `pg_hashids` extension --- Dockerfile.compute-node | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 497e23a250..702e882957 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -206,6 +206,21 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypo make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control +######################################################################################### +# +# Layer "pg-hashids-pg-build" +# compile pg_hashids extension +# +######################################################################################### +FROM build-deps AS pg-hashids-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control + ######################################################################################### # # Layer "rust extensions" @@ -280,6 +295,7 @@ COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 0692fffbf30b0322c86d6919b2a3b865348541df Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 22 Feb 2023 22:52:22 +0400 Subject: [PATCH 082/426] Bump vendor/postgres to include hotfix for unlogged tables with indexes. 
https://github.com/neondatabase/postgres/pull/259 https://github.com/neondatabase/postgres/pull/262 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index f210ac524b..b44ee1d9a5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit f210ac524b42d2d6f404f8505c64de36e977d17c +Subproject commit b44ee1d9a5b061ababb31f89a4e30a1795573f51 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 33f9763454..303fa4050f 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 33f976345490351f951d72f81621c2263c186c9a +Subproject commit 303fa4050fafba3771052b3d49b8e2d00d6ea2e3 From 5ebf7e5619db5961ed056d440d7d516e24cb28fe Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 22 Feb 2023 12:29:27 +0100 Subject: [PATCH 083/426] Fix `pg_jsonschema` and `pg_graphql` --- Dockerfile.compute-node | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 702e882957..bf5e6d99c2 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -261,6 +261,8 @@ FROM rust-extensions-build AS pg-jsonschema-pg-build RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \ cd pg_jsonschema && \ cargo pgx install --release && \ + # it's needed to enable extension because it uses untrusted C language + sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_jsonschema.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control ######################################################################################### @@ -275,6 +277,8 @@ FROM rust-extensions-build AS pg-graphql-pg-build RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \ cd pg_graphql && \ cargo pgx install 
--release && \ + # it's needed to enable extension because it uses untrusted C language + sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control ######################################################################################### From 20a4d817cecd2ec3d63abf3d2cb4385ef9e72f41 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 23 Feb 2023 15:10:22 +0100 Subject: [PATCH 084/426] Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581) ## Describe your changes Rebase vendored PostgreSQL onto 14.7 and 15.2 ## Issue ticket number and link #3579 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [x] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [x] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ``` The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL 14 and 15.2 for PostgreSQL 15. 
``` --- test_runner/fixtures/neon_fixtures.py | 10 +- test_runner/regress/test_tenant_size.py | 164 ++++++++++++++++-------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 4 files changed, 120 insertions(+), 58 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 73f224039e..c4b3d057f8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1669,7 +1669,7 @@ class AbstractNeonCli(abc.ABC): timeout=timeout, ) if not res.returncode: - log.info(f"Run success: {res.stdout}") + log.info(f"Run {res.args} success: {res.stdout}") elif check_return_code: # this way command output will be in recorded and shown in CI in failure message msg = f"""\ @@ -3463,6 +3463,14 @@ def wait_for_last_flush_lsn( return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) +def wait_for_wal_insert_lsn( + env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId +) -> Lsn: + """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" + last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) + return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + + def fork_at_current_lsn( env: NeonEnv, pg: Postgres, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 8c2996f491..a4b5f7739a 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -3,8 +3,15 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn -from fixtures.types import Lsn +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + Postgres, + wait_for_last_flush_lsn, + wait_for_wal_insert_lsn, +) +from fixtures.types import Lsn, TenantId, TimelineId def 
test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -324,7 +331,7 @@ def test_single_branch_get_tenant_size_grows( # inserts is larger than gc_horizon. for example 0x20000 here hid the fact # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. - gc_horizon = 0x30000 + gc_horizon = 0x38000 neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() @@ -334,29 +341,75 @@ def test_single_branch_get_tenant_size_grows( http_client = env.pageserver.http_client() - collected_responses: List[Tuple[Lsn, int]] = [] + collected_responses: List[Tuple[str, Lsn, int]] = [] size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int): - if current_lsn - initdb_lsn > gc_horizon: + def check_size_change( + current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int + ): + if current_lsn - initdb_lsn >= gc_horizon: assert ( - size >= prev + size >= prev_size ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size" else: assert ( - size > prev + size > prev_size ), "tenant_size should grow, because we continue to add WAL to initial snapshot size" - with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: - initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + def get_current_consistent_size( + env: NeonEnv, + pg: Postgres, + size_debug_file, # apparently there is no public signature for open()... 
+ http_client: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Tuple[Lsn, int]: + consistent = False + size_debug = None + + current_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + # We want to make sure we have a self-consistent set of values. + # Size changes with WAL, so only if both before and after getting + # the size of the tenant reports the same WAL insert LSN, we're OK + # to use that (size, LSN) combination. + # Note that 'wait_for_wal_flush_lsn' is not accurate enough: There + # can be more wal after the flush LSN that can arrive on the + # pageserver before we're requesting the page size. + # Anyway, in general this is only one iteration, so in general + # this is fine. + while not consistent: + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + size_debug = http_client.tenant_size_debug(tenant_id) + + after_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + consistent = current_lsn == after_lsn + current_lsn = after_lsn + size_debug_file.write(size_debug) + return (current_lsn, size) + + with env.postgres.create_start( + branch_name, + tenant_id=tenant_id, + ### autovacuum is disabled to limit WAL logging. 
+ config_lines=["autovacuum=off"], + ) as pg: + (initdb_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) + collected_responses.append(("INITDB", initdb_lsn, size)) + with pg.cursor() as cur: - cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") + cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL) WITH (fillfactor = 40)") + + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) + collected_responses.append(("CREATE", current_lsn, size)) batch_size = 100 - i = 0 - while True: + for i in range(3): with pg.cursor() as cur: cur.execute( f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)", @@ -365,27 +418,24 @@ def test_single_branch_get_tenant_size_grows( i += 1 - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + prev_size = collected_responses[-1][2] + if size == 0: + assert prev_size == 0 + else: + # branch start shouldn't be past gc_horizon yet + # thus the size should grow as we insert more data + # "gc_horizon" is tuned so that it kicks in _after_ the + # insert phase, but before the update phase ends. 
+ assert ( + current_lsn - initdb_lsn <= gc_horizon + ), "Tuning of GC window is likely out-of-date" + assert size > prev_size - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) - - if len(collected_responses) > 0: - prev = collected_responses[-1][1] - if size == 0: - assert prev == 0 - else: - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - assert current_lsn - initdb_lsn <= gc_horizon - assert size > prev - - collected_responses.append((current_lsn, size)) - - if len(collected_responses) > 2: - break + collected_responses.append(("INSERT", current_lsn, size)) while True: with pg.cursor() as cur: @@ -397,18 +447,15 @@ def test_single_branch_get_tenant_size_grows( if updated == 0: break - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + prev_size = collected_responses[-1][2] - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - prev = collected_responses[-1][1] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) - - collected_responses.append((current_lsn, size)) + collected_responses.append(("UPDATE", current_lsn, size)) while True: with pg.cursor() as cur: @@ -418,40 +465,47 @@ def test_single_branch_get_tenant_size_grows( if deleted == 0: break - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + 
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append((current_lsn, size)) + collected_responses.append(("DELETE", current_lsn, size)) with pg.cursor() as cur: cur.execute("DROP TABLE t0") - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + # The size of the tenant should still be as large as before we dropped + # the table, because the drop operation can still be undone in the PITR + # defined by gc_horizon. + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append((current_lsn, size)) + collected_responses.append(("DROP", current_lsn, size)) # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. 
- for lsn, size in collected_responses: - log.info(f"collected: {lsn}, {size}") + for phase, lsn, size in collected_responses: + log.info(f"collected: {phase}, {lsn}, {size}") env.pageserver.stop() env.pageserver.start() + size_after = http_client.tenant_size(tenant_id) + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) size_debug_file.close() - size_after = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev = collected_responses[-1][2] assert size_after == prev, "size after restarting pageserver should not have changed" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b44ee1d9a5..468d3c0824 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b44ee1d9a5b061ababb31f89a4e30a1795573f51 +Subproject commit 468d3c08245906f083fed1009759f9f953f5915d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 303fa4050f..9a2093383a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 303fa4050fafba3771052b3d49b8e2d00d6ea2e3 +Subproject commit 9a2093383ae19906f025b008ceecf89ebc9ea869 From ec4ecdd5435e1a33842ccfb7322d24d623a48dcf Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 23 Feb 2023 16:01:48 +0100 Subject: [PATCH 085/426] Enable postgres SPI extensions --- Dockerfile.compute-node | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index bf5e6d99c2..68278ed07b 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -32,11 +32,15 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ # Enable some of contrib extensions + echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ - echo 
'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control ######################################################################################### From c79dd8d4580d044f9bac7318788bc660da19c14a Mon Sep 17 00:00:00 2001 From: Sam Kleinman Date: Thu, 23 Feb 2023 13:19:39 -0500 Subject: [PATCH 086/426] compute_ctl: support for fetching spec from control plane (#3610) --- Cargo.lock | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 36 +++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index dab3d12263..c97e9a196a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -854,6 +854,7 @@ dependencies = [ "opentelemetry", "postgres", "regex", + "reqwest", "serde", "serde_json", "tar", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index f8c3481f57..46b0e80896 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -17,6 +17,7 @@ regex.workspace = true serde.workspace = true serde_json.workspace = true tar.workspace = true +reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", 
"rt-multi-thread"] } tokio-postgres.workspace = true tracing.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 49cf1cd347..a4e9262072 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -65,6 +65,9 @@ fn main() -> Result<()> { let spec = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let compute_id = matches.get_one::("compute-id"); + let control_plane_uri = matches.get_one::("control-plane-uri"); + // Try to use just 'postgres' if no path is provided let pgbin = matches.get_one::("pgbin").unwrap(); @@ -77,8 +80,27 @@ fn main() -> Result<()> { let path = Path::new(sp); let file = File::open(path)?; serde_json::from_reader(file)? + } else if let Some(id) = compute_id { + if let Some(cp_base) = control_plane_uri { + let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec"); + let jwt: String = match std::env::var("NEON_CONSOLE_JWT") { + Ok(v) => v, + Err(_) => "".to_string(), + }; + + reqwest::blocking::Client::new() + .get(cp_uri) + .header("Authorization", jwt) + .send()? + .json()? 
+ } else { + panic!( + "must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"", + control_plane_uri, compute_id + ); + } } else { - panic!("cluster spec should be provided via --spec or --spec-path argument"); + panic!("compute spec should be provided via --spec or --spec-path argument"); } } }; @@ -227,6 +249,18 @@ fn cli() -> clap::Command { .long("spec-path") .value_name("SPEC_PATH"), ) + .arg( + Arg::new("compute-id") + .short('i') + .long("compute-id") + .value_name("COMPUTE_ID"), + ) + .arg( + Arg::new("control-plane-uri") + .short('p') + .long("control-plane-uri") + .value_name("CONTROL_PLANE"), + ) } #[test] From 9f906ff2369ac9cea4a92245d88e5a70cf5f7e02 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 23 Feb 2023 19:56:21 +0100 Subject: [PATCH 087/426] Add pageserver-2.us-east-2.aws.neon.tech (#3701) --- .github/ansible/prod.us-east-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 56bece3e77..22c705e1cf 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -27,6 +27,8 @@ storage: ansible_host: i-062227ba7f119eb8c pageserver-1.us-east-2.aws.neon.tech: ansible_host: i-0b3ec0afab5968938 + pageserver-2.us-east-2.aws.neon.tech: + ansible_host: i-0d7a1c4325e71421d safekeepers: hosts: From f51b48fa492dd9ff65ac5026c2b6928b41c4e25a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 24 Feb 2023 13:45:32 +0200 Subject: [PATCH 088/426] Fix UNLOGGED tables. Instead of trying to create missing files on the way, send init fork contents as main fork from pageserver during basebackup. Add test for that. Call put_rel_drop for init forks; previously they weren't removed. Bump vendor/postgres to revert previous approach on Postgres side. 
Co-authored-by: Arseny Sher ref https://github.com/neondatabase/postgres/pull/264 ref https://github.com/neondatabase/postgres/pull/259 ref https://github.com/neondatabase/neon/issues/1222 --- libs/pageserver_api/src/reltag.rs | 9 ++++++ pageserver/src/basebackup.rs | 45 ++++++++++++++++++++-------- pageserver/src/walingest.rs | 4 +-- test_runner/regress/test_unlogged.py | 34 +++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 6 files changed, 79 insertions(+), 17 deletions(-) create mode 100644 test_runner/regress/test_unlogged.py diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 43d38bd986..12693379f5 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -98,6 +98,15 @@ impl RelTag { name } + + pub fn with_forknum(&self, forknum: u8) -> Self { + RelTag { + forknum, + spcnode: self.spcnode, + dbnode: self.dbnode, + relnode: self.relnode, + } + } } /// diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 06d4853274..41fa0a67bb 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -33,6 +33,7 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; @@ -190,14 +191,31 @@ where { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; - // Gather and send relational files in each database if full backup is requested. - if self.full_backup { - for rel in self - .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) - .await? - { - self.add_rel(rel).await?; + // If full backup is requested, include all relation files. + // Otherwise only include init forks of unlogged relations. 
+ let rels = self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await?; + for &rel in rels.iter() { + // Send init fork as main fork to provide well formed empty + // contents of UNLOGGED relations. Postgres copies it in + // `reinit.c` during recovery. + if rel.forknum == INIT_FORKNUM { + // I doubt we need _init fork itself, but having it at least + // serves as a marker relation is unlogged. + self.add_rel(rel, rel).await?; + self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; + continue; + } + + if self.full_backup { + if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM)) + { + // skip this, will include it when we reach the init fork + continue; + } + self.add_rel(rel, rel).await?; } } } @@ -220,15 +238,16 @@ where Ok(()) } - async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + /// Add contents of relfilenode `src`, naming it as `dst`. + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(tag, self.lsn, false, self.ctx) + .get_rel_size(src, self.lsn, false, self.ctx) .await?; // If the relation is empty, create an empty file if nblocks == 0 { - let file_name = tag.to_segfile_name(0); + let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar.append(&header, &mut io::empty()).await?; return Ok(()); @@ -244,12 +263,12 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } - let file_name = tag.to_segfile_name(seg as u32); + let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; self.ar.append(&header, segment_data.as_slice()).await?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3761c65668..63d568a342 100644 --- 
a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -37,7 +37,7 @@ use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; @@ -762,7 +762,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=INIT_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py new file mode 100644 index 0000000000..b6b20f1230 --- /dev/null +++ b/test_runner/regress/test_unlogged.py @@ -0,0 +1,34 @@ +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn + + +# +# Test UNLOGGED tables/relations. Postgres copies init fork contents to main +# fork to reset them during recovery. In Neon, pageserver directly sends init +# fork contents as main fork during basebackup. 
+# +def test_unlogged(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_unlogged", "empty") + pg = env.postgres.create_start("test_unlogged") + + conn = pg.connect() + cur = conn.cursor() + + cur.execute("CREATE UNLOGGED TABLE iut (id int);") + # create index to test unlogged index relation as well + cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);") + cur.execute("INSERT INTO iut values (42);") + + # create another compute to fetch inital empty contents from pageserver + fork_at_current_lsn(env, pg, "test_unlogged_basebackup", "test_unlogged") + pg2 = env.postgres.create_start( + "test_unlogged_basebackup", + ) + + conn2 = pg2.connect() + cur2 = conn2.cursor() + # after restart table should be empty but valid + cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") + cur2.execute("EXECUTE iut_plan (43);") + cur2.execute("SELECT * FROM iut") + assert cur2.fetchall() == [(43,)] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 468d3c0824..5fb2e0bba0 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 468d3c08245906f083fed1009759f9f953f5915d +Subproject commit 5fb2e0bba06cc018ee2506f337c91751ab695454 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9a2093383a..919851e781 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9a2093383ae19906f025b008ceecf89ebc9ea869 +Subproject commit 919851e7811fcb2ecfc67f35bfd63a35639c73b5 From 000eb1b069ce5f17b8711130488f1fe8bd206749 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 27 Feb 2023 12:44:08 +0000 Subject: [PATCH 089/426] Bump tempfile from 3.3.0 to 3.4.0 (#3709) Update `tempfile` crate to get rid of `remove_dir_all` dependency Ref https://github.com/neondatabase/neon/security/dependabot/15 --- Cargo.lock | 18 ++++-------------- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c97e9a196a..6b23144182 
100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3067,15 +3067,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi", -] - [[package]] name = "reqwest" version = "0.11.14" @@ -3849,16 +3840,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" dependencies = [ "cfg-if", "fastrand", - "libc", "redox_syscall", - "remove_dir_all", - "winapi", + "rustix", + "windows-sys 0.42.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 99a3f56026..ea22b04124 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -150,7 +150,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } criterion = "0.4" rcgen = "0.10" rstest = "0.16" -tempfile = "3.2" +tempfile = "3.4" tonic-build = "0.8" # This is only needed for proxy's tests. From 1360361f602f4e253257033e21ccdbfd8bcc542e Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 28 Feb 2023 21:11:00 -0800 Subject: [PATCH 090/426] Fix missing VM cgconfig.conf (#3718) It was being added to the wrong stage in the dockerfile. This should fix it, and resolves an ongoing issue on staging. 
--- Dockerfile.vm-compute-node | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node index af3bfb3590..9929c9b675 100644 --- a/Dockerfile.vm-compute-node +++ b/Dockerfile.vm-compute-node @@ -10,7 +10,6 @@ RUN set -e \ && rm -f /etc/inittab \ && touch /etc/inittab -ADD vm-cgconfig.conf /etc/cgconfig.conf RUN set -e \ && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab @@ -26,6 +25,7 @@ RUN apt update && \ RUN adduser vm-informant --disabled-password --no-create-home USER postgres +ADD vm-cgconfig.conf /etc/cgconfig.conf COPY --from=informant /etc/inittab /etc/inittab COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant From d19c5248c960baffcd5ae096e8f18bbd5dfd45e0 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Wed, 1 Mar 2023 18:09:08 +0200 Subject: [PATCH 091/426] Add UUID header to mgmt API (#3708) ## Describe your changes ## Issue ticket number and link #3479 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- Cargo.lock | 3 +- libs/utils/Cargo.toml | 3 +- libs/utils/src/http/endpoint.rs | 116 ++++++++++++++++++++++++++++++-- workspace_hack/Cargo.toml | 1 - 4 files changed, 116 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b23144182..02b03e02fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4505,6 +4505,7 @@ dependencies = [ "byteorder", "bytes", "criterion", + "futures", "git-version", "heapless", "hex", @@ -4534,6 +4535,7 @@ dependencies = [ "tracing", "tracing-subscriber", "url", + "uuid", "workspace_hack", ] @@ -4840,7 +4842,6 @@ dependencies = [ "either", "fail", "futures", - "futures-channel", "futures-executor", "futures-util", "hashbrown 0.12.3", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 92e805ac58..6acdb6fa53 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -13,6 +13,7 @@ bincode.workspace = true bytes.workspace = true heapless.workspace = true hyper = { workspace = true, features = ["full"] } +futures = { workspace = true} routerify.workspace = true serde.workspace = true serde_json.workspace = true @@ -39,7 +40,7 @@ pq_proto.workspace = true workspace_hack.workspace = true url.workspace = true - +uuid = { version = "1.2", features = ["v4", "serde"] } [dev-dependencies] byteorder.workspace = true bytes.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 9c300de7a7..41975f6944 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -8,8 +8,7 @@ use hyper::{Method, StatusCode}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; -use routerify::RequestInfo; -use routerify::{Middleware, Router, RouterBuilder, RouterService}; +use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService}; use tokio::task::JoinError; use tracing; @@ -27,14 +26,35 @@ static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a 
metric") }); +static X_REQUEST_ID_HEADER_STR: &str = "x-request-id"; + +static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HEADER_STR); +#[derive(Debug, Default, Clone)] +struct RequestId(String); + async fn logger(res: Response, info: RequestInfo) -> Result, ApiError> { + let request_id = info.context::().unwrap_or_default().0; + // cannot factor out the Level to avoid the repetition // because tracing can only work with const Level // which is not the case here + if info.method() == Method::GET && res.status() == StatusCode::OK { - tracing::debug!("{} {} {}", info.method(), info.uri().path(), res.status()); + tracing::debug!( + "{} {} {} {}", + info.method(), + info.uri().path(), + request_id, + res.status() + ); } else { - tracing::info!("{} {} {}", info.method(), info.uri().path(), res.status()); + tracing::info!( + "{} {} {} {}", + info.method(), + info.uri().path(), + request_id, + res.status() + ); } Ok(res) } @@ -63,9 +83,52 @@ async fn prometheus_metrics_handler(_req: Request) -> Result( +) -> Middleware { + Middleware::pre(move |req| async move { + let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) { + Some(request_id) => request_id + .to_str() + .expect("extract request id value") + .to_owned(), + None => { + let request_id = uuid::Uuid::new_v4(); + request_id.to_string() + } + }; + + if req.method() == Method::GET { + tracing::debug!("{} {} {}", req.method(), req.uri().path(), request_id); + } else { + tracing::info!("{} {} {}", req.method(), req.uri().path(), request_id); + } + req.set_context(RequestId(request_id)); + + Ok(req) + }) +} + +async fn add_request_id_header_to_response( + mut res: Response, + req_info: RequestInfo, +) -> Result, ApiError> { + if let Some(request_id) = req_info.context::() { + if let Ok(request_header_value) = HeaderValue::from_str(&request_id.0) { + res.headers_mut() + .insert(&X_REQUEST_ID_HEADER, request_header_value); + }; + }; + + Ok(res) +} + pub fn make_router() -> RouterBuilder 
{ Router::builder() + .middleware(add_request_id_middleware()) .middleware(Middleware::post_with_info(logger)) + .middleware(Middleware::post_with_info( + add_request_id_header_to_response, + )) .get("/metrics", prometheus_metrics_handler) .err_handler(error::handler) } @@ -231,3 +294,48 @@ where Ok(()) } +#[cfg(test)] +mod tests { + use super::*; + use futures::future::poll_fn; + use hyper::service::Service; + use routerify::RequestServiceBuilder; + use std::net::{IpAddr, SocketAddr}; + + #[tokio::test] + async fn test_request_id_returned() { + let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); + let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); + let mut service = builder.build(remote_addr); + if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { + panic!("request service is not ready: {:?}", e); + } + + let mut req: Request = Request::default(); + req.headers_mut() + .append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap()); + + let resp: Response = service.call(req).await.unwrap(); + + let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap(); + + assert!(header_val == "42", "response header mismatch"); + } + + #[tokio::test] + async fn test_request_id_empty() { + let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap(); + let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); + let mut service = builder.build(remote_addr); + if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { + panic!("request service is not ready: {:?}", e); + } + + let req: Request = Request::default(); + let resp: Response = service.call(req).await.unwrap(); + + let header_val = resp.headers().get(&X_REQUEST_ID_HEADER); + + assert_ne!(header_val, None, "response header should NOT be empty"); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index c0cf3c5611..bd21095fff 100644 --- a/workspace_hack/Cargo.toml +++ 
b/workspace_hack/Cargo.toml @@ -21,7 +21,6 @@ crossbeam-utils = { version = "0.8" } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures = { version = "0.3" } -futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } hashbrown = { version = "0.12", features = ["raw"] } From 8dae87999465898e3e3c39d38358b2f562282c81 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Wed, 1 Mar 2023 23:58:43 -0800 Subject: [PATCH 092/426] Disable VM cgroup shenanigans (#3730) As discussed - temporary, so it can unblock releasing autoscaling. Cleaner to fully remove, then add back rather than commenting it out. --- Dockerfile.vm-compute-node | 11 ++--------- vm-cgconfig.conf | 12 ------------ 2 files changed, 2 insertions(+), 21 deletions(-) delete mode 100644 vm-cgconfig.conf diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node index 9929c9b675..dff40485de 100644 --- a/Dockerfile.vm-compute-node +++ b/Dockerfile.vm-compute-node @@ -11,22 +11,15 @@ RUN set -e \ && touch /etc/inittab RUN set -e \ - && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ - && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab + && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart'" >> /etc/inittab # Combine, starting from non-VM compute node image. 
FROM $SRC_IMAGE as base -# Temporarily set user back to root so we can run apt update and adduser +# Temporarily set user back to root so we can run adduser USER root -RUN apt update && \ - apt install --no-install-recommends -y \ - cgroup-tools RUN adduser vm-informant --disabled-password --no-create-home USER postgres -ADD vm-cgconfig.conf /etc/cgconfig.conf COPY --from=informant /etc/inittab /etc/inittab COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant - -ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"] diff --git a/vm-cgconfig.conf b/vm-cgconfig.conf deleted file mode 100644 index a2e201708e..0000000000 --- a/vm-cgconfig.conf +++ /dev/null @@ -1,12 +0,0 @@ -# Configuration for cgroups in VM compute nodes -group neon-postgres { - perm { - admin { - uid = vm-informant; - } - task { - gid = users; - } - } - memory {} -} From a60f687ce2b9022f10943b537daf05b09a9d48fc Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 2 Mar 2023 11:35:25 +0100 Subject: [PATCH 093/426] Compile `rum` extension --- Dockerfile.compute-node | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 68278ed07b..4abc8e9cc5 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -225,6 +225,21 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control +######################################################################################### +# +# Layer "rum-pg-build" +# compile rum extension +# +######################################################################################### +FROM build-deps AS rum-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget 
https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ + mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control + ######################################################################################### # # Layer "rust extensions" @@ -304,6 +319,7 @@ COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 5e514b8465fa4b470946945ac8887f5ff7803130 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 2 Mar 2023 14:33:29 +0100 Subject: [PATCH 094/426] Compile pgTAP extension --- Dockerfile.compute-node | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 4abc8e9cc5..1a2cd9fb77 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -240,6 +240,21 @@ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O r make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control +######################################################################################### +# +# Layer "pgtap-pg-build" +# compile pgTAP extension +# +######################################################################################### +FROM build-deps AS pgtap-pg-build +COPY --from=pg-build /usr/local/pgsql/ 
/usr/local/pgsql/ + +RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ + mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control + ######################################################################################### # # Layer "rust extensions" @@ -320,6 +335,7 @@ COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From b23742e09c24b77217c9556699fcdd909332554e Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 3 Mar 2023 14:01:05 +0300 Subject: [PATCH 095/426] Create `/v1/debug_dump` safekeepers endpoint (#3710) Add HTTP endpoint to get full safekeeper state of all existing timelines (all in-memory values and info about all files stored on disk). 
Example: https://gist.github.com/petuhovskiy/3cbb8f870401e9f486731d145161c286 --- Cargo.lock | 1 + libs/utils/src/http/json.rs | 13 +- safekeeper/Cargo.toml | 1 + safekeeper/src/debug_dump.rs | 264 +++++++++++++++++++++++ safekeeper/src/http/routes.rs | 79 ++++++- safekeeper/src/lib.rs | 1 + safekeeper/src/safekeeper.rs | 3 +- safekeeper/src/timeline.rs | 95 +++++--- safekeeper/src/timelines_global_map.rs | 10 + safekeeper/src/wal_storage.rs | 10 + test_runner/fixtures/neon_fixtures.py | 7 + test_runner/regress/test_wal_acceptor.py | 32 +++ 12 files changed, 468 insertions(+), 48 deletions(-) create mode 100644 safekeeper/src/debug_dump.rs diff --git a/Cargo.lock b/Cargo.lock index 02b03e02fb..fe5aae6ae8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3307,6 +3307,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", + "chrono", "clap 4.1.4", "const_format", "crc32c", diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 8981fdd1dd..40e61e3d0c 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -1,7 +1,9 @@ +use std::fmt::Display; + use anyhow::Context; use bytes::Buf; use hyper::{header, Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Serialize, Serializer}; use super::error::ApiError; @@ -31,3 +33,12 @@ pub fn json_response( .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } + +/// Serialize through Display trait. 
+pub fn display_serialize(z: &F, s: S) -> Result +where + S: Serializer, + F: Display, +{ + s.serialize_str(&format!("{}", z)) +} diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 4ee8d82203..2424509477 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -10,6 +10,7 @@ anyhow.workspace = true async-trait.workspace = true byteorder.workspace = true bytes.workspace = true +chrono.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs new file mode 100644 index 0000000000..674cf9f6eb --- /dev/null +++ b/safekeeper/src/debug_dump.rs @@ -0,0 +1,264 @@ +//! Utils for dumping full state of the safekeeper. + +use std::fs; +use std::fs::DirEntry; +use std::io::BufReader; +use std::io::Read; +use std::path::PathBuf; + +use anyhow::Result; +use chrono::{DateTime, Utc}; +use postgres_ffi::XLogSegNo; +use serde::Serialize; + +use utils::http::json::display_serialize; +use utils::id::NodeId; +use utils::id::TenantTimelineId; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use crate::safekeeper::SafeKeeperState; +use crate::safekeeper::SafekeeperMemState; +use crate::safekeeper::TermHistory; +use crate::SafeKeeperConf; + +use crate::timeline::ReplicaState; +use crate::GlobalTimelines; + +/// Various filters that influence the resulting JSON output. +#[derive(Debug, Serialize)] +pub struct Args { + /// Dump all available safekeeper state. False by default. + pub dump_all: bool, + + /// Dump control_file content. Uses value of `dump_all` by default. + pub dump_control_file: bool, + + /// Dump in-memory state. Uses value of `dump_all` by default. + pub dump_memory: bool, + + /// Dump all disk files in a timeline directory. Uses value of `dump_all` by default. + pub dump_disk_content: bool, + + /// Dump full term history. True by default. 
+ pub dump_term_history: bool, + + /// Filter timelines by tenant_id. + pub tenant_id: Option, + + /// Filter timelines by timeline_id. + pub timeline_id: Option, +} + +/// Response for debug dump request. +#[derive(Debug, Serialize)] +pub struct Response { + pub start_time: DateTime, + pub finish_time: DateTime, + pub timelines: Vec, + pub timelines_count: usize, + pub config: Config, +} + +/// Safekeeper configuration. +#[derive(Debug, Serialize)] +pub struct Config { + pub id: NodeId, + pub workdir: PathBuf, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub no_sync: bool, + pub max_offloader_lag_bytes: u64, + pub wal_backup_enabled: bool, +} + +#[derive(Debug, Serialize)] +pub struct Timeline { + #[serde(serialize_with = "display_serialize")] + pub tenant_id: TenantId, + #[serde(serialize_with = "display_serialize")] + pub timeline_id: TimelineId, + pub control_file: Option, + pub memory: Option, + pub disk_content: Option, +} + +#[derive(Debug, Serialize)] +pub struct Memory { + pub is_cancelled: bool, + pub peers_info_len: usize, + pub replicas: Vec>, + pub wal_backup_active: bool, + pub active: bool, + pub num_computes: u32, + pub last_removed_segno: XLogSegNo, + pub epoch_start_lsn: Lsn, + pub mem_state: SafekeeperMemState, + + // PhysicalStorage state. + pub write_lsn: Lsn, + pub write_record_lsn: Lsn, + pub flush_lsn: Lsn, + pub file_open: bool, +} + +#[derive(Debug, Serialize)] +pub struct DiskContent { + pub files: Vec, +} + +#[derive(Debug, Serialize)] +pub struct FileInfo { + pub name: String, + pub size: u64, + pub created: DateTime, + pub modified: DateTime, + pub start_zeroes: u64, + pub end_zeroes: u64, + // TODO: add sha256 checksum +} + +/// Build debug dump response, using the provided [`Args`] filters. 
+pub fn build(args: Args) -> Result { + let start_time = Utc::now(); + let timelines_count = GlobalTimelines::timelines_count(); + + let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() { + // If both tenant_id and timeline_id are specified, we can just get the + // timeline directly, without taking a snapshot of the whole list. + let ttid = TenantTimelineId::new(args.tenant_id.unwrap(), args.timeline_id.unwrap()); + if let Ok(tli) = GlobalTimelines::get(ttid) { + vec![tli] + } else { + vec![] + } + } else { + // Otherwise, take a snapshot of the whole list. + GlobalTimelines::get_all() + }; + + // TODO: return Stream instead of Vec + let mut timelines = Vec::new(); + for tli in ptrs_snapshot { + let ttid = tli.ttid; + if let Some(tenant_id) = args.tenant_id { + if tenant_id != ttid.tenant_id { + continue; + } + } + if let Some(timeline_id) = args.timeline_id { + if timeline_id != ttid.timeline_id { + continue; + } + } + + let control_file = if args.dump_control_file { + let mut state = tli.get_state().1; + if !args.dump_term_history { + state.acceptor_state.term_history = TermHistory(vec![]); + } + Some(state) + } else { + None + }; + + let memory = if args.dump_memory { + Some(tli.memory_dump()) + } else { + None + }; + + let disk_content = if args.dump_disk_content { + // build_disk_content can fail, but we don't want to fail the whole + // request because of that. + build_disk_content(&tli.timeline_dir).ok() + } else { + None + }; + + let timeline = Timeline { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + control_file, + memory, + disk_content, + }; + timelines.push(timeline); + } + + let config = GlobalTimelines::get_global_config(); + + Ok(Response { + start_time, + finish_time: Utc::now(), + timelines, + timelines_count, + config: build_config(config), + }) +} + +/// Builds DiskContent from a directory path. It can fail if the directory +/// is deleted between the time we get the path and the time we try to open it. 
+fn build_disk_content(path: &std::path::Path) -> Result { + let mut files = Vec::new(); + for entry in fs::read_dir(path)? { + if entry.is_err() { + continue; + } + let file = build_file_info(entry?); + if file.is_err() { + continue; + } + files.push(file?); + } + + Ok(DiskContent { files }) +} + +/// Builds FileInfo from DirEntry. Sometimes it can return an error +/// if the file is deleted between the time we get the DirEntry +/// and the time we try to open it. +fn build_file_info(entry: DirEntry) -> Result { + let metadata = entry.metadata()?; + let path = entry.path(); + let name = path + .file_name() + .and_then(|x| x.to_str()) + .unwrap_or("") + .to_owned(); + let mut file = fs::File::open(path)?; + let mut reader = BufReader::new(&mut file).bytes().filter_map(|x| x.ok()); + + let start_zeroes = reader.by_ref().take_while(|&x| x == 0).count() as u64; + let mut end_zeroes = 0; + for b in reader { + if b == 0 { + end_zeroes += 1; + } else { + end_zeroes = 0; + } + } + + Ok(FileInfo { + name, + size: metadata.len(), + created: DateTime::from(metadata.created()?), + modified: DateTime::from(metadata.modified()?), + start_zeroes, + end_zeroes, + }) +} + +/// Converts SafeKeeperConf to Config, filtering out the fields that are not +/// supposed to be exposed. 
+fn build_config(config: SafeKeeperConf) -> Config { + Config { + id: config.my_id, + workdir: config.workdir, + listen_pg_addr: config.listen_pg_addr, + listen_http_addr: config.listen_http_addr, + no_sync: config.no_sync, + max_offloader_lag_bytes: config.max_offloader_lag_bytes, + wal_backup_enabled: config.wal_backup_enabled, + } +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a917d61678..ced9599b36 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -5,14 +5,16 @@ use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::SkTimelineInfo; use serde::Serialize; -use serde::Serializer; use std::collections::{HashMap, HashSet}; -use std::fmt::Display; +use std::fmt; +use std::str::FromStr; use std::sync::Arc; use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::task::JoinError; +use utils::http::json::display_serialize; +use crate::debug_dump; use crate::safekeeper::ServerInfo; use crate::safekeeper::Term; @@ -54,15 +56,6 @@ fn get_conf(request: &Request) -> &SafeKeeperConf { .as_ref() } -/// Serialize through Display trait. -fn display_serialize(z: &F, s: S) -> Result -where - S: Serializer, - F: Display, -{ - s.serialize_str(&format!("{}", z)) -} - /// Same as TermSwitchEntry, but serializes LSN using display serializer /// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. #[derive(Debug, Serialize)] @@ -276,6 +269,69 @@ async fn record_safekeeper_info(mut request: Request) -> Result>(k: &str, v: &str) -> Result { + v.parse() + .map_err(|e| ApiError::BadRequest(anyhow::anyhow!("cannot parse {k}: {e}"))) +} + +/// Dump debug info about all available safekeeper state. 
+async fn dump_debug_handler(mut request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + ensure_no_body(&mut request).await?; + + let mut dump_all: Option = None; + let mut dump_control_file: Option = None; + let mut dump_memory: Option = None; + let mut dump_disk_content: Option = None; + let mut dump_term_history: Option = None; + let mut tenant_id: Option = None; + let mut timeline_id: Option = None; + + let query = request.uri().query().unwrap_or(""); + let mut values = url::form_urlencoded::parse(query.as_bytes()); + + for (k, v) in &mut values { + match k.as_ref() { + "dump_all" => dump_all = Some(parse_kv_str(&k, &v)?), + "dump_control_file" => dump_control_file = Some(parse_kv_str(&k, &v)?), + "dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?), + "dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?), + "dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?), + "tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?), + "timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?), + _ => Err(ApiError::BadRequest(anyhow::anyhow!( + "Unknown query parameter: {}", + k + )))?, + } + } + + let dump_all = dump_all.unwrap_or(false); + let dump_control_file = dump_control_file.unwrap_or(dump_all); + let dump_memory = dump_memory.unwrap_or(dump_all); + let dump_disk_content = dump_disk_content.unwrap_or(dump_all); + let dump_term_history = dump_term_history.unwrap_or(true); + + let args = debug_dump::Args { + dump_all, + dump_control_file, + dump_memory, + dump_disk_content, + dump_term_history, + tenant_id, + timeline_id, + }; + + let resp = tokio::task::spawn_blocking(move || { + debug_dump::build(args).map_err(ApiError::InternalServerError) + }) + .await + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + + // TODO: use streaming response + json_response(StatusCode::OK, resp) +} + /// Safekeeper http router. 
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); @@ -316,6 +372,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/record_safekeeper_info/:tenant_id/:timeline_id", record_safekeeper_info, ) + .get("/v1/debug_dump", dump_debug_handler) } #[cfg(test)] diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 891d73533f..6ab108ceb0 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -10,6 +10,7 @@ mod auth; pub mod broker; pub mod control_file; pub mod control_file_upgrade; +pub mod debug_dump; pub mod handler; pub mod http; pub mod json_ctrl; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fa973a3ede..c37411d667 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -204,7 +204,7 @@ pub struct SafeKeeperState { pub peers: PersistedPeers, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize)] // In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values // are not flushed yet. 
pub struct SafekeeperMemState { @@ -212,6 +212,7 @@ pub struct SafekeeperMemState { pub backup_lsn: Lsn, pub peer_horizon_lsn: Lsn, pub remote_consistent_lsn: Lsn, + #[serde(with = "hex")] pub proposer_uuid: PgUuid, } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 43c395574f..7479741774 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -5,6 +5,7 @@ use anyhow::{bail, Result}; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; use pq_proto::ReplicationFeedback; +use serde::Serialize; use std::cmp::{max, min}; use std::path::PathBuf; use tokio::{ @@ -28,9 +29,9 @@ use crate::send_wal::HotStandbyFeedback; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; -use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; +use crate::{debug_dump, wal_storage}; /// Things safekeeper should know about timeline state on peers. #[derive(Debug, Clone)] @@ -80,7 +81,7 @@ impl PeersInfo { } /// Replica status update + hot standby feedback -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, Serialize)] pub struct ReplicaState { /// last known lsn received by replica pub last_received_lsn: Lsn, // None means we don't know @@ -381,7 +382,7 @@ pub struct Timeline { cancellation_rx: watch::Receiver, /// Directory where timeline state is stored. - timeline_dir: PathBuf, + pub timeline_dir: PathBuf, } impl Timeline { @@ -588,38 +589,6 @@ impl Timeline { self.write_shared_state().wal_backup_attend() } - /// Returns full timeline info, required for the metrics. If the timeline is - /// not active, returns None instead. 
- pub fn info_for_metrics(&self) -> Option { - if self.is_cancelled() { - return None; - } - - let state = self.write_shared_state(); - if state.active { - Some(FullTimelineInfo { - ttid: self.ttid, - replicas: state - .replicas - .iter() - .filter_map(|r| r.as_ref()) - .copied() - .collect(), - wal_backup_active: state.wal_backup_active, - timeline_is_active: state.active, - num_computes: state.num_computes, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, - mem_state: state.sk.inmem.clone(), - persisted_state: state.sk.state.clone(), - flush_lsn: state.sk.wal_store.flush_lsn(), - wal_storage: state.sk.wal_store.get_metrics(), - }) - } else { - None - } - } - /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() @@ -784,6 +753,62 @@ impl Timeline { shared_state.last_removed_segno = horizon_segno; Ok(()) } + + /// Returns full timeline info, required for the metrics. If the timeline is + /// not active, returns None instead. + pub fn info_for_metrics(&self) -> Option { + if self.is_cancelled() { + return None; + } + + let state = self.write_shared_state(); + if state.active { + Some(FullTimelineInfo { + ttid: self.ttid, + replicas: state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: state.wal_backup_active, + timeline_is_active: state.active, + num_computes: state.num_computes, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), + }) + } else { + None + } + } + + /// Returns in-memory timeline state to build a full debug dump. 
+ pub fn memory_dump(&self) -> debug_dump::Memory { + let state = self.write_shared_state(); + + let (write_lsn, write_record_lsn, flush_lsn, file_open) = + state.sk.wal_store.internal_state(); + + debug_dump::Memory { + is_cancelled: self.is_cancelled(), + peers_info_len: state.peers_info.0.len(), + replicas: state.replicas.clone(), + wal_backup_active: state.wal_backup_active, + active: state.active, + num_computes: state.num_computes, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.inmem.clone(), + write_lsn, + write_record_lsn, + flush_lsn, + file_open, + } + } } /// Deletes directory and it's contents. Returns false if directory does not exist. diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 66e0145042..baef17ffa8 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -159,6 +159,16 @@ impl GlobalTimelines { Ok(()) } + /// Get the number of timelines in the map. + pub fn timelines_count() -> usize { + TIMELINES_STATE.lock().unwrap().timelines.len() + } + + /// Get the global safekeeper config. + pub fn get_global_config() -> SafeKeeperConf { + TIMELINES_STATE.lock().unwrap().get_conf().clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. pub fn create( diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 561104bd27..ae02b3c7bc 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -165,6 +165,16 @@ impl PhysicalStorage { }) } + /// Get all known state of the storage. + pub fn internal_state(&self) -> (Lsn, Lsn, Lsn, bool) { + ( + self.write_lsn, + self.write_record_lsn, + self.flush_record_lsn, + self.file.is_some(), + ) + } + /// Call fdatasync if config requires so. 
fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c4b3d057f8..56b56b8578 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2988,6 +2988,13 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def debug_dump(self, params: Dict[str, str] = {}) -> Dict[str, Any]: + res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def timeline_create( self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn ): diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 9e3b0ec02f..0ac9127c6b 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -775,6 +775,9 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): if not auth_enabled: wa_http_cli = wa.http_client() wa_http_cli.check_status() + + wa_http_cli_debug = wa.http_client() + wa_http_cli_debug.check_status() else: wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) wa_http_cli.check_status() @@ -785,6 +788,10 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa_http_cli_noauth = wa.http_client() wa_http_cli_noauth.check_status() + # debug endpoint requires safekeeper scope + wa_http_cli_debug = wa.http_client(auth_token=env.auth_keys.generate_safekeeper_token()) + wa_http_cli_debug.check_status() + # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) epoch = tli_status.acceptor_epoch @@ -795,6 +802,12 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, 
auth_enabled: bool): with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"): cli.timeline_status(tenant_id, timeline_id) + # fetch debug_dump endpoint + debug_dump_0 = wa_http_cli_debug.debug_dump({"dump_all": "true"}) + log.info(f"debug_dump before reboot {debug_dump_0}") + assert debug_dump_0["timelines_count"] == 1 + assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id) + pg.safe_psql("create table t(i int)") # ensure epoch goes up after reboot @@ -808,6 +821,25 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # and timeline_start_lsn stays the same assert tli_status.timeline_start_lsn == timeline_start_lsn + # fetch debug_dump after reboot + debug_dump_1 = wa_http_cli_debug.debug_dump({"dump_all": "true"}) + log.info(f"debug_dump after reboot {debug_dump_1}") + assert debug_dump_1["timelines_count"] == 1 + assert debug_dump_1["timelines"][0]["timeline_id"] == str(timeline_id) + + # check that commit_lsn and flush_lsn not decreased + assert ( + debug_dump_1["timelines"][0]["memory"]["mem_state"]["commit_lsn"] + >= debug_dump_0["timelines"][0]["memory"]["mem_state"]["commit_lsn"] + ) + assert ( + debug_dump_1["timelines"][0]["memory"]["flush_lsn"] + >= debug_dump_0["timelines"][0]["memory"]["flush_lsn"] + ) + + # check .config in response + assert debug_dump_1["config"]["id"] == env.safekeepers[0].id + class SafekeeperEnv: def __init__( From 764d27f6960c4eb48335c5576f601dbb1524a59f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 1 Mar 2023 18:42:14 +0100 Subject: [PATCH 096/426] fix checkpoint_timeout serialization in TenantConf Without this change, when actually setting this conf opt, the tenant would become Broken next time we load it. Why? The serde_toml representation that persist_tenant_conf would write out would be a TOML inline table of `secs` and `nsecs`. But our hand-rolled TenantConf parser expects a TOML string. 
I checked that all other `Duration` values in TenantConfOpt use the humantime serialization. Issues like this would likely be systematically prevented by https://github.com/neondatabase/neon/issues/3682 --- pageserver/src/tenant/config.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index fca08dd51a..1a52b26ae7 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -103,6 +103,7 @@ pub struct TenantConfOpt { pub checkpoint_distance: Option, #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] #[serde(default)] pub checkpoint_timeout: Option, From 68141a924da9442e4cf7a0bad2bb5153fe41d76f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 1 Mar 2023 12:09:02 +0100 Subject: [PATCH 097/426] eviction: remove needless if-let around resident size decrement The branch was always taken at runtime, so, this should not constitute a behavioral change. refs https://github.com/neondatabase/neon/issues/3722 --- pageserver/src/tenant/timeline.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d46ac26e7d..3926249572 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1047,11 +1047,12 @@ impl Timeline { return Ok(false); } - let layer_metadata = LayerFileMetadata::new( - local_layer - .file_size() - .expect("Local layer should have a file size"), - ); + let layer_file_size = local_layer + .file_size() + .expect("Local layer should have a file size"); + + let layer_metadata = LayerFileMetadata::new(layer_file_size); + let new_remote_layer = Arc::new(match local_layer.filename() { LayerFileName::Image(image_name) => RemoteLayer::new_img( self.tenant_id, @@ -1075,15 +1076,13 @@ impl Timeline { let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? 
{ Replacement::Replaced { .. } => { - let layer_size = local_layer.file_size(); - if let Err(e) = local_layer.delete() { error!("failed to remove layer file on evict after replacement: {e:#?}"); } - if let Some(layer_size) = layer_size { - self.metrics.resident_physical_size_gauge.sub(layer_size); - } + self.metrics + .resident_physical_size_gauge + .sub(layer_file_size); true } From 1b9b9d60d462afd893cf989ca6a6cc55039b7e15 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 1 Mar 2023 13:06:43 +0100 Subject: [PATCH 098/426] eviction: add comment explaining resident size decrement on error https://github.com/neondatabase/neon/issues/3722 --- pageserver/src/tenant/timeline.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3926249572..ca0cd04bd0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1079,7 +1079,16 @@ impl Timeline { if let Err(e) = local_layer.delete() { error!("failed to remove layer file on evict after replacement: {e:#?}"); } - + // Always decrement the physical size gauge, even if we failed to delete the file. + // Rationale: we already replaced the layer with a remote layer in the layer map, + // and any subsequent download_remote_layer will + // 1. overwrite the file on disk and + // 2. add the downloaded size to the resident size gauge. + // + // If there is no re-download, and we restart the pageserver, then load_layer_map + // will treat the file as a local layer again, count it towards resident size, + // and it'll be like the layer removal never happened. + // The bump in resident size is perhaps unexpected but overall a robust behavior. 
self.metrics .resident_physical_size_gauge .sub(layer_file_size); From 38022ff11c1307d36b83070612d705f07f5f3437 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 1 Mar 2023 17:54:59 +0100 Subject: [PATCH 099/426] gc: only decrement resident size if GC'd layer is resident Before this patch, GC would call PersistentLayer::delete() on every GC'ed layer. RemoteLayer::delete() returned Ok(()) unconditionally. GC would then proceed by decrementing the resident size metric, even though the layer is a RemoteLayer. This patch makes the following changes: - Rename PersistentLayer::delete() to delete_resident_layer_file(). That name is unambiguous. - Make RemoteLayer::delete_resident_layer_file return an Err(). We would have uncovered this bug if we had done that from the start. - Change GC / Timeline::delete_historic_layer to check whether the layer is remote or not, and only call delete_resident_layer_file() if it's not remote. This brings us in line with how eviction does it. - Add a regression test. fixes https://github.com/neondatabase/neon/issues/3722 --- pageserver/src/tenant/storage_layer.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 2 +- .../src/tenant/storage_layer/image_layer.rs | 2 +- .../src/tenant/storage_layer/remote_layer.rs | 4 +- pageserver/src/tenant/timeline.rs | 15 +- test_runner/fixtures/neon_fixtures.py | 16 +- test_runner/regress/test_layer_eviction.py | 166 ++++++++++++++++++ 7 files changed, 195 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9198cfd1df..52ce2cab42 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -364,7 +364,7 @@ pub trait PersistentLayer: Layer { } /// Permanently remove this layer from disk. 
- fn delete(&self) -> Result<()>; + fn delete_resident_layer_file(&self) -> Result<()>; fn downcast_remote_layer(self: Arc) -> Option> { None diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4d1e08322d..37719dfce5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -438,7 +438,7 @@ impl PersistentLayer for DeltaLayer { )) } - fn delete(&self) -> Result<()> { + fn delete_resident_layer_file(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; Ok(()) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index e48abd38dd..e37e001eda 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -252,7 +252,7 @@ impl PersistentLayer for ImageLayer { unimplemented!(); } - fn delete(&self) -> Result<()> { + fn delete_resident_layer_file(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; Ok(()) diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 8465a99339..dbce2e7888 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -155,8 +155,8 @@ impl PersistentLayer for RemoteLayer { bail!("cannot iterate a remote layer"); } - fn delete(&self) -> Result<()> { - Ok(()) + fn delete_resident_layer_file(&self) -> Result<()> { + bail!("remote layer has no layer file"); } fn downcast_remote_layer<'a>(self: Arc) -> Option> { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ca0cd04bd0..101b27bb97 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1076,7 +1076,7 @@ impl Timeline { let replaced = match batch_updates.replace_historic(local_layer, 
new_remote_layer)? { Replacement::Replaced { .. } => { - if let Err(e) = local_layer.delete() { + if let Err(e) = local_layer.delete_resident_layer_file() { error!("failed to remove layer file on evict after replacement: {e:#?}"); } // Always decrement the physical size gauge, even if we failed to delete the file. @@ -1950,11 +1950,14 @@ impl Timeline { layer: Arc, updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, ) -> anyhow::Result<()> { - let layer_size = layer.file_size(); - - layer.delete()?; - if let Some(layer_size) = layer_size { - self.metrics.resident_physical_size_gauge.sub(layer_size); + if !layer.is_remote_layer() { + layer.delete_resident_layer_file()?; + let layer_file_size = layer + .file_size() + .expect("Local layer should have a file size"); + self.metrics + .resident_physical_size_gauge + .sub(layer_file_size); } // TODO Removing from the bottom of the layer map is expensive. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 56b56b8578..49218f3c98 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import tempfile import textwrap import time import uuid +from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass, field from enum import Flag, auto @@ -1516,6 +1517,11 @@ class PageserverHttpClient(requests.Session): assert res.status_code == 200 + def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): + info = self.layer_map_info(tenant_id, timeline_id) + for layer in info.historic_layers: + self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + @dataclass class TenantConfig: @@ -1551,6 +1557,14 @@ class LayerMapInfo: return info + def kind_count(self) -> Dict[str, int]: + counts: Dict[str, int] = defaultdict(int) + for inmem_layer in self.in_memory_layers: + counts[inmem_layer.kind] += 1 + for hist_layer in self.historic_layers: + 
counts[hist_layer.kind] += 1 + return counts + @dataclass class InMemoryLayerInfo: @@ -1567,7 +1581,7 @@ class InMemoryLayerInfo: ) -@dataclass +@dataclass(frozen=True) class HistoricLayerInfo: kind: str layer_file_name: str diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index a03dd88c41..404bd67050 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -1,8 +1,13 @@ +import time + import pytest +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, + wait_for_last_flush_lsn, wait_for_last_record_lsn, + wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -138,3 +143,164 @@ def test_basic_eviction( assert ( redownloaded_layer_map_info == initial_layer_map_info ), "Should have the same layer map after redownloading the evicted layers" + + +def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): + + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_gc_of_remote_layers", + ) + + env = neon_env_builder.init_start() + + tenant_config = { + "pitr_interval": "1s", # set to non-zero, so GC actually does something + "gc_period": "0s", # we want to control when GC runs + "compaction_period": "0s", # we want to control when compaction runs + "checkpoint_timeout": "24h", # something we won't reach + "checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually + "compaction_threshold": "3", + # "image_creation_threshold": set at runtime + "compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers + } + + def tenant_update_config(changes): + tenant_config.update(changes) + env.neon_cli.config_tenant(tenant_id, tenant_config) + + tenant_id, timeline_id = 
env.neon_cli.create_tenant(conf=tenant_config) + log.info("tenant id is %s", tenant_id) + env.initial_tenant = tenant_id # update_and_gc relies on this + ps_http = env.pageserver.http_client() + + pg = env.postgres.create_start("main") + + log.info("fill with data, creating delta & image layers, some of which are GC'able after") + # no particular reason to create the layers like this, but we are sure + # not to hit the image_creation_threshold here. + with pg.cursor() as cur: + cur.execute("create table a (id bigserial primary key, some_value bigint not null)") + cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # Create delta layers, then turn them into image layers. + # Do it multiple times so that there's something to GC. + for k in range(0, 2): + # produce delta layers => disable image layer creation by setting high threshold + tenant_update_config({"image_creation_threshold": "100"}) + for i in range(0, 2): + for j in range(0, 3): + # create a minimal amount of "delta difficulty" for this table + with pg.cursor() as cur: + cur.execute("update a set some_value = -some_value + %s", (j,)) + + with pg.cursor() as cur: + # vacuuming should aid to reuse keys, though it's not really important + # with image_creation_threshold=1 which we will use on the last compaction + cur.execute("vacuum") + + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + if i == 1 and j == 2 and k == 1: + # last iteration; stop before checkpoint to avoid leaving an inmemory layer + pg.stop_and_destroy() + + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # images should not yet be created, because threshold is too high, + # but these will be reshuffled to L1 layers + ps_http.timeline_compact(tenant_id, timeline_id) + + for _ in range(0, 20): + # loop in case flushing is still in progress + layers = 
ps_http.layer_map_info(tenant_id, timeline_id) + if not layers.in_memory_layers: + break + time.sleep(0.2) + + # now that we've grown some delta layers, turn them into image layers + tenant_update_config({"image_creation_threshold": "1"}) + ps_http.timeline_compact(tenant_id, timeline_id) + + # wait for all uploads to finish + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) + + # shutdown safekeepers to avoid on-demand downloads from walreceiver + for sk in env.safekeepers: + sk.stop() + + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + log.info("ensure the code above produced image and delta layers") + pre_evict_info = ps_http.layer_map_info(tenant_id, timeline_id) + log.info("layer map dump: %s", pre_evict_info) + by_kind = pre_evict_info.kind_count() + log.info("by kind: %s", by_kind) + assert by_kind["Image"] > 0 + assert by_kind["Delta"] > 0 + assert by_kind["InMemory"] == 0 + resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*")) + log.info("resident layers count before eviction: %s", len(resident_layers)) + + log.info("evict all layers") + ps_http.evict_all_layers(tenant_id, timeline_id) + + def ensure_resident_and_remote_size_metrics(): + log.info("ensure that all the layers are gone") + resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*")) + # we have disabled all background loops, so, this should hold + assert len(resident_layers) == 0 + + info = ps_http.layer_map_info(tenant_id, timeline_id) + log.info("layer map dump: %s", info) + + log.info("ensure that resident_physical_size metric is zero") + resident_physical_size_metric = ps_http.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + assert resident_physical_size_metric == 0 + log.info("ensure that resident_physical_size metric corresponds to layer map dump") + assert resident_physical_size_metric == sum( + [layer.layer_file_size or 0 for layer in 
info.historic_layers if not layer.remote] + ) + + # TODO: the following would be nice to assert, but for some reason, the commented-out + # assert below fails with 113401856.0 != 140427264 + # => https://github.com/neondatabase/neon/issues/3738 + # + # log.info("ensure that remote_physical_size metric matches layer map") + # remote_physical_size_metric = ps_http.get_timeline_metric( + # tenant_id, timeline_id, "pageserver_remote_physical_size" + # ) + # log.info("ensure that remote_physical_size metric corresponds to layer map dump") + # assert remote_physical_size_metric == sum( + # [layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote] + # ) + + log.info("before runnning GC, ensure that remote_physical size is zero") + ensure_resident_and_remote_size_metrics() + + log.info("run GC") + time.sleep(2) # let pitr_interval + 1 second pass + ps_http.timeline_gc(tenant_id, timeline_id, 0) + time.sleep(1) + assert not env.pageserver.log_contains("Nothing to GC") + + log.info("ensure GC deleted some layers, otherwise this test is pointless") + post_gc_info = ps_http.layer_map_info(tenant_id, timeline_id) + log.info("layer map dump: %s", post_gc_info) + log.info("by kind: %s", post_gc_info.kind_count()) + pre_evict_layers = set([layer.layer_file_name for layer in pre_evict_info.historic_layers]) + post_gc_layers = set([layer.layer_file_name for layer in post_gc_info.historic_layers]) + assert post_gc_layers.issubset(pre_evict_layers) + assert len(post_gc_layers) < len(pre_evict_layers) + + log.info("update_gc_info might download some layers. 
Evict them again.") + ps_http.evict_all_layers(tenant_id, timeline_id) + + log.info("after running GC, ensure that resident size is still zero") + ensure_resident_and_remote_size_metrics() From 1b780fa752ee44af27c88a2bed352ee9554b5e01 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 2 Mar 2023 17:05:54 +0100 Subject: [PATCH 100/426] timeline_checkpoint_handler: add span with tenant and timeline id Before this patch, the logs written by freeze_and_flush() and compact() didn't have any span, which made the test logs annoying to read. --- pageserver/src/http/routes.rs | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d2d9f24efb..3cfc68c1eb 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -971,19 +971,22 @@ async fn timeline_checkpoint_handler(request: Request) -> Result Date: Fri, 3 Mar 2023 14:53:27 +0100 Subject: [PATCH 101/426] tests: use `parse_metrics` everywhere (#3737) - use parse_metrics() in all places where we parse Prometheus metrics - query_all: make `filter` argument optional - encourage using properly parsed, typed metrics by changing get_metrics() to return already-parsed metrics. The new get_metric_str() method, like in the Safekeeper type, returns the raw text response. 
--- test_runner/fixtures/benchmark_fixture.py | 12 +---- test_runner/fixtures/metrics.py | 3 +- test_runner/fixtures/neon_fixtures.py | 47 ++++++++++--------- test_runner/regress/test_build_info_metric.py | 2 +- test_runner/regress/test_gc_aggressive.py | 3 +- test_runner/regress/test_metric_collection.py | 3 +- test_runner/regress/test_ondemand_download.py | 12 ++++- test_runner/regress/test_tenant_detach.py | 5 +- test_runner/regress/test_tenant_tasks.py | 14 ++++-- test_runner/regress/test_tenants.py | 6 +-- test_runner/regress/test_timeline_size.py | 33 +++++++------ 11 files changed, 74 insertions(+), 66 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index b1489b7ab1..3428e6db8a 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -366,17 +366,9 @@ class NeonBenchmarker: def get_int_counter_value(self, pageserver: NeonPageserver, metric_name: str) -> int: """Fetch the value of given int counter from pageserver metrics.""" - # TODO: If we start to collect more of the prometheus metrics in the - # performance test suite like this, we should refactor this to load and - # parse all the metrics into a more convenient structure in one go. - # - # The metric should be an integer, as it's a number of bytes. But in general - # all prometheus metrics are floats. So to be pedantic, read it as a float - # and round to integer. 
all_metrics = pageserver.http_client().get_metrics() - matches = re.search(rf"^{metric_name} (\S+)$", all_metrics, re.MULTILINE) - assert matches, f"metric {metric_name} not found" - return int(round(float(matches.group(1)))) + sample = all_metrics.query_one(metric_name) + return int(round(sample.value)) def get_timeline_size( self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index ba0d325c39..450c02735a 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -13,7 +13,8 @@ class Metrics: self.metrics = defaultdict(list) self.name = name - def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]: + def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]: + filter = filter or {} res = [] for sample in self.metrics[name]: try: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 49218f3c98..94ee1d50f7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -29,7 +29,6 @@ import asyncpg import backoff # type: ignore import boto3 import jwt -import prometheus_client import psycopg2 import pytest import requests @@ -37,7 +36,7 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics +from fixtures.metrics import Metrics, parse_metrics from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -46,7 +45,6 @@ from fixtures.utils import ( get_self_dir, subprocess_capture, ) -from prometheus_client.parser import text_string_to_metric_families # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -1437,22 +1435,27 @@ class PageserverHttpClient(requests.Session): assert 
completed["successful_download_count"] > 0 return completed - def get_metrics(self) -> str: + def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) return res.text - def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str): - raw = self.get_metrics() - family: List[prometheus_client.Metric] = list(text_string_to_metric_families(raw)) - [metric] = [m for m in family if m.name == metric_name] - [sample] = [ - s - for s in metric.samples - if s.labels["tenant_id"] == str(tenant_id) - and s.labels["timeline_id"] == str(timeline_id) - ] - return sample.value + def get_metrics(self) -> Metrics: + res = self.get_metrics_str() + return parse_metrics(res) + + def get_timeline_metric( + self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str + ) -> float: + metrics = self.get_metrics() + return metrics.query_one( + metric_name, + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ).value def get_remote_timeline_client_metric( self, @@ -1462,7 +1465,7 @@ class PageserverHttpClient(requests.Session): file_kind: str, op_kind: str, ) -> Optional[float]: - metrics = parse_metrics(self.get_metrics(), "pageserver") + metrics = self.get_metrics() matches = metrics.query_all( name=metric_name, filter={ @@ -1481,14 +1484,16 @@ class PageserverHttpClient(requests.Session): assert len(matches) < 2, "above filter should uniquely identify metric" return value - def get_metric_value(self, name: str) -> Optional[str]: + def get_metric_value( + self, name: str, filter: Optional[Dict[str, str]] = None + ) -> Optional[float]: metrics = self.get_metrics() - relevant = [line for line in metrics.splitlines() if line.startswith(name)] - if len(relevant) == 0: + results = metrics.query_all(name, filter=filter) + if not results: log.info(f'could not find metric "{name}"') return None - assert len(relevant) 
== 1 - return relevant[0].lstrip(name).strip() + assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" + return results[0].value def layer_map_info( self, diff --git a/test_runner/regress/test_build_info_metric.py b/test_runner/regress/test_build_info_metric.py index b75b5bd775..c622d562fd 100644 --- a/test_runner/regress/test_build_info_metric.py +++ b/test_runner/regress/test_build_info_metric.py @@ -8,7 +8,7 @@ def test_build_info_metric(neon_env_builder: NeonEnvBuilder, link_proxy: NeonPro parsed_metrics = {} - parsed_metrics["pageserver"] = parse_metrics(env.pageserver.http_client().get_metrics()) + parsed_metrics["pageserver"] = parse_metrics(env.pageserver.http_client().get_metrics_str()) parsed_metrics["safekeeper"] = parse_metrics(env.safekeepers[0].http_client().get_metrics_str()) parsed_metrics["proxy"] = parse_metrics(link_proxy.get_metrics()) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 5f052bf81a..77438e1b64 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -4,7 +4,6 @@ import random import pytest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -134,7 +133,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: # Helper function that gets the number of given kind of remote ops from the metrics def get_num_remote_ops(file_kind: str, op_kind: str) -> int: - ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + ps_metrics = env.pageserver.http_client().get_metrics() total = 0.0 for sample in ps_metrics.query_all( name="pageserver_remote_operation_seconds_count", diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 3f252992f5..a33af9a3b2 100644 --- 
a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -9,7 +9,6 @@ from typing import Iterator import pytest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( PSQL, NeonEnvBuilder, @@ -143,7 +142,7 @@ def test_metric_collection( # Helper function that gets the number of given kind of remote ops from the metrics def get_num_remote_ops(file_kind: str, op_kind: str) -> int: - ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + ps_metrics = env.pageserver.http_client().get_metrics() total = 0.0 for sample in ps_metrics.query_all( name="pageserver_remote_operation_seconds_count", diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 5ee94de32d..e6c580c37c 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -11,6 +11,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PageserverApiException, + PageserverHttpClient, RemoteStorageKind, assert_tenant_status, available_remote_storages, @@ -25,9 +26,16 @@ from fixtures.types import Lsn from fixtures.utils import query_scalar -def get_num_downloaded_layers(client, tenant_id, timeline_id): +def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id): value = client.get_metric_value( - f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}' + "pageserver_remote_operation_seconds_count", + { + "file_kind": "layer", + "op_kind": "download", + "status": "success", + "tenant_id": tenant_id, + "timeline_id": timeline_id, + }, ) if value is None: return 0 diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 6c3454b79b..ac1f7b2891 100644 --- 
a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -6,7 +6,6 @@ from threading import Thread import asyncpg import pytest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -79,7 +78,7 @@ def test_tenant_reattach( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) - ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + ps_metrics = pageserver_http.get_metrics() tenant_metric_filter = { "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), @@ -93,7 +92,7 @@ def test_tenant_reattach( time.sleep(1) # for metrics propagation - ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + ps_metrics = pageserver_http.get_metrics() pageserver_last_record_lsn = int( ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value ) diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 4eba4ce942..24b211e368 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -50,16 +50,22 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): wait_until(10, 0.2, lambda: assert_active(tenant_id)) # Assert that all tasks finish quickly after tenant is detached - task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + task_starts = client.get_metric_value("pageserver_tenant_task_events_total", {"event": "start"}) assert task_starts is not None assert int(task_starts) > 0 client.tenant_detach(tenant) client.tenant_detach(env.initial_tenant) def assert_tasks_finish(): - tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}') - tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}') - tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}') + tasks_started 
= client.get_metric_value( + "pageserver_tenant_task_events_total", {"event": "start"} + ) + tasks_ended = client.get_metric_value( + "pageserver_tenant_task_events_total", {"event": "stop"} + ) + tasks_panicked = client.get_metric_value( + "pageserver_tenant_task_events_total", {"event": "panic"} + ) log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended assert tasks_panicked is None or int(tasks_panicked) == 0 diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index e56bb1b469..bf87cb3ad4 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -107,7 +107,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): assert cur.fetchone() == (5000050000,) collected_metrics = { - "pageserver": env.pageserver.http_client().get_metrics(), + "pageserver": env.pageserver.http_client().get_metrics_str(), } for sk in env.safekeepers: collected_metrics[f"safekeeper{sk.id}"] = sk.http_client().get_metrics_str() @@ -207,7 +207,7 @@ def test_pageserver_metrics_removed_after_detach( assert cur.fetchone() == (5000050000,) def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: - ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + ps_metrics = env.pageserver.http_client().get_metrics() samples = [] for metric_name in ps_metrics.metrics: for sample in ps_metrics.query_all( @@ -307,7 +307,7 @@ def test_pageserver_with_empty_tenants( time.sleep(1) # to allow metrics propagation - ps_metrics = parse_metrics(client.get_metrics(), "pageserver") + ps_metrics = client.get_metrics() broken_tenants_metric_filter = { "tenant_id": str(tenant_without_timelines_dir), "state": "broken", diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 3b41cc5c90..ca4f32fff9 100644 --- a/test_runner/regress/test_timeline_size.py +++ 
b/test_runner/regress/test_timeline_size.py @@ -1,7 +1,6 @@ import math import queue import random -import re import threading import time from contextlib import closing @@ -465,26 +464,26 @@ def test_timeline_size_metrics( # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() - matches = re.search( - f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', - metrics, - re.MULTILINE, - ) - assert matches - tl_physical_size_metric = int(matches.group(1)) + tl_physical_size_metric = metrics.query_one( + name="pageserver_resident_physical_size", + filter={ + "tenant_id": str(env.initial_tenant), + "timeline_id": str(new_timeline_id), + }, + ).value # assert that the physical size metric matches the actual physical size on disk timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id) assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) # Check that the logical size metric is sane, and matches - matches = re.search( - f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', - metrics, - re.MULTILINE, - ) - assert matches - tl_logical_size_metric = int(matches.group(1)) + tl_logical_size_metric = metrics.query_one( + name="pageserver_current_logical_size", + filter={ + "tenant_id": str(env.initial_tenant), + "timeline_id": str(new_timeline_id), + }, + ).value pgdatadir = test_output_dir / "pgdata-vanilla" pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) @@ -575,8 +574,8 @@ def get_physical_size_values( client = env.pageserver.http_client() - res.prometheus_resident_physical = client.get_timeline_metric( - tenant_id, timeline_id, "pageserver_resident_physical_size" + res.prometheus_resident_physical = int( + client.get_timeline_metric(tenant_id, timeline_id, "pageserver_resident_physical_size") ) detail = client.timeline_detail( From 
66a5159511ec80e032824d85b6eafca9f01b31a6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 2 Mar 2023 19:04:54 +0100 Subject: [PATCH 102/426] fix: compaction: no index upload scheduled if no on-demand downloads Commit 0cf7fd0fb82b082d02dfadd9d6a488a7f799d72f Compaction with on-demand download (#3598) introduced a subtle bug: if we don't have to do on-demand downloads, we only take one ROUND in fn compact() and exit early. Thereby, we miss scheduling the index part upload for any layers created by fn compact_inner(). Before that commit, we didn't have this problem. So, this patch fixes it. Since no regression test caught this, I went ahead and extended the timeline size tests to assert that, if remote storage is configured, 1. pageserver_remote_physical_size matches the other physical sizes 2. file sizes reported by the layer map info endpoint match the other physical size metrics Without the pageserver code fix, the regression test would fail at the physical size assertion, complaining that any of the resident physical size != remote physical size metric 50790400.0 != 18399232.0 I figured out what the problem is by comparing the remote storage and local directories like so, and noticed that the image layer in the local directory wasn't present on the remote side. Its size was exactly the difference 50790400.0 - 18399232.0 = 32391168.0 fixes https://github.com/neondatabase/neon/issues/3738 --- pageserver/src/tenant/timeline.rs | 4 +- test_runner/fixtures/neon_fixtures.py | 20 +++ test_runner/regress/test_layer_eviction.py | 20 ++- test_runner/regress/test_timeline_size.py | 134 ++++++++++++++++++--- 4 files changed, 145 insertions(+), 33 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 101b27bb97..8b24fd6ecd 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -662,8 +662,8 @@ impl Timeline { // update the index file on next flush iteration too. 
But it // could take a while until that happens. // - // Additionally, only do this on the terminal round before sleeping. - if last_round { + // Additionally, only do this once before we return from this function. + if last_round || res.is_ok() { if let Some(remote_client) = &self.remote_client { remote_client.schedule_index_upload_for_file_changes()?; } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 94ee1d50f7..ba98563693 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3542,3 +3542,23 @@ def wait_for_sk_commit_lsn_to_reach_remote_storage( ps_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(ps_http, tenant_id, timeline_id, lsn) return lsn + + +def wait_for_upload_queue_empty( + pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId +): + ps_http = pageserver.http_client() + while True: + all_metrics = ps_http.get_metrics() + tl = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_unfinished", + { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ) + assert len(tl) > 0 + log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}") + if all(m.value == 0 for m in tl): + return + time.sleep(0.2) diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 404bd67050..e7c9713f98 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -268,18 +268,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): [layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote] ) - # TODO: the following would be nice to assert, but for some reason, the commented-out - # assert below fails with 113401856.0 != 140427264 - # => https://github.com/neondatabase/neon/issues/3738 - # - # log.info("ensure that remote_physical_size metric matches layer map") - # remote_physical_size_metric = 
ps_http.get_timeline_metric( - # tenant_id, timeline_id, "pageserver_remote_physical_size" - # ) - # log.info("ensure that remote_physical_size metric corresponds to layer map dump") - # assert remote_physical_size_metric == sum( - # [layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote] - # ) + log.info("ensure that remote_physical_size metric matches layer map") + remote_physical_size_metric = ps_http.get_timeline_metric( + tenant_id, timeline_id, "pageserver_remote_physical_size" + ) + log.info("ensure that remote_physical_size metric corresponds to layer map dump") + assert remote_physical_size_metric == sum( + layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote + ) log.info("before runnning GC, ensure that remote_physical size is zero") ensure_resident_and_remote_size_metrics() diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index ca4f32fff9..ea4b65c9a8 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -5,6 +5,7 @@ import threading import time from contextlib import closing from pathlib import Path +from typing import Optional import psycopg2.errors import psycopg2.extras @@ -18,9 +19,11 @@ from fixtures.neon_fixtures import ( PgBin, PortDistributor, Postgres, + RemoteStorageKind, VanillaPostgres, assert_tenant_status, wait_for_last_flush_lsn, + wait_for_upload_queue_empty, wait_until, ) from fixtures.types import TenantId, TimelineId @@ -301,8 +304,18 @@ def test_timeline_initial_logical_size_calculation_cancellation( # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists" -def test_timeline_physical_size_init(neon_simple_env: NeonEnv): - env = neon_simple_env +@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) +def test_timeline_physical_size_init( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] 
+): + + if remote_storage_kind is not None: + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_timeline_physical_size_init" + ) + + env = neon_env_builder.init_start() + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") pg = env.postgres.create_start("test_timeline_physical_size_init") @@ -330,12 +343,22 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): ) assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id) + get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), + remote_storage_kind, ) -def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): - env = neon_simple_env +@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) +def test_timeline_physical_size_post_checkpoint( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] +): + if remote_storage_kind is not None: + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_timeline_physical_size_init" + ) + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") @@ -353,11 +376,21 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id) + get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), + remote_storage_kind, ) -def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) +def test_timeline_physical_size_post_compaction( + neon_env_builder: 
NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] +): + + if remote_storage_kind is not None: + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_timeline_physical_size_init" + ) + # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed neon_env_builder.pageserver_config_override = ( @@ -386,15 +419,33 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + + # shutdown safekeepers to prevent new data from coming in + for sk in env.safekeepers: + sk.stop() + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) + if remote_storage_kind is not None: + wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id) + get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), + remote_storage_kind, ) -def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) +def test_timeline_physical_size_post_gc( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] +): + + if remote_storage_kind is not None: + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_timeline_physical_size_init" + ) + # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}" @@ -430,8 +481,12 @@ 
def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) + if remote_storage_kind is not None: + wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id) + get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), + remote_storage_kind, ) @@ -515,18 +570,29 @@ def test_timeline_size_metrics( assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024) -def test_tenant_physical_size(neon_simple_env: NeonEnv): +@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) +def test_tenant_physical_size( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] +): random.seed(100) - env = neon_simple_env + if remote_storage_kind is not None: + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_timeline_physical_size_init" + ) + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() client = env.pageserver.http_client() tenant, timeline = env.neon_cli.create_tenant() + if remote_storage_kind is not None: + wait_for_upload_queue_empty(env.pageserver, tenant, timeline) def get_timeline_resident_physical_size(timeline: TimelineId): - sizes = get_physical_size_values(env, tenant, timeline) - assert_physical_size_invariants(sizes) + sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind) + assert_physical_size_invariants(sizes, remote_storage_kind) return sizes.prometheus_resident_physical timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline) @@ -546,6 +612,9 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, tenant, timeline) 
pageserver_http.timeline_checkpoint(tenant, timeline) + if remote_storage_kind is not None: + wait_for_upload_queue_empty(env.pageserver, tenant, timeline) + timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) pg.stop() @@ -563,21 +632,39 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): class TimelinePhysicalSizeValues: api_current_physical: int - prometheus_resident_physical: int + prometheus_resident_physical: float + prometheus_remote_physical: Optional[float] = None python_timelinedir_layerfiles_physical: int + layer_map_file_size_sum: int def get_physical_size_values( - env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + remote_storage_kind: Optional[RemoteStorageKind], ) -> TimelinePhysicalSizeValues: res = TimelinePhysicalSizeValues() client = env.pageserver.http_client() - res.prometheus_resident_physical = int( - client.get_timeline_metric(tenant_id, timeline_id, "pageserver_resident_physical_size") + res.layer_map_file_size_sum = sum( + layer.layer_file_size or 0 + for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers ) + metrics = client.get_metrics() + metrics_filter = {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)} + res.prometheus_resident_physical = metrics.query_one( + "pageserver_resident_physical_size", metrics_filter + ).value + if remote_storage_kind is not None: + res.prometheus_remote_physical = metrics.query_one( + "pageserver_remote_physical_size", metrics_filter + ).value + else: + res.prometheus_remote_physical = None + detail = client.timeline_detail( tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True ) @@ -589,11 +676,20 @@ def get_physical_size_values( return res -def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): +def assert_physical_size_invariants( + sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind] +): # 
resident phyiscal size is defined as assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical + assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum + # we don't do layer eviction, so, all layers are resident assert sizes.api_current_physical == sizes.prometheus_resident_physical + if remote_storage_kind is not None: + assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical + # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS + else: + assert sizes.prometheus_remote_physical is None # Timeline logical size initialization is an asynchronous background task that runs once, From 9cada8b59d26b97ad361b51c2e2dbf3fc89bd892 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 3 Mar 2023 17:53:43 +0100 Subject: [PATCH 103/426] fix benchmarks, broken by PR #3737 Benchmarks only run on `main` branch, so, the pre-commit tests didn't catch these. 
--- test_runner/fixtures/benchmark_fixture.py | 17 +++++++++++------ test_runner/fixtures/compare_fixtures.py | 6 +++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 3428e6db8a..a39aaf8241 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -354,20 +354,25 @@ class NeonBenchmarker: """ Fetch the "cumulative # of bytes written" metric from the pageserver """ - metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}' - return self.get_int_counter_value(pageserver, metric_name) + return self.get_int_counter_value( + pageserver, "libmetrics_disk_io_bytes_total", {"io_operation": "write"} + ) def get_peak_mem(self, pageserver: NeonPageserver) -> int: """ Fetch the "maxrss" metric from the pageserver """ - metric_name = r"libmetrics_maxrss_kb" - return self.get_int_counter_value(pageserver, metric_name) + return self.get_int_counter_value(pageserver, "libmetrics_maxrss_kb") - def get_int_counter_value(self, pageserver: NeonPageserver, metric_name: str) -> int: + def get_int_counter_value( + self, + pageserver: NeonPageserver, + metric_name: str, + label_filters: Optional[Dict[str, str]] = None, + ) -> int: """Fetch the value of given int counter from pageserver metrics.""" all_metrics = pageserver.http_client().get_metrics() - sample = all_metrics.query_one(metric_name) + sample = all_metrics.query_one(metric_name, label_filters) return int(round(sample.value)) def get_timeline_size( diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 17c0b19447..0ba926c8d2 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -144,12 +144,12 @@ class NeonCompare(PgCompare): "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) - params = 
f'{{tenant_id="{self.tenant}",timeline_id="{self.timeline}"}}' + metric_filters = {"tenant_id": str(self.tenant), "timeline_id": str(self.timeline)} total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_created_persistent_files_total" + params + self.env.pageserver, "pageserver_created_persistent_files_total", metric_filters ) total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_written_persistent_bytes_total" + params + self.env.pageserver, "pageserver_written_persistent_bytes_total", metric_filters ) self.zenbenchmark.record( "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER From 96f65fad682f9c0a51e67d99ce4d7ed07cb66a20 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 6 Mar 2023 10:10:58 +0200 Subject: [PATCH 104/426] Handle crash of walredo process and retry applying wal records (#3739) ## Describe your changes Restart walredo process and retry applying walredo records in case of abnormal walredo process termination ## Issue ticket number and link See #1700 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- pageserver/src/walredo.rs | 117 ++++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 21c6ede27e..72865ad74d 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -256,52 +256,53 @@ impl PostgresRedoManager { pg_version: u32, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; - + const MAX_RETRY_ATTEMPTS: u32 = 1; let start_time = Instant::now(); + let mut n_attempts = 0u32; + loop { + let mut proc = self.stdin.lock().unwrap(); + let lock_time = Instant::now(); - let mut proc = self.stdin.lock().unwrap(); - let lock_time = Instant::now(); + // launch the WAL redo process on first use + if proc.is_none() { + self.launch(&mut proc, pg_version)?; + } + WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); - // launch the WAL redo process on first use - if proc.is_none() { - self.launch(&mut proc, pg_version)?; - } - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); + // Relational WAL records are applied using wal-redo-postgres + let buf_tag = BufferTag { rel, blknum }; + let result = self + .apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout) + .map_err(WalRedoError::IoError); - // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; - let result = self - .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout) - .map_err(WalRedoError::IoError); + let end_time = Instant::now(); + let duration = end_time.duration_since(lock_time); - let end_time = Instant::now(); - let duration = end_time.duration_since(lock_time); + let len = records.len(); + let nbytes = records.iter().fold(0, |acumulator, record| { + acumulator + + match &record.1 { + NeonWalRecord::Postgres { rec, .. 
} => rec.len(), + _ => unreachable!("Only PostgreSQL records are accepted in this batch"), + } + }); - let len = records.len(); - let nbytes = records.iter().fold(0, |acumulator, record| { - acumulator - + match &record.1 { - NeonWalRecord::Postgres { rec, .. } => rec.len(), - _ => unreachable!("Only PostgreSQL records are accepted in this batch"), - } - }); + WAL_REDO_TIME.observe(duration.as_secs_f64()); + WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); + WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); - WAL_REDO_TIME.observe(duration.as_secs_f64()); - WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); - WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); + debug!( + "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", + len, + nbytes, + duration.as_micros(), + lsn + ); - debug!( - "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", - len, - nbytes, - duration.as_micros(), - lsn - ); - - // If something went wrong, don't try to reuse the process. Kill it, and - // next request will launch a new one. - if result.is_err() { - error!( + // If something went wrong, don't try to reuse the process. Kill it, and + // next request will launch a new one. + if result.is_err() { + error!( "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), @@ -310,24 +311,28 @@ impl PostgresRedoManager { base_img_lsn, lsn ); - // self.stdin only holds stdin & stderr as_raw_fd(). - // Dropping it as part of take() doesn't close them. - // The owning objects (ChildStdout and ChildStderr) are stored in - // self.stdout and self.stderr, respsectively. - // We intentionally keep them open here to avoid a race between - // currently running `apply_wal_records()` and a `launch()` call - // after we return here. 
- // The currently running `apply_wal_records()` must not read from - // the newly launched process. - // By keeping self.stdout and self.stderr open here, `launch()` will - // get other file descriptors for the new child's stdout and stderr, - // and hence the current `apply_wal_records()` calls will observe - // `output.stdout.as_raw_fd() != stdout_fd` . - if let Some(proc) = self.stdin.lock().unwrap().take() { - proc.child.kill_and_wait(); + // self.stdin only holds stdin & stderr as_raw_fd(). + // Dropping it as part of take() doesn't close them. + // The owning objects (ChildStdout and ChildStderr) are stored in + // self.stdout and self.stderr, respsectively. + // We intentionally keep them open here to avoid a race between + // currently running `apply_wal_records()` and a `launch()` call + // after we return here. + // The currently running `apply_wal_records()` must not read from + // the newly launched process. + // By keeping self.stdout and self.stderr open here, `launch()` will + // get other file descriptors for the new child's stdout and stderr, + // and hence the current `apply_wal_records()` calls will observe + // `output.stdout.as_raw_fd() != stdout_fd` . 
+ if let Some(proc) = self.stdin.lock().unwrap().take() { + proc.child.kill_and_wait(); + } + } + n_attempts += 1; + if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { + return result; } } - result } /// @@ -771,7 +776,7 @@ impl PostgresRedoManager { &self, mut input: MutexGuard>, tag: BufferTag, - base_img: Option, + base_img: &Option, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { @@ -787,7 +792,7 @@ impl PostgresRedoManager { let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); build_begin_redo_for_block_msg(tag, &mut writebuf); if let Some(img) = base_img { - build_push_page_msg(tag, &img, &mut writebuf); + build_push_page_msg(tag, img, &mut writebuf); } for (lsn, rec) in records.iter() { if let NeonWalRecord::Postgres { From 7b9057ad0115c721fcdcd2585062576e993fb5a6 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 6 Mar 2023 18:52:59 +0200 Subject: [PATCH 105/426] Add timeout to download copy (#3675) ## Describe your changes Adding a timeout handling for the remote download of layers of 120 seconds for each operation Note that these downloads are being retried for N times ## Issue ticket number and link Fixes: #3672 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--------- Co-authored-by: Joonas Koivunen --- .../tenant/remote_timeline_client/download.rs | 26 +++++++++++++------ pageserver/src/tenant/timeline.rs | 2 +- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 2e79698087..ea8d9858c3 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,11 +6,13 @@ use std::collections::HashSet; use std::future::Future; use std::path::Path; +use std::time::Duration; use anyhow::{anyhow, Context}; use tokio::fs; use tokio::io::AsyncWriteExt; -use tracing::{error, info, warn}; + +use tracing::{info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; @@ -26,6 +28,8 @@ async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Er fs::File::open(path).await?.sync_all().await } +static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); + /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) @@ -64,22 +68,28 @@ pub async fn download_layer_file<'a>( // TODO: this doesn't use the cached fd for some reason? 
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { format!( - "Failed to create a destination file for layer '{}'", + "create a destination file for layer '{}'", temp_file_path.display() ) }) .map_err(DownloadError::Other)?; let mut download = storage.download(&remote_path).await.with_context(|| { format!( - "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" + "open a download stream for layer with remote storage path '{remote_path:?}'" ) }) .map_err(DownloadError::Other)?; - let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") - }) - .map_err(DownloadError::Other)?; + + let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file)) + .await + .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))? + .with_context(|| { + format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok((destination_file, bytes_amount)) + }, &format!("download {remote_path:?}"), ).await?; @@ -300,7 +310,7 @@ where } Err(DownloadError::Other(ref err)) => { // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. - error!("{description} still failed after {attempts} retries, giving up: {err:?}"); + warn!("{description} still failed after {attempts} retries, giving up: {err:?}"); return result; } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8b24fd6ecd..c304791ee2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3819,7 +3819,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. 
- info!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); + error!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download From ca85646df46737339296fae5b286c38a8f097fa9 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 6 Mar 2023 14:20:58 +0400 Subject: [PATCH 106/426] Max peer_horizon_lsn before adopting it. Before this patch, persistent peer_horizon_lsn was always sent to walproposer, making it initially calculate it equal to max of persistent values and in turn pulling back the in memory value. Send instead in memory value and take max when safekeeper sets it. closes https://github.com/neondatabase/neon/issues/3752 --- safekeeper/src/safekeeper.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index c37411d667..7df347427e 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -191,7 +191,8 @@ pub struct SafeKeeperState { /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn /// of last record streamed to everyone). Persisting it helps skipping /// recovery in walproposer, generally we compute it from peers. In - /// walproposer proto called 'truncate_lsn'. + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. pub peer_horizon_lsn: Lsn, /// LSN of the oldest known checkpoint made by pageserver and successfully /// pushed to s3. We don't remove WAL beyond it. 
Persisted only for @@ -682,7 +683,7 @@ where term: self.state.acceptor_state.term, vote_given: false as u64, flush_lsn: self.flush_lsn(), - truncate_lsn: self.state.peer_horizon_lsn, + truncate_lsn: self.inmem.peer_horizon_lsn, term_history: self.get_term_history(), timeline_start_lsn: self.state.timeline_start_lsn, }; @@ -878,7 +879,13 @@ where if msg.h.commit_lsn != Lsn(0) { self.update_commit_lsn(msg.h.commit_lsn)?; } - self.inmem.peer_horizon_lsn = msg.h.truncate_lsn; + // Value calculated by walproposer can always lag: + // - safekeepers can forget inmem value and send to proposer lower + // persisted one on restart; + // - if we make safekeepers always send persistent value, + // any compute restart would pull it down. + // Thus, take max before adopting. + self.inmem.peer_horizon_lsn = max(self.inmem.peer_horizon_lsn, msg.h.truncate_lsn); // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only From 0acf9ace9a72a298d307ebc95033df08687cff7d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 6 Mar 2023 17:07:10 +0400 Subject: [PATCH 107/426] Return 404 if timeline is not found in safekeeper HTTP API. 
--- safekeeper/src/handler.rs | 2 +- safekeeper/src/http/openapi_spec.yaml | 6 ++++++ safekeeper/src/http/routes.rs | 18 ++---------------- safekeeper/src/send_wal.rs | 2 +- safekeeper/src/timeline.rs | 15 ++++++++++++++- safekeeper/src/timelines_global_map.rs | 12 ++++++------ 6 files changed, 30 insertions(+), 25 deletions(-) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 60df5dd372..99f0e90711 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -218,7 +218,7 @@ impl SafekeeperPostgresHandler { /// Handle IDENTIFY_SYSTEM replication command /// fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> { - let tli = GlobalTimelines::get(self.ttid)?; + let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; let lsn = if self.is_walproposer_recovery() { // walproposer should get all local WAL until flush_lsn diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml index da225f244b..51ce7589a0 100644 --- a/safekeeper/src/http/openapi_spec.yaml +++ b/safekeeper/src/http/openapi_spec.yaml @@ -119,6 +119,12 @@ paths: $ref: "#/components/responses/ForbiddenError" default: $ref: "#/components/responses/GenericError" + "404": + description: Timeline not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" delete: tags: diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index ced9599b36..b157fcb076 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,6 +1,5 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; -use anyhow::Context; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::SkTimelineInfo; @@ -112,12 +111,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result Result<(), QueryError> { let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered(); - let tli = 
GlobalTimelines::get(spg.ttid)?; + let tli = GlobalTimelines::get(spg.ttid).map_err(|e| QueryError::Other(e.into()))?; // spawn the background thread which receives HotStandbyFeedback messages. let bg_timeline = Arc::clone(&tli); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 7479741774..98c565cde4 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,7 +1,7 @@ //! This module implements Timeline lifecycle management and has all neccessary code //! to glue together SafeKeeper and all other background services. -use anyhow::{bail, Result}; +use anyhow::{anyhow, bail, Result}; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; use pq_proto::ReplicationFeedback; @@ -13,6 +13,7 @@ use tokio::{ time::Instant, }; use tracing::*; +use utils::http::error::ApiError; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, @@ -356,6 +357,18 @@ pub enum TimelineError { UninitialinzedPgVersion(TenantTimelineId), } +// Convert to HTTP API error. +impl From for ApiError { + fn from(te: TimelineError) -> ApiError { + match te { + TimelineError::NotFound(ttid) => { + ApiError::NotFound(anyhow!("timeline {} not found", ttid)) + } + _ => ApiError::InternalServerError(anyhow!("{}", te)), + } + } +} + /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. /// It also holds SharedState and provides mutually exclusive access to it. 
pub struct Timeline { diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index baef17ffa8..c99ca0a51a 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -5,7 +5,7 @@ use crate::safekeeper::ServerInfo; use crate::timeline::{Timeline, TimelineError}; use crate::SafeKeeperConf; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{bail, Context, Result}; use once_cell::sync::Lazy; use serde::Serialize; use std::collections::HashMap; @@ -50,11 +50,11 @@ impl GlobalTimelinesState { } /// Get timeline from the map. Returns error if timeline doesn't exist. - fn get(&self, ttid: &TenantTimelineId) -> Result> { + fn get(&self, ttid: &TenantTimelineId) -> Result, TimelineError> { self.timelines .get(ttid) .cloned() - .ok_or_else(|| anyhow!(TimelineError::NotFound(*ttid))) + .ok_or(TimelineError::NotFound(*ttid)) } } @@ -240,17 +240,17 @@ impl GlobalTimelines { /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, /// i.e. loaded in memory and not cancelled. - pub fn get(ttid: TenantTimelineId) -> Result> { + pub fn get(ttid: TenantTimelineId) -> Result, TimelineError> { let res = TIMELINES_STATE.lock().unwrap().get(&ttid); match res { Ok(tli) => { if tli.is_cancelled() { - anyhow::bail!(TimelineError::Cancelled(ttid)); + return Err(TimelineError::Cancelled(ttid)); } Ok(tli) } - Err(e) => Err(e), + _ => res, } } From b05e94e4fff8621bf1b02bed7a42ac39d0c182b8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 7 Mar 2023 14:44:04 +0200 Subject: [PATCH 108/426] fix: allow ERROR log to appear per allowed failure (#3696) The test already allows the background thread trying to checkpoint to fail, however the resulting log message is currently not allowed thus causing flakyness. 
--- test_runner/regress/test_remote_storage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 82bf741a8f..90d69c7b0e 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -618,6 +618,9 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( # checkpoint operations. Hence, checkpoint is allowed to fail now. log.info("sending delete request") checkpoint_allowed_to_fail.set() + env.pageserver.allowed_errors.append( + ".+ERROR Error processing HTTP request: InternalServerError\\(timeline is Stopping" + ) client.timeline_delete(tenant_id, timeline_id) assert not timeline_path.exists() From 069b5b0a0675b80ef5f23e88b6b86520af0423a2 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 24 Feb 2023 00:22:01 +0200 Subject: [PATCH 109/426] Make `postgres --wal-redo` more embeddable. * Stop allocating and maintaining 128MB hash table for last written LSN cache as it is not needed in wal-redo. * Do not require access to the initialized data directory. That saves few dozens megabytes of empty but initialized data directory. Currently such directories do occupy about 10% of the disk space on the pageservers as most of tenants are empty. 
* Move shmem-initialization code to the extension instead of postgres --- pageserver/src/walredo.rs | 52 +---- pgxn/neon_walredo/walredoproc.c | 246 +++++++++++++++++++--- test_runner/regress/test_compatibility.py | 7 +- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 5 files changed, 229 insertions(+), 80 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 72865ad74d..98730a7637 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -23,13 +23,11 @@ use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; use std::collections::VecDeque; -use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::prelude::CommandExt; -use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; use std::sync::{Mutex, MutexGuard}; @@ -639,26 +637,26 @@ impl PostgresRedoManager { input: &mut MutexGuard>, pg_version: u32, ) -> Result<(), Error> { - // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we - // just create one with constant name. That fails if you try to launch more than - // one WAL redo manager concurrently. - let datadir = path_with_suffix_extension( + // Previous versions of wal-redo required data directory and that directories + // occupied some space on disk. Remove it if we face it. + // + // This code could be dropped after one release cycle. + let legacy_datadir = path_with_suffix_extension( self.conf .tenant_path(&self.tenant_id) .join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); - - // Create empty data directory for wal-redo postgres, deleting old one first. 
- if datadir.exists() { - info!("old temporary datadir {datadir:?} exists, removing"); - fs::remove_dir_all(&datadir).map_err(|e| { + if legacy_datadir.exists() { + info!("legacy wal-redo datadir {legacy_datadir:?} exists, removing"); + fs::remove_dir_all(&legacy_datadir).map_err(|e| { Error::new( e.kind(), - format!("Old temporary dir {datadir:?} removal failure: {e}"), + format!("legacy wal-redo datadir {legacy_datadir:?} removal failure: {e}"), ) })?; } + let pg_bin_dir_path = self .conf .pg_bin_dir(pg_version) @@ -668,35 +666,6 @@ impl PostgresRedoManager { .pg_lib_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; - info!("running initdb in {}", datadir.display()); - let initdb = Command::new(pg_bin_dir_path.join("initdb")) - .args(["-D", &datadir.to_string_lossy()]) - .arg("-N") - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS - .close_fds() - .output() - .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; - - if !initdb.status.success() { - return Err(Error::new( - ErrorKind::Other, - format!( - "initdb failed\nstdout: {}\nstderr:\n{}", - String::from_utf8_lossy(&initdb.stdout), - String::from_utf8_lossy(&initdb.stderr) - ), - )); - } else { - // Limit shared cache for wal-redo-postgres - let mut config = OpenOptions::new() - .append(true) - .open(PathBuf::from(&datadir).join("postgresql.conf"))?; - config.write_all(b"shared_buffers=128kB\n")?; - config.write_all(b"fsync=off\n")?; - } - // Start postgres itself let child = Command::new(pg_bin_dir_path.join("postgres")) .arg("--wal-redo") @@ -706,7 +675,6 @@ impl PostgresRedoManager { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - .env("PGDATA", &datadir) // The redo process is not trusted, and runs in seccomp mode that // doesn't allow it to open any files. 
We have to also make sure it // doesn't inherit any file descriptors from the pageserver, that diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index ffbfca5a40..9cce9b2a67 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -65,6 +65,14 @@ #include "rusagestub.h" #endif +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/heapam.h" +#include "access/multixact.h" +#include "access/nbtree.h" +#include "access/subtrans.h" +#include "access/syncscan.h" +#include "access/twophase.h" #include "access/xlog.h" #include "access/xlog_internal.h" #if PG_VERSION_NUM >= 150000 @@ -72,18 +80,36 @@ #endif #include "access/xlogutils.h" #include "catalog/pg_class.h" -#include "libpq/libpq.h" +#include "commands/async.h" #include "libpq/pqformat.h" #include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/origin.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/dsm.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/predicate.h" #include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "storage/sinvaladt.h" #include "storage/smgr.h" +#include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/memutils.h" #include "utils/ps_status.h" +#include "utils/snapmgr.h" #include "inmem_smgr.h" @@ -101,6 +127,7 @@ static void apply_error_callback(void *arg); static bool redo_block_filter(XLogReaderState *record, uint8 block_id); static void GetPage(StringInfo input_message); static ssize_t buffered_read(void *buf, size_t count); +static void 
CreateFakeSharedMemoryAndSemaphores(); static BufferTag target_redo_tag; @@ -141,7 +168,7 @@ enter_seccomp_mode(void) PG_SCMP_ALLOW(shmctl), PG_SCMP_ALLOW(shmdt), PG_SCMP_ALLOW(unlink), // shm_unlink - */ + */ }; #ifdef MALLOC_NO_MMAP @@ -177,6 +204,7 @@ WalRedoMain(int argc, char *argv[]) * buffers. So let's keep it small (default value is 1024) */ num_temp_buffers = 4; + NBuffers = 4; /* * install the simple in-memory smgr @@ -184,49 +212,33 @@ WalRedoMain(int argc, char *argv[]) smgr_hook = smgr_inmem; smgr_init_hook = smgr_init_inmem; - /* - * Validate we have been given a reasonable-looking DataDir and change into it. - */ - checkDataDir(); - ChangeToDataDir(); - - /* - * Create lockfile for data directory. - */ - CreateDataDirLockFile(false); - - /* read control file (error checking and contains config ) */ - LocalProcessControlFile(false); - - /* - * process any libraries that should be preloaded at postmaster start - */ - process_shared_preload_libraries(); /* Initialize MaxBackends (if under postmaster, was done already) */ + MaxConnections = 1; + max_worker_processes = 0; + max_parallel_workers = 0; + max_wal_senders = 0; InitializeMaxBackends(); -#if PG_VERSION_NUM >= 150000 - /* - * Give preloaded libraries a chance to request additional shared memory. - */ - process_shmem_requests(); + /* Disable lastWrittenLsnCache */ + lastWrittenLsnCacheSize = 0; - /* - * Now that loadable modules have had their chance to request additional - * shared memory, determine the value of any runtime-computed GUCs that - * depend on the amount of shared memory required. - */ +#if PG_VERSION_NUM >= 150000 + process_shmem_requests(); InitializeShmemGUCs(); /* - * Now that modules have been loaded, we can process any custom resource - * managers specified in the wal_consistency_checking GUC. + * This will try to access data directory which we do not set. + * Seems to be pretty safe to disable. 
*/ - InitializeWalConsistencyChecking(); + /* InitializeWalConsistencyChecking(); */ #endif - CreateSharedMemoryAndSemaphores(); + /* + * We have our own version of CreateSharedMemoryAndSemaphores() that + * sets up local memory instead of shared one. + */ + CreateFakeSharedMemoryAndSemaphores(); /* * Remember stand-alone backend startup time,roughly at the same point @@ -354,6 +366,172 @@ WalRedoMain(int argc, char *argv[]) } +/* + * Initialize dummy shmem. + * + * This code follows CreateSharedMemoryAndSemaphores() but manually sets up + * the shmem header and skips few initialization steps that are not needed for + * WAL redo. + * + * I've also tried removing most of initialization functions that request some + * memory (like ApplyLauncherShmemInit and friends) but in reality it haven't had + * any sizeable effect on RSS, so probably such clean up not worth the risk of having + * half-initialized postgres. + */ +static void +CreateFakeSharedMemoryAndSemaphores() +{ + PGShmemHeader *shim = NULL; + PGShmemHeader *hdr; + Size size; + int numSemas; + char cwd[MAXPGPATH]; + +#if PG_VERSION_NUM >= 150000 + size = CalculateShmemSize(&numSemas); +#else + /* + * Postgres v14 doesn't have a separate CalculateShmemSize(). 
Use result of the + * corresponging calculation in CreateSharedMemoryAndSemaphores() + */ + size = 1409024; + numSemas = 10; +#endif + + /* Dummy implementation of PGSharedMemoryCreate() */ + { + hdr = (PGShmemHeader *) malloc(size); + if (!hdr) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("[neon-wal-redo] can not allocate (pseudo-) shared memory"))); + + hdr->creatorPID = getpid(); + hdr->magic = PGShmemMagic; + hdr->dsm_control = 0; + hdr->device = 42; /* not relevant for non-shared memory */ + hdr->inode = 43; /* not relevant for non-shared memory */ + hdr->totalsize = size; + hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + + shim = hdr; + UsedShmemSegAddr = hdr; + UsedShmemSegID = (unsigned long) 42; /* not relevant for non-shared memory */ + } + + InitShmemAccess(hdr); + + /* + * Reserve semaphores uses dir name as a source of entropy. Set it to cwd(). Rest + * of the code does not need DataDir access so nullify DataDir after + * PGReserveSemaphores() to error out if something will try to access it. + */ + if (!getcwd(cwd, MAXPGPATH)) + ereport(FATAL, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("[neon-wal-redo] can not read current directory name"))); + DataDir = cwd; + PGReserveSemaphores(numSemas); + DataDir = NULL; + + /* + * The rest of function follows CreateSharedMemoryAndSemaphores() closely, + * skipped parts are marked with comments. + */ + InitShmemAllocation(); + + /* + * Now initialize LWLocks, which do shared memory allocation and are + * needed for InitShmemIndex. 
+ */ + CreateLWLocks(); + + /* + * Set up shmem.c index hashtable + */ + InitShmemIndex(); + + dsm_shmem_init(); + + /* + * Set up xlog, clog, and buffers + */ + XLOGShmemInit(); + CLOGShmemInit(); + CommitTsShmemInit(); + SUBTRANSShmemInit(); + MultiXactShmemInit(); + InitBufferPool(); + + /* + * Set up lock manager + */ + InitLocks(); + + /* + * Set up predicate lock manager + */ + InitPredicateLocks(); + + /* + * Set up process table + */ + if (!IsUnderPostmaster) + InitProcGlobal(); + CreateSharedProcArray(); + CreateSharedBackendStatus(); + TwoPhaseShmemInit(); + BackgroundWorkerShmemInit(); + + /* + * Set up shared-inval messaging + */ + CreateSharedInvalidationState(); + + /* + * Set up interprocess signaling mechanisms + */ + PMSignalShmemInit(); + ProcSignalShmemInit(); + CheckpointerShmemInit(); + AutoVacuumShmemInit(); + ReplicationSlotsShmemInit(); + ReplicationOriginShmemInit(); + WalSndShmemInit(); + WalRcvShmemInit(); + PgArchShmemInit(); + ApplyLauncherShmemInit(); + + /* + * Set up other modules that need some shared memory space + */ + SnapMgrInit(); + BTreeShmemInit(); + SyncScanShmemInit(); + /* Skip due to the 'pg_notify' directory check */ + /* AsyncShmemInit(); */ + +#ifdef EXEC_BACKEND + + /* + * Alloc the win32 shared backend array + */ + if (!IsUnderPostmaster) + ShmemBackendArrayAllocation(); +#endif + + /* Initialize dynamic shared memory facilities. 
*/ + if (!IsUnderPostmaster) + dsm_postmaster_startup(shim); + + /* + * Now give loadable modules a chance to set up their shmem allocations + */ + if (shmem_startup_hook) + shmem_startup_hook(); +} + + /* Version compatility wrapper for ReadBufferWithoutRelcache */ static inline Buffer NeonRedoReadBuffer(RelFileNode rnode, diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 332e2f2519..731e78a3e3 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -220,9 +220,12 @@ def prepare_snapshot( for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"): shutil.rmtree(tenant) - # Remove wal-redo temp directory + # Remove wal-redo temp directory if it exists. Newer pageserver versions don't create + # them anymore, but old versions did. for tenant in (repo_dir / "tenants").glob("*"): - shutil.rmtree(tenant / "wal-redo-datadir.___temp") + wal_redo_dir = tenant / "wal-redo-datadir.___temp" + if wal_redo_dir.exists() and wal_redo_dir.is_dir(): + shutil.rmtree(wal_redo_dir) # Update paths and ports in config files pageserver_toml = repo_dir / "pageserver.toml" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 5fb2e0bba0..9fd9794436 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 5fb2e0bba06cc018ee2506f337c91751ab695454 +Subproject commit 9fd9794436d02fbfe68f8fca5beab218907cec41 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 919851e781..257aaefb25 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 919851e7811fcb2ecfc67f35bfd63a35639c73b5 +Subproject commit 257aaefb251c5c85c44652c01bf68c43db62748a From 1b16de0d0f116a80bb0d58ac5e235d9caf3af5a5 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 7 Mar 2023 10:34:32 +0100 Subject: [PATCH 110/426] Compile `prefix` extension --- Dockerfile.compute-node | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff 
--git a/Dockerfile.compute-node b/Dockerfile.compute-node index 1a2cd9fb77..4e70bb7a47 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -255,6 +255,21 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control +######################################################################################### +# +# Layer "prefix-pg-build" +# compile Prefix extension +# +######################################################################################### +FROM build-deps AS prefix-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \ + mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control + ######################################################################################### # # Layer "rust extensions" @@ -336,6 +351,7 @@ COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 02b8e0e5afc41c0d630497a18022dae32c0a1bb6 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Tue, 7 Mar 2023 09:08:46 -0800 Subject: [PATCH 111/426] Add OpenAPI spec for do_gc (#3756) ## Describe your changes Adds a field to the OpenAPI spec for the page server 
which describes the `do_gc` command. ## Issue ticket number and link #3669 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- pageserver/src/http/openapi_spec.yml | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e68ceb2dc6..3d3a9892bf 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -245,6 +245,53 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + put: + description: Garbage collect given timeline + responses: + "200": + description: OK + content: + application/json: + schema: + type: string + "400": + description: Error when no tenant id found in path, no timeline id or invalid timestamp + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/attach: parameters: - name: tenant_id From fb1581d0b96eb0e54ebbda5e3463390f4c8c171f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 8 Mar 2023 11:39:30 
+0200 Subject: [PATCH 112/426] Fix setting "image_creation_threshold" setting in tenant config. (#3762) We have a few tests that try to set image_creation_threshold, but it didn't actually have any effect because we were missing some critical code to load the setting from config file into memory. The two modified tests in `test_remote_storage.py` perform compaction and GC, and assert that GC removes some layers. That only happens if new image layers are created by the compaction. The tests explicitly disabled image layer creation by setting image_creation_threshold to a high value, but it didn't take effect because reading image_creation_threshold from config file was broken, which is why the test worked. Fix the test to set image_creation_threshold low, instead, so that GC has work to do. Change 'test_tenant_conf.py' so that it exercises the added code. This might explain why we're apparently missing test coverage for GC (issue #3415), although I didn't try to address that here, nor did I check if this improves it. 
--- pageserver/src/config.rs | 6 ++++++ test_runner/regress/test_remote_storage.py | 10 +++++----- test_runner/regress/test_tenant_conf.py | 7 ++++--- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 309e5367a4..7442814c43 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -698,6 +698,12 @@ impl PageServerConf { Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?); } + if let Some(image_creation_threshold) = item.get("image_creation_threshold") { + t_conf.image_creation_threshold = Some( + parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?, + ); + } + if let Some(gc_horizon) = item.get("gc_horizon") { t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?); } diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 90d69c7b0e..24db80c7cc 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -233,8 +233,8 @@ def test_remote_storage_upload_queue_retries( # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # don't create image layers, that causes just noise - "image_creation_threshold": "10000", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", } ) @@ -301,7 +301,7 @@ def test_remote_storage_upload_queue_retries( # Create more churn to generate all upload ops. # The checkpoint / compact / gc ops will block because they call remote_client.wait_completion(). - # So, run this in a differen thread. + # So, run this in a different thread. churn_thread_result = [False] def churn_while_failpoints_active(result): @@ -395,8 +395,8 @@ def test_remote_timeline_client_calls_started_metric( # disable background compaction and GC. We invoke it manually when we want it to happen. 
"gc_period": "0s", "compaction_period": "0s", - # don't create image layers, that causes just noise - "image_creation_threshold": "10000", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", } ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index e087891bba..c5f9a3d157 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -129,6 +129,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "checkpoint_distance": "15000", "gc_period": "80sec", "compaction_period": "80sec", + "image_creation_threshold": "2", } env.neon_cli.config_tenant( tenant_id=tenant, @@ -149,7 +150,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "compaction_threshold": 10, "gc_horizon": 67108864, "gc_period": 80, - "image_creation_threshold": 3, + "image_creation_threshold": 2, "pitr_interval": 604800, }.items() ), f"Unexpected res: {res}" @@ -174,7 +175,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert updated_effective_config["compaction_target_size"] == 1048576 assert updated_effective_config["compaction_threshold"] == 10 assert updated_effective_config["gc_horizon"] == 67108864 - assert updated_effective_config["image_creation_threshold"] == 3 + assert updated_effective_config["image_creation_threshold"] == 2 assert updated_effective_config["pitr_interval"] == "7days" # restart the pageserver and ensure that the config is still correct @@ -195,7 +196,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "compaction_threshold": 10, "gc_horizon": 67108864, "gc_period": 80, - "image_creation_threshold": 3, + "image_creation_threshold": 2, "pitr_interval": 604800, }.items() ), f"Unexpected res: {res}" From 177f98679517769037cdfd8f5e4ffb5a44c4362b Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 8 Mar 
2023 10:37:57 +0100 Subject: [PATCH 113/426] Compile `hll` extension --- Dockerfile.compute-node | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 4e70bb7a47..64dd122386 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -270,6 +270,21 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O pr make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control +######################################################################################### +# +# Layer "hll-pg-build" +# compile hll extension +# +######################################################################################### +FROM build-deps AS hll-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \ + mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control + ######################################################################################### # # Layer "rust extensions" @@ -352,6 +367,7 @@ COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 37bc6d9be4aa0495277e7beef8264f22f59586a2 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 8 Mar 2023 15:58:05 +0100 Subject: [PATCH 114/426] Compile `plpgsql_check` extension --- Dockerfile.compute-node | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 64dd122386..06c820009b 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -285,6 +285,21 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control +######################################################################################### +# +# Layer "plpgsql-check-pg-build" +# compile plpgsql_check extension +# +######################################################################################### +FROM build-deps AS plpgsql-check-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \ + mkdir plpgsql_check-src && cd 
plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control + ######################################################################################### # # Layer "rust extensions" @@ -368,6 +383,7 @@ COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -427,7 +443,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # Install: # libreadline8 for psql -# libicu67, locales for collations (including ICU) +# libicu67, locales for collations (including ICU and plpgsql_check) # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS # libxml2, libxslt1.1 for xml2 From ccf92df4da9be754b4b4ee8caddabc828036757a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 8 Mar 2023 23:08:03 +0200 Subject: [PATCH 115/426] Remove deprecated support to handle ZENITH_AUTH_TOKEN. It's not used anywhere anymore. 
--- pageserver/src/bin/pageserver.rs | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 01a2c85d74..9caab7955b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -281,33 +281,17 @@ fn start_pageserver( }; info!("Using auth: {:#?}", conf.auth_type); - // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration. - match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) { - (old, Ok(v)) => { + match var("NEON_AUTH_TOKEN") { + Ok(v) => { info!("Loaded JWT token for authentication with Safekeeper"); - if let Ok(v_old) = old { - warn!( - "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated" - ); - if v_old != v { - warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN"); - } - } pageserver::config::SAFEKEEPER_AUTH_TOKEN .set(Arc::new(v)) .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; } - (Ok(v), _) => { - info!("Loaded JWT token for authentication with Safekeeper"); - warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN"); - pageserver::config::SAFEKEEPER_AUTH_TOKEN - .set(Arc::new(v)) - .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; - } - (_, Err(VarError::NotPresent)) => { + Err(VarError::NotPresent) => { info!("No JWT token for authentication with Safekeeper detected"); } - (_, Err(e)) => { + Err(e) => { return Err(e).with_context(|| { "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" }) From 03a2ce9d136251ab42f0c8dc236db96997868372 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 9 Mar 2023 09:24:01 +0200 Subject: [PATCH 116/426] Add tracing spans with request_id into pageserver management API handlers (#3755) Adds a newtype that creates a span with request_id from 
https://github.com/neondatabase/neon/pull/3708 for every HTTP request served. Moves request logging and error handlers under the new wrapper, so every request-related event now is logged under the request span. For compatibility reasons, error handler is left on the general router, since not every service uses the new handler wrappers yet. --- libs/utils/src/http/endpoint.rs | 180 ++++++++++++++++++++------------ pageserver/src/http/routes.rs | 92 +++++++++------- 2 files changed, 168 insertions(+), 104 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 41975f6944..616f2b8468 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -3,14 +3,14 @@ use crate::http::error; use anyhow::{anyhow, Context}; use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::http::HeaderValue; +use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; -use hyper::{Method, StatusCode}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService}; use tokio::task::JoinError; -use tracing; +use tracing::{self, debug, info, info_span, warn, Instrument}; use std::future::Future; use std::net::TcpListener; @@ -32,31 +32,77 @@ static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HE #[derive(Debug, Default, Clone)] struct RequestId(String); -async fn logger(res: Response, info: RequestInfo) -> Result, ApiError> { - let request_id = info.context::().unwrap_or_default().0; +/// Adds a tracing info_span! instrumentation around the handler events, +/// logs the request start and end events for non-GET requests and non-200 responses. 
+/// +/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped +/// in this type will get request info logged in the wrapping span, including the unique request ID. +/// +/// There could be other ways to implement similar functionality: +/// +/// * procmacros placed on top of all handler methods +/// With all the drawbacks of procmacros, brings no difference implementation-wise, +/// and little code reduction compared to the existing approach. +/// +/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic, +/// implemented for [`RouterBuilder`]. +/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later. +/// +/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped +/// later, in a post-response middleware. +/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures` +/// tries to achive with its `.instrument` used in the current approach. +/// +/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. +pub struct RequestSpan(pub H) +where + E: Into> + 'static, + R: Future, E>> + Send + 'static, + H: Fn(Request) -> R + Send + Sync + 'static; - // cannot factor out the Level to avoid the repetition - // because tracing can only work with const Level - // which is not the case here +impl RequestSpan +where + E: Into> + 'static, + R: Future, E>> + Send + 'static, + H: Fn(Request) -> R + Send + Sync + 'static, +{ + /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span. + /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled. 
+ pub async fn handle(self, request: Request) -> Result, E> { + let request_id = request.context::().unwrap_or_default().0; + let method = request.method(); + let path = request.uri().path(); + let request_span = info_span!("request", %method, %path, %request_id); - if info.method() == Method::GET && res.status() == StatusCode::OK { - tracing::debug!( - "{} {} {} {}", - info.method(), - info.uri().path(), - request_id, - res.status() - ); - } else { - tracing::info!( - "{} {} {} {}", - info.method(), - info.uri().path(), - request_id, - res.status() - ); + let log_quietly = method == Method::GET; + async move { + if log_quietly { + debug!("Handling request"); + } else { + info!("Handling request"); + } + + // Note that we reuse `error::handler` here and not returning and error at all, + // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation. + // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call. + // + // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally. 
+ match (self.0)(request).await { + Ok(response) => { + let response_status = response.status(); + if log_quietly && response_status.is_success() { + debug!("Request handled, status: {response_status}"); + } else { + info!("Request handled, status: {response_status}"); + } + Ok(response) + } + Err(e) => Ok(error::handler(e.into()).await), + } + } + .instrument(request_span) + .await } - Ok(res) } async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { @@ -96,12 +142,6 @@ pub fn add_request_id_middleware RouterBuilder { Router::builder() .middleware(add_request_id_middleware()) - .middleware(Middleware::post_with_info(logger)) .middleware(Middleware::post_with_info( add_request_id_header_to_response, )) - .get("/metrics", prometheus_metrics_handler) + .get("/metrics", |r| { + RequestSpan(prometheus_metrics_handler).handle(r) + }) .err_handler(error::handler) } @@ -139,40 +180,43 @@ pub fn attach_openapi_ui( spec_mount_path: &'static str, ui_mount_path: &'static str, ) -> RouterBuilder { - router_builder.get(spec_mount_path, move |_| async move { - Ok(Response::builder().body(Body::from(spec)).unwrap()) - }).get(ui_mount_path, move |_| async move { - Ok(Response::builder().body(Body::from(format!(r#" - - - - rweb - - - -
- - - - - "#, spec_mount_path))).unwrap()) - }) + router_builder + .get(spec_mount_path, move |r| { + RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) }) + .handle(r) + }) + .get(ui_mount_path, move |r| RequestSpan( move |_| async move { + Ok(Response::builder().body(Body::from(format!(r#" + + + + rweb + + + +
+ + + + + "#, spec_mount_path))).unwrap()) + }).handle(r)) } fn parse_token(header_value: &str) -> Result<&str, ApiError> { @@ -234,7 +278,7 @@ where async move { let headers = response.headers_mut(); if headers.contains_key(&name) { - tracing::warn!( + warn!( "{} response already contains header {:?}", request_info.uri(), &name, @@ -274,7 +318,7 @@ pub fn serve_thread_main( where S: Future + Send + Sync, { - tracing::info!("Starting an HTTP endpoint at {}", listener.local_addr()?); + info!("Starting an HTTP endpoint at {}", listener.local_addr()?); // Create a Service from the router above to handle incoming requests. let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap(); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3cfc68c1eb..111bc480c4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use remote_storage::GenericRemoteStorage; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::http::endpoint::RequestSpan; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use super::models::{ @@ -1091,7 +1092,8 @@ pub fn make_router( let handler = $handler; #[cfg(not(feature = "testing"))] let handler = cfg_disabled; - handler + + move |r| RequestSpan(handler).handle(r) }}; } @@ -1099,35 +1101,55 @@ pub fn make_router( .data(Arc::new( State::new(conf, auth, remote_storage).context("Failed to initialize router state")?, )) - .get("/v1/status", status_handler) + .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) .put( "/v1/failpoints", testing_api!("manage failpoints", failpoints_handler), ) - .get("/v1/tenant", tenant_list_handler) - .post("/v1/tenant", tenant_create_handler) - .get("/v1/tenant/:tenant_id", tenant_status) - .get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler) - .put("/v1/tenant/config", 
update_tenant_config_handler) - .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler) - .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) - .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) - .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) - .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) - .post("/v1/tenant/:tenant_id/load", tenant_load_handler) - .post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_detail_handler, - ) + .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r)) + .post("/v1/tenant", |r| { + RequestSpan(tenant_create_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id", |r| { + RequestSpan(tenant_status).handle(r) + }) + .get("/v1/tenant/:tenant_id/synthetic_size", |r| { + RequestSpan(tenant_size_handler).handle(r) + }) + .put("/v1/tenant/config", |r| { + RequestSpan(update_tenant_config_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id/config", |r| { + RequestSpan(get_tenant_config_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id/timeline", |r| { + RequestSpan(timeline_list_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/timeline", |r| { + RequestSpan(timeline_create_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/attach", |r| { + RequestSpan(tenant_attach_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/detach", |r| { + RequestSpan(tenant_detach_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/load", |r| { + RequestSpan(tenant_load_handler).handle(r) + }) + .post("/v1/tenant/:tenant_id/ignore", |r| { + RequestSpan(tenant_ignore_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + RequestSpan(timeline_detail_handler).handle(r) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", - get_lsn_by_timestamp_handler, - ) - .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", - timeline_gc_handler, + |r| 
RequestSpan(get_lsn_by_timestamp_handler).handle(r), ) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { + RequestSpan(timeline_gc_handler).handle(r) + }) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", testing_api!("run timeline compaction", timeline_compact_handler), @@ -1138,28 +1160,26 @@ pub fn make_router( ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - timeline_download_remote_layers_handler_post, + |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r), ) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - timeline_download_remote_layers_handler_get, - ) - .delete( - "/v1/tenant/:tenant_id/timeline/:timeline_id", - timeline_delete_handler, - ) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/layer", - layer_map_info_handler, + |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r), ) + .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + RequestSpan(timeline_delete_handler).handle(r) + }) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { + RequestSpan(layer_map_info_handler).handle(r) + }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - layer_download_handler, + |r| RequestSpan(layer_download_handler).handle(r), ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - evict_timeline_layer_handler, + |r| RequestSpan(evict_timeline_layer_handler).handle(r), ) - .get("/v1/panic", always_panic_handler) + .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) .any(handler_404)) } From 8459e0265e52bdbf28ead418f0db03cda71a6eac Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 8 Mar 2023 13:23:37 +0200 Subject: [PATCH 117/426] Add performance test for compaction and image layer creation --- test_runner/performance/test_compaction.py | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 
test_runner/performance/test_compaction.py diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py new file mode 100644 index 0000000000..89818ee8bd --- /dev/null +++ b/test_runner/performance/test_compaction.py @@ -0,0 +1,58 @@ +from contextlib import closing + +import pytest +from fixtures.compare_fixtures import NeonCompare +from fixtures.neon_fixtures import wait_for_last_flush_lsn + + +# +# Test compaction and image layer creation performance. +# +# This creates a few tables and runs some simple INSERTs and UPDATEs on them to generate +# some delta layers. Then it runs manual compaction, measuring how long it takes. +# +@pytest.mark.timeout(1000) +def test_compaction(neon_compare: NeonCompare): + env = neon_compare.env + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + # Disable background GC and compaction, we'll run compaction manually. + "gc_period": "0s", + "compaction_period": "0s", + # Make checkpoint distance somewhat smaller than default, to create + # more delta layers quicker, to trigger compaction. + "checkpoint_distance": "25000000", # 25 MB + # Force image layer creation when we run compaction. 
+ "image_creation_threshold": "1", + } + ) + neon_compare.tenant = tenant_id + neon_compare.timeline = timeline_id + + # Create some tables, and run a bunch of INSERTs and UPDATes on them, + # to generate WAL and layers + pg = env.postgres.create_start( + "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"] + ) + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + for i in range(100): + cur.execute(f"create table tbl{i} (i int, j int);") + cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);") + for j in range(100): + cur.execute(f"update tbl{i} set j = {j};") + + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + # First compaction generates L1 layers + with neon_compare.zenbenchmark.record_duration("compaction"): + pageserver_http.timeline_compact(tenant_id, timeline_id) + + # And second compaction triggers image layer creation + with neon_compare.zenbenchmark.record_duration("image_creation"): + pageserver_http.timeline_compact(tenant_id, timeline_id) + + neon_compare.report_size() From e43c413a3f6ea14a6059967a40de03a0e2452bc7 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 9 Mar 2023 14:21:10 +0100 Subject: [PATCH 118/426] [compute_tools] Add /insights endpoint to compute_ctl (#3704) This commit adds a basic HTTP API endpoint that allows scraping the `pg_stat_statements` data and getting a list of slow queries. New insights like cache hit rate and so on could be added later. Extension `pg_stat_statements` is checked / created only if compute tries to load the corresponding shared library. The latter is configured by control-plane and currently covered with feature flag. 
Co-authored by Eduard Dyckman (bird.duskpoet@gmail.com) --- compute_tools/src/compute.rs | 41 ++++++++++++++++++++++++ compute_tools/src/http/api.rs | 7 ++++ compute_tools/src/http/openapi_spec.yaml | 37 +++++++++++++++++---- compute_tools/src/pg_helpers.rs | 2 ++ compute_tools/src/spec.rs | 15 +++++++++ 5 files changed, 96 insertions(+), 6 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c8af8822b7..8ceef44d61 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -25,6 +25,7 @@ use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; +use tokio_postgres; use tracing::{info, instrument, warn}; use crate::checker::create_writability_check_data; @@ -284,6 +285,7 @@ impl ComputeNode { handle_role_deletions(self, &mut client)?; handle_grants(self, &mut client)?; create_writability_check_data(&mut client)?; + handle_extensions(&self.spec, &mut client)?; // 'Close' connection drop(client); @@ -400,4 +402,43 @@ impl ComputeNode { Ok(()) } + + /// Select `pg_stat_statements` data and return it as a stringified JSON + pub async fn collect_insights(&self) -> String { + let mut result_rows: Vec = Vec::new(); + let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await; + let (client, connection) = connect_result.unwrap(); + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + let result = client + .simple_query( + "SELECT + row_to_json(pg_stat_statements) +FROM + pg_stat_statements +WHERE + userid != 'cloud_admin'::regrole::oid +ORDER BY + (mean_exec_time + mean_plan_time) DESC +LIMIT 100", + ) + .await; + + if let Ok(raw_rows) = result { + for message in raw_rows.iter() { + if let postgres::SimpleQueryMessage::Row(row) = message { + if let Some(json) = row.get(0) { + result_rows.push(json.to_string()); + } + } + } + + format!("{{\"pg_stat_statements\": 
[{}]}}", result_rows.join(",")) + } else { + "{{\"pg_stat_statements\": []}}".to_string() + } + } } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 589a8e1434..2392863303 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -33,6 +33,13 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /insights GET request"); + let insights = compute.collect_insights().await; + Response::new(Body::from(insights)) + } + (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); let res = crate::checker::check_writability(compute).await; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index a857531d26..3a8e9fc1dc 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -10,12 +10,12 @@ paths: /status: get: tags: - - "info" + - Info summary: Get compute node internal status description: "" operationId: getComputeStatus responses: - "200": + 200: description: ComputeState content: application/json: @@ -25,27 +25,43 @@ paths: /metrics.json: get: tags: - - "info" + - Info summary: Get compute node startup metrics in JSON format description: "" operationId: getComputeMetricsJSON responses: - "200": + 200: description: ComputeMetrics content: application/json: schema: $ref: "#/components/schemas/ComputeMetrics" + /insights: + get: + tags: + - Info + summary: Get current compute insights in JSON format + description: | + Note, that this doesn't include any historical data + operationId: getComputeInsights + responses: + 200: + description: Compute insights + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeInsights" + /check_writability: post: tags: - - "check" + - Check summary: Check that we can write new data on this compute description: "" operationId: checkComputeWritability responses: - "200": + 200: description: Check result content: 
text/plain: @@ -96,6 +112,15 @@ components: type: string description: Text of the error during compute startup, if any + ComputeInsights: + type: object + properties: + pg_stat_statements: + description: Contains raw output from pg_stat_statements in JSON format + type: array + items: + type: object + ComputeStatus: type: string enum: diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 6ab2864721..6a1377b6aa 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -63,6 +63,8 @@ impl GenericOption { /// Represent `GenericOption` as configuration option. pub fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { + // TODO: check in the console DB that we don't have these settings + // set for any non-deleted project and drop this override. let name = match self.name.as_str() { "safekeepers" => "neon.safekeepers", "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index bbd0ec21ed..47f1d69cff 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -515,3 +515,18 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { Ok(()) } + +/// Create required system extensions +#[instrument(skip_all)] +pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> { + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + if libs.contains("pg_stat_statements") { + // Create extension only if this compute really needs it + let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements"; + info!("creating system extensions with query: {}", query); + client.simple_query(query)?; + } + } + + Ok(()) +} From 3f11a647c03891e6059734541f51ed988ed72289 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 1 Mar 2023 20:05:56 +0400 Subject: [PATCH 119/426] Rename write_message to write_message_noflush in postgres_backend_async.rs To make it unifrom 
across the project; proxy stream.rs and older postgres_backend uses write_message_noflush. --- libs/utils/src/postgres_backend_async.rs | 48 ++++++++++++------------ pageserver/src/page_service.rs | 46 +++++++++++------------ 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index b804c54709..442b06ed01 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -233,7 +233,7 @@ impl PostgresBackend { } /// Write message into internal output buffer. - pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { + pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { BeMessage::write(&mut self.buf_out, message)?; Ok(self) } @@ -383,7 +383,7 @@ impl PostgresBackend { FeStartupPacket::SslRequest => { debug!("SSL requested"); - self.write_message(&BeMessage::EncryptionResponse(have_tls))?; + self.write_message_noflush(&BeMessage::EncryptionResponse(have_tls))?; if have_tls { self.start_tls().await?; self.state = ProtoState::Encrypted; @@ -391,11 +391,11 @@ impl PostgresBackend { } FeStartupPacket::GssEncRequest => { debug!("GSS requested"); - self.write_message(&BeMessage::EncryptionResponse(false))?; + self.write_message_noflush(&BeMessage::EncryptionResponse(false))?; } FeStartupPacket::StartupMessage { .. } => { if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message(&BeMessage::ErrorResponse( + self.write_message_noflush(&BeMessage::ErrorResponse( "must connect with TLS", None, ))?; @@ -410,15 +410,17 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => { - self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeMessage::CLIENT_ENCODING)? + self.write_message_noflush(&BeMessage::AuthenticationOk)? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? 
// The async python driver requires a valid server_version - .write_message(&BeMessage::server_version("14.1"))? - .write_message(&BeMessage::ReadyForQuery)?; + .write_message_noflush(&BeMessage::server_version("14.1"))? + .write_message_noflush(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } AuthType::NeonJWT => { - self.write_message(&BeMessage::AuthenticationCleartextPassword)?; + self.write_message_noflush( + &BeMessage::AuthenticationCleartextPassword, + )?; self.state = ProtoState::Authentication; } } @@ -441,7 +443,7 @@ impl PostgresBackend { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message(&BeMessage::ErrorResponse( + self.write_message_noflush(&BeMessage::ErrorResponse( &e.to_string(), Some(e.pg_error_code()), ))?; @@ -449,9 +451,9 @@ impl PostgresBackend { } } } - self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeMessage::CLIENT_ENCODING)? - .write_message(&BeMessage::ReadyForQuery)?; + self.write_message_noflush(&BeMessage::AuthenticationOk)? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? + .write_message_noflush(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } @@ -486,30 +488,30 @@ impl PostgresBackend { if let Err(e) = handler.process_query(self, query_string).await { log_query_error(query_string, &e); let short_error = short_error(&e); - self.write_message(&BeMessage::ErrorResponse( + self.write_message_noflush(&BeMessage::ErrorResponse( &short_error, Some(e.pg_error_code()), ))?; } - self.write_message(&BeMessage::ReadyForQuery)?; + self.write_message_noflush(&BeMessage::ReadyForQuery)?; } FeMessage::Parse(m) => { *unnamed_query_string = m.query_string; - self.write_message(&BeMessage::ParseComplete)?; + self.write_message_noflush(&BeMessage::ParseComplete)?; } FeMessage::Describe(_) => { - self.write_message(&BeMessage::ParameterDescription)? 
- .write_message(&BeMessage::NoData)?; + self.write_message_noflush(&BeMessage::ParameterDescription)? + .write_message_noflush(&BeMessage::NoData)?; } FeMessage::Bind(_) => { - self.write_message(&BeMessage::BindComplete)?; + self.write_message_noflush(&BeMessage::BindComplete)?; } FeMessage::Close(_) => { - self.write_message(&BeMessage::CloseComplete)?; + self.write_message_noflush(&BeMessage::CloseComplete)?; } FeMessage::Execute(_) => { @@ -517,7 +519,7 @@ impl PostgresBackend { trace!("got execute {query_string:?}"); if let Err(e) = handler.process_query(self, query_string).await { log_query_error(query_string, &e); - self.write_message(&BeMessage::ErrorResponse( + self.write_message_noflush(&BeMessage::ErrorResponse( &e.to_string(), Some(e.pg_error_code()), ))?; @@ -529,7 +531,7 @@ impl PostgresBackend { } FeMessage::Sync => { - self.write_message(&BeMessage::ReadyForQuery)?; + self.write_message_noflush(&BeMessage::ReadyForQuery)?; } FeMessage::Terminate => { @@ -579,7 +581,7 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> { // XXX: if the input is large, we should split it into multiple messages. // Not sure what the threshold should be, but the ultimate hard limit is that // the length cannot exceed u32. - this.pgb.write_message(&BeMessage::CopyData(buf))?; + this.pgb.write_message_noflush(&BeMessage::CopyData(buf))?; Poll::Ready(Ok(buf.len())) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 878928ae06..b362e25424 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -64,7 +64,7 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { // We were requested to shut down. 
let msg = format!("pageserver is shutting down"); - let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None)); + let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)); Err(QueryError::Other(anyhow::anyhow!(msg))) } @@ -80,13 +80,13 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { let msg = "client terminated connection with Terminate message during COPY"; let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; + pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; break; } m => { let msg = format!("unexpected message {m:?}"); - pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?; + pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None))?; Err(io::Error::new(io::ErrorKind::Other, msg))?; break; } @@ -97,7 +97,7 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { let msg = "client closed connection during COPY"; let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; + pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; pgb.flush().await?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; } @@ -311,7 +311,7 @@ impl PageServerHandler { let timeline = tenant.get_timeline(timeline_id, true)?; // switch client to COPYBOTH - pgb.write_message(&BeMessage::CopyBothResponse)?; + pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; pgb.flush().await?; let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id); @@ -380,7 +380,7 @@ impl PageServerHandler { }) }); - 
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?; pgb.flush().await?; } Ok(()) @@ -416,7 +416,7 @@ impl PageServerHandler { // Import basebackup provided via CopyData info!("importing basebackup"); - pgb.write_message(&BeMessage::CopyInResponse)?; + pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); @@ -468,7 +468,7 @@ impl PageServerHandler { // Import wal provided via CopyData info!("importing wal"); - pgb.write_message(&BeMessage::CopyInResponse)?; + pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); @@ -678,7 +678,7 @@ impl PageServerHandler { } // switch client to COPYOUT - pgb.write_message(&BeMessage::CopyOutResponse)?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; pgb.flush().await?; // Send a tarball of the latest layer on the timeline @@ -695,7 +695,7 @@ impl PageServerHandler { .await?; } - pgb.write_message(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone)?; pgb.flush().await?; info!("basebackup complete"); @@ -812,7 +812,7 @@ impl postgres_backend_async::Handler for PageServerHandler { // Check that the timeline exists self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx) .await?; - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -835,15 +835,15 @@ impl postgres_backend_async::Handler for PageServerHandler { let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message(&BeMessage::RowDescription(&[ + pgb.write_message_noflush(&BeMessage::RowDescription(&[ 
RowDescriptor::text_col(b"prev_lsn"), RowDescriptor::text_col(b"last_lsn"), ]))? - .write_message(&BeMessage::DataRow(&[ + .write_message_noflush(&BeMessage::DataRow(&[ Some(end_of_timeline.prev.to_string().as_bytes()), Some(end_of_timeline.last.to_string().as_bytes()), ]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } // same as basebackup, but result includes relational data as well else if query_string.starts_with("fullbackup ") { @@ -884,7 +884,7 @@ impl postgres_backend_async::Handler for PageServerHandler { // Check that the timeline exists self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx) .await?; - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { // Import the `base` section (everything but the wal) of a basebackup. // Assumes the tenant already exists on this pageserver. @@ -929,10 +929,10 @@ impl postgres_backend_async::Handler for PageServerHandler { ) .await { - Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message(&BeMessage::ErrorResponse( + pgb.write_message_noflush(&BeMessage::ErrorResponse( &e.to_string(), Some(e.pg_error_code()), ))? 
@@ -965,10 +965,10 @@ impl postgres_backend_async::Handler for PageServerHandler { .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await { - Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, Err(e) => { error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message(&BeMessage::ErrorResponse( + pgb.write_message_noflush(&BeMessage::ErrorResponse( &e.to_string(), Some(e.pg_error_code()), ))? @@ -977,7 +977,7 @@ impl postgres_backend_async::Handler for PageServerHandler { } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); @@ -993,7 +993,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; - pgb.write_message(&BeMessage::RowDescription(&[ + pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"compaction_target_size"), @@ -1004,7 +1004,7 @@ impl postgres_backend_async::Handler for PageServerHandler { RowDescriptor::int8_col(b"image_creation_threshold"), RowDescriptor::int8_col(b"pitr_interval"), ]))? 
- .write_message(&BeMessage::DataRow(&[ + .write_message_noflush(&BeMessage::DataRow(&[ Some(tenant.get_checkpoint_distance().to_string().as_bytes()), Some( tenant @@ -1027,7 +1027,7 @@ impl postgres_backend_async::Handler for PageServerHandler { Some(tenant.get_image_creation_threshold().to_string().as_bytes()), Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { return Err(QueryError::Other(anyhow::anyhow!( "unknown command {query_string}" From 7627d85345bee3795b5b85e40cf290e274399ca2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 7 Mar 2023 10:08:57 +0400 Subject: [PATCH 120/426] Move async postgres_backend to its own crate. To untie cyclic dependency between sync and async versions of postgres_backend, copy QueryError and some logging/error routines to postgres_backend.rs. This is temporary glue to make commits smaller, sync version will be dropped by the upcoming commit completely.
--- Cargo.lock | 26 +++++++- Cargo.toml | 1 + libs/postgres_backend/Cargo.toml | 27 ++++++++ .../src/lib.rs} | 4 +- libs/utils/Cargo.toml | 36 +++++------ libs/utils/src/lib.rs | 1 - libs/utils/src/postgres_backend.rs | 63 ++++++++++++++++++- libs/utils/tests/ssl_test.rs | 2 +- pageserver/Cargo.toml | 1 + pageserver/src/page_service.rs | 4 +- .../walreceiver/walreceiver_connection.rs | 3 +- proxy/src/console/mgmt.rs | 2 +- safekeeper/src/handler.rs | 2 +- safekeeper/src/json_ctrl.rs | 2 +- safekeeper/src/receive_wal.rs | 2 +- safekeeper/src/send_wal.rs | 2 +- safekeeper/src/wal_service.rs | 2 +- 17 files changed, 144 insertions(+), 36 deletions(-) create mode 100644 libs/postgres_backend/Cargo.toml rename libs/{utils/src/postgres_backend_async.rs => postgres_backend/src/lib.rs} (99%) diff --git a/Cargo.lock b/Cargo.lock index fe5aae6ae8..ab2f69929e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2454,6 +2454,7 @@ dependencies = [ "postgres", "postgres-protocol", "postgres-types", + "postgres_backend", "postgres_connection", "postgres_ffi", "pq_proto", @@ -2676,6 +2677,29 @@ dependencies = [ "postgres-protocol", ] +[[package]] +name = "postgres_backend" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "futures", + "once_cell", + "pq_proto", + "rustls", + "rustls-pemfile", + "serde", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-postgres-rustls", + "tokio-rustls", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "postgres_connection" version = "0.1.0" @@ -4507,7 +4531,6 @@ dependencies = [ "bytes", "criterion", "futures", - "git-version", "heapless", "hex", "hex-literal", @@ -4532,7 +4555,6 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "tokio-rustls", "tracing", "tracing-subscriber", "url", diff --git a/Cargo.toml b/Cargo.toml index ea22b04124..bbd4975603 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -133,6 +133,7 @@ heapless = { default-features=false, features=[], git = "https://github.com/japa 
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } +postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml new file mode 100644 index 0000000000..bead77c4d6 --- /dev/null +++ b/libs/postgres_backend/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "postgres_backend" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +async-trait.workspace = true +anyhow.workspace = true +bytes.workspace = true +futures.workspace = true +rustls.workspace = true +serde.workspace = true +thiserror.workspace = true +tokio.workspace = true +tokio-rustls.workspace = true +tracing.workspace = true + +pq_proto.workspace = true +utils.workspace = true +workspace_hack.workspace = true + +[dev-dependencies] +once_cell.workspace = true +rustls-pemfile.workspace = true +tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true \ No newline at end of file diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/postgres_backend/src/lib.rs similarity index 99% rename from libs/utils/src/postgres_backend_async.rs rename to libs/postgres_backend/src/lib.rs index 442b06ed01..6e96e65a52 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/postgres_backend/src/lib.rs @@ -3,7 +3,6 @@ //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. 
-use crate::postgres_backend::AuthType; use anyhow::Context; use bytes::{Buf, Bytes, BytesMut}; use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR}; @@ -14,6 +13,7 @@ use std::sync::Arc; use std::task::Poll; use std::{future::Future, task::ready}; use tracing::{debug, error, info, trace}; +use utils::postgres_backend::AuthType; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader}; use tokio_rustls::TlsAcceptor; @@ -617,7 +617,7 @@ pub fn short_error(e: &QueryError) -> String { } } -pub(super) fn log_query_error(query: &str, e: &QueryError) { +pub fn log_query_error(query: &str, e: &QueryError) { match e { QueryError::Disconnected(ConnectionError::Socket(io_error)) => { if is_expected_io_error(io_error) { diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 6acdb6fa53..206e40fce9 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -12,42 +12,40 @@ anyhow.workspace = true bincode.workspace = true bytes.workspace = true heapless.workspace = true +hex = { workspace = true, features = ["serde"] } hyper = { workspace = true, features = ["full"] } futures = { workspace = true} -routerify.workspace = true -serde.workspace = true -serde_json.workspace = true -thiserror.workspace = true -tokio.workspace = true -tokio-rustls.workspace = true -tracing.workspace = true -tracing-subscriber = { workspace = true, features = ["json"] } -nix.workspace = true -signal-hook.workspace = true -rand.workspace = true jsonwebtoken.workspace = true -hex = { workspace = true, features = ["serde"] } +nix.workspace = true +once_cell.workspace = true +routerify.workspace = true rustls.workspace = true rustls-split.workspace = true -git-version.workspace = true +serde.workspace = true +serde_json.workspace = true +signal-hook.workspace = true +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true +tracing-subscriber = { workspace = true, features = ["json"] } +rand.workspace = true 
serde_with.workspace = true -once_cell.workspace = true strum.workspace = true strum_macros.workspace = true +url.workspace = true +uuid = { version = "1.2", features = ["v4", "serde"] } metrics.workspace = true pq_proto.workspace = true - workspace_hack.workspace = true -url.workspace = true -uuid = { version = "1.2", features = ["v4", "serde"] } + [dev-dependencies] byteorder.workspace = true bytes.workspace = true -hex-literal.workspace = true -tempfile.workspace = true criterion.workspace = true +hex-literal.workspace = true rustls-pemfile.workspace = true +tempfile.workspace = true [[bench]] name = "benchmarks" diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9ddd702c72..7408eb66cd 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -14,7 +14,6 @@ pub mod vec_map; pub mod bin_ser; pub mod postgres_backend; -pub mod postgres_backend_async; // helper functions for creating and fsyncing pub mod crashsafe; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index f3e3835bda..fc49aa6696 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -3,11 +3,10 @@ //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. 
-use crate::postgres_backend_async::{log_query_error, short_error, QueryError}; use crate::sock_split::{BidiStream, ReadStream, WriteStream}; use anyhow::Context; use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; +use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR}; use serde::{Deserialize, Serialize}; use std::fmt; use std::io::{self, Write}; @@ -17,6 +16,41 @@ use std::sync::Arc; use std::time::Duration; use tracing::*; +pub fn is_expected_io_error(e: &io::Error) -> bool { + use io::ErrorKind::*; + matches!( + e.kind(), + ConnectionRefused | ConnectionAborted | ConnectionReset + ) +} + +/// An error, occurred during query processing: +/// either during the connection ([`ConnectionError`]) or before/after it. +#[derive(thiserror::Error, Debug)] +pub enum QueryError { + /// The connection was lost while processing the query. + #[error(transparent)] + Disconnected(#[from] ConnectionError), + /// Some other error + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: io::Error) -> Self { + Self::Disconnected(ConnectionError::Socket(e)) + } +} + +impl QueryError { + pub fn pg_error_code(&self) -> &'static [u8; 5] { + match self { + Self::Disconnected(_) => b"08006", // connection failure + Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error + } + } +} + pub trait Handler { /// Handle single query. 
/// postgres_backend will issue ReadyForQuery after calling this (this @@ -483,3 +517,28 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } } + +pub fn short_error(e: &QueryError) -> String { + match e { + QueryError::Disconnected(connection_error) => connection_error.to_string(), + QueryError::Other(e) => format!("{e:#}"), + } +} + +pub(super) fn log_query_error(query: &str, e: &QueryError) { + match e { + QueryError::Disconnected(ConnectionError::Socket(io_error)) => { + if is_expected_io_error(io_error) { + info!("query handler for '{query}' failed with expected io error: {io_error}"); + } else { + error!("query handler for '{query}' failed with io error: {io_error}"); + } + } + QueryError::Disconnected(other_connection_error) => { + error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") + } + QueryError::Other(e) => { + error!("query handler for '{query}' failed: {e:?}"); + } + } +} diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs index fae707f049..bf09f1b37d 100644 --- a/libs/utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -10,8 +10,8 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use once_cell::sync::Lazy; use utils::{ + postgres_backend::QueryError, postgres_backend::{AuthType, Handler, PostgresBackend}, - postgres_backend_async::QueryError, }; fn make_tcp_pair() -> (TcpStream, TcpStream) { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index d2f0b84863..8d6641a387 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -37,6 +37,7 @@ num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true postgres.workspace = true +postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b362e25424..dc4be9dd65 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs 
@@ -20,6 +20,7 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; +use postgres_backend::{self, is_expected_io_error, PostgresBackend, QueryError}; use pq_proto::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; @@ -36,7 +37,6 @@ use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError}, simple_rcu::RcuReadGuard, }; @@ -721,7 +721,7 @@ impl PageServerHandler { } #[async_trait::async_trait] -impl postgres_backend_async::Handler for PageServerHandler { +impl postgres_backend::Handler for PageServerHandler { fn check_auth_jwt( &mut self, _pgb: &mut PostgresBackend, diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 7e06c398af..f9d1e819a1 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -33,10 +33,11 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; +use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; use pq_proto::ReplicationFeedback; -use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error}; +use utils::lsn::Lsn; /// Status of the connection. 
#[derive(Debug, Clone, Copy)] diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index c00c06fbb7..41f370add4 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -8,8 +8,8 @@ use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use std::{net::TcpStream, thread}; use tracing::{error, info, info_span}; use utils::{ + postgres_backend::QueryError, postgres_backend::{self, AuthType, PostgresBackend}, - postgres_backend_async::QueryError, }; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 99f0e90711..d1cd76459b 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -17,7 +17,7 @@ use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use std::str; use tracing::info; use utils::auth::{Claims, Scope}; -use utils::postgres_backend_async::QueryError; +use utils::postgres_backend::QueryError; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 32a24a4978..3d102a98d9 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -13,7 +13,7 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; use utils::id::TenantTimelineId; -use utils::postgres_backend_async::QueryError; +use utils::postgres_backend::QueryError; use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 671e5470a0..0cf921d97a 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -8,7 +8,7 @@ use anyhow::Context; use bytes::BytesMut; use tracing::*; use utils::lsn::Lsn; -use utils::postgres_backend_async::QueryError; +use utils::postgres_backend::QueryError; use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; diff --git 
a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 411d0708b5..169ab03f0a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -16,7 +16,7 @@ use std::net::Shutdown; use std::sync::Arc; use std::time::Duration; use std::{io, str, thread}; -use utils::postgres_backend_async::QueryError; +use utils::postgres_backend::QueryError; use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; use tokio::sync::watch::Receiver; diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 3ca651d060..40448be949 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -6,7 +6,7 @@ use regex::Regex; use std::net::{TcpListener, TcpStream}; use std::thread; use tracing::*; -use utils::postgres_backend_async::QueryError; +use utils::postgres_backend::QueryError; use crate::handler::SafekeeperPostgresHandler; use crate::SafeKeeperConf; From 0d8ced8534110219cac468c416501db901d31a42 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 Feb 2023 12:03:45 +0400 Subject: [PATCH 121/426] Remove sync postgres_backend, tidy up its split usage. - Add support for splitting async postgres_backend into read and write halves. Safekeeper needs this for bidirectional streams. To this end, encapsulate reading-writing postgres messages to framed.rs with split support without any additional changes (relying on BufRead for reading and BytesMut out buffer for writing). - Use async postgres_backend throughout safekeeper (and in proxy auth link part). - In both safekeeper COPY streams, do read-write from the same thread/task with select! for easier error handling. - Tidy up finishing CopyBoth streams in safekeeper sending and receiving WAL -- join split parts back catching errors from them before returning.
Initially I hoped to do that read-write without split at all, through polling IO: https://github.com/neondatabase/neon/pull/3522 However that turned out to be more complicated than I initially expected due to 1) borrow checking and 2) anon Future types. 1) required Rc> which is Send construct just to satisfy the checker; 2) can be workaround with transmute. But this is so messy that I decided to leave split. --- Cargo.lock | 23 +- control_plane/Cargo.toml | 1 + control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/compute.rs | 2 +- control_plane/src/local_env.rs | 2 +- control_plane/src/pageserver.rs | 2 +- libs/postgres_backend/Cargo.toml | 1 - libs/postgres_backend/src/lib.rs | 723 ++++++++++++------ .../tests/cert.pem | 0 .../{utils => postgres_backend}/tests/key.pem | 0 libs/postgres_backend/tests/simple_select.rs | 139 ++++ libs/pq_proto/src/framed.rs | 175 +++++ libs/pq_proto/src/lib.rs | 309 ++++---- libs/pq_proto/src/sync.rs | 179 ----- libs/remote_storage/src/lib.rs | 2 +- libs/utils/Cargo.toml | 4 - libs/utils/src/lib.rs | 4 - libs/utils/src/postgres_backend.rs | 544 ------------- libs/utils/src/sock_split.rs | 206 ----- libs/utils/tests/ssl_test.rs | 238 ------ pageserver/src/bin/pageserver.rs | 5 +- pageserver/src/config.rs | 2 +- pageserver/src/page_service.rs | 20 +- .../walreceiver/walreceiver_connection.rs | 4 +- proxy/Cargo.toml | 1 + proxy/src/console/mgmt.rs | 43 +- proxy/src/stream.rs | 4 +- run_clippy.sh | 10 +- safekeeper/Cargo.toml | 1 + safekeeper/src/bin/safekeeper.rs | 2 +- safekeeper/src/handler.rs | 62 +- safekeeper/src/http/routes.rs | 9 +- safekeeper/src/json_ctrl.rs | 41 +- safekeeper/src/lib.rs | 3 +- safekeeper/src/receive_wal.rs | 421 ++++++---- safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/send_wal.rs | 454 ++++++----- safekeeper/src/timeline.rs | 11 +- safekeeper/src/timelines_global_map.rs | 42 +- safekeeper/src/wal_backup.rs | 4 +- safekeeper/src/wal_service.rs | 87 ++- safekeeper/src/wal_storage.rs | 4 +- 
test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_wal_acceptor.py | 4 +- workspace_hack/Cargo.toml | 6 + 45 files changed, 1657 insertions(+), 2145 deletions(-) rename libs/{utils => postgres_backend}/tests/cert.pem (100%) rename libs/{utils => postgres_backend}/tests/key.pem (100%) create mode 100644 libs/postgres_backend/tests/simple_select.rs create mode 100644 libs/pq_proto/src/framed.rs delete mode 100644 libs/pq_proto/src/sync.rs delete mode 100644 libs/utils/src/postgres_backend.rs delete mode 100644 libs/utils/src/sock_split.rs delete mode 100644 libs/utils/tests/ssl_test.rs diff --git a/Cargo.lock b/Cargo.lock index ab2f69929e..e380e72dc0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -913,6 +913,7 @@ dependencies = [ "once_cell", "pageserver_api", "postgres", + "postgres_backend", "postgres_connection", "regex", "reqwest", @@ -2696,7 +2697,6 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls", "tracing", - "utils", "workspace_hack", ] @@ -2922,6 +2922,7 @@ dependencies = [ "opentelemetry", "parking_lot", "pin-project-lite", + "postgres_backend", "pq_proto", "prometheus", "rand", @@ -3301,15 +3302,6 @@ dependencies = [ "base64 0.21.0", ] -[[package]] -name = "rustls-split" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78802c9612b4689d207acff746f38132ca1b12dadb55d471aa5f10fd580f47d3" -dependencies = [ - "rustls", -] - [[package]] name = "rustversion" version = "1.0.11" @@ -3346,6 +3338,7 @@ dependencies = [ "parking_lot", "postgres", "postgres-protocol", + "postgres_backend", "postgres_ffi", "pq_proto", "regex", @@ -4539,12 +4532,8 @@ dependencies = [ "metrics", "nix", "once_cell", - "pq_proto", "rand", "routerify", - "rustls", - "rustls-pemfile", - "rustls-split", "sentry", "serde", "serde_json", @@ -4858,14 +4847,19 @@ name = "workspace_hack" version = "0.1.0" dependencies = [ "anyhow", + "byteorder", "bytes", "chrono", "clap 4.1.4", "crossbeam-utils", + "digest", "either", 
"fail", "futures", + "futures-channel", + "futures-core", "futures-executor", + "futures-sink", "futures-util", "hashbrown 0.12.3", "indexmap", @@ -4890,6 +4884,7 @@ dependencies = [ "socket2", "syn", "tokio", + "tokio-rustls", "tokio-util", "tonic", "tower", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 309887e1fa..ba39747e03 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -24,6 +24,7 @@ url.workspace = true # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api # instead, so that recompile times are better. pageserver_api.workspace = true +postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 4b2aa3c957..49b1d31dbc 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -17,6 +17,7 @@ use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; +use postgres_backend::AuthType; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, @@ -30,7 +31,6 @@ use utils::{ auth::{Claims, Scope}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - postgres_backend::AuthType, project_git_version, }; diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 8731cf2583..b7029aabc5 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -11,10 +11,10 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; +use postgres_backend::AuthType; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, - postgres_backend::AuthType, }; use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs 
index 003152c578..09180d96c4 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -5,6 +5,7 @@ use anyhow::{bail, ensure, Context}; +use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -19,7 +20,6 @@ use std::process::{Command, Stdio}; use utils::{ auth::{encode_from_key_file, Claims, Scope}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - postgres_backend::AuthType, }; use crate::safekeeper::SafekeeperNode; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index c49bd39f09..4b7180c250 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -11,6 +11,7 @@ use anyhow::{bail, Context}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; +use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; @@ -20,7 +21,6 @@ use utils::{ http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, - postgres_backend::AuthType, }; use crate::{background_process, local_env::LocalEnv}; diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index bead77c4d6..8e249c09f7 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -17,7 +17,6 @@ tokio-rustls.workspace = true tracing.workspace = true pq_proto.workspace = true -utils.workspace = true workspace_hack.workspace = true [dev-dependencies] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 6e96e65a52..ba28add9f9 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -2,29 +2,26 @@ //! To use, create PostgresBackend and run() it, passing the Handler //! implementation determining how to process the queries. Currently its API //! 
is rather narrow, but we can extend it once required. - use anyhow::Context; -use bytes::{Buf, Bytes, BytesMut}; -use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR}; -use std::io; +use bytes::Bytes; +use futures::pin_mut; +use serde::{Deserialize, Serialize}; +use std::io::ErrorKind; use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; -use std::task::Poll; -use std::{future::Future, task::ready}; -use tracing::{debug, error, info, trace}; -use utils::postgres_backend::AuthType; - -use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader}; +use std::task::{ready, Poll}; +use std::{fmt, io}; +use std::{future::Future, str::FromStr}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; +use tracing::{debug, error, info, trace}; -pub fn is_expected_io_error(e: &io::Error) -> bool { - use io::ErrorKind::*; - matches!( - e.kind(), - ConnectionRefused | ConnectionAborted | ConnectionReset - ) -} +use pq_proto::framed::{Framed, FramedReader, FramedWriter}; +use pq_proto::{ + BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR, + SQLSTATE_SUCCESSFUL_COMPLETION, +}; /// An error, occurred during query processing: /// either during the connection ([`ConnectionError`]) or before/after it. @@ -53,12 +50,20 @@ impl QueryError { } } +pub fn is_expected_io_error(e: &io::Error) -> bool { + use io::ErrorKind::*; + matches!( + e.kind(), + ConnectionRefused | ConnectionAborted | ConnectionReset + ) +} + #[async_trait::async_trait] pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't - /// care). + /// care). It will also flush out the output buffer. async fn process_query( &mut self, pgb: &mut PostgresBackend, @@ -92,9 +97,13 @@ pub trait Handler { /// XXX: The order of the constructors matters. 
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] pub enum ProtoState { + /// Nothing happened yet. Initialization, + /// Encryption handshake is done; waiting for encrypted Startup message. Encrypted, + /// Waiting for password (auth token). Authentication, + /// Performed handshake and auth, ReadyForQuery is issued. Established, Closed, } @@ -105,15 +114,13 @@ pub enum ProcessMsgResult { Break, } -/// Always-writeable sock_split stream. -/// May not be readable. See [`PostgresBackend::take_stream_in`] -pub enum Stream { - Unencrypted(BufReader), - Tls(Box>>), - Broken, +/// Either plain TCP stream or encrypted one, implementing AsyncRead + AsyncWrite. +pub enum MaybeTlsStream { + Unencrypted(tokio::net::TcpStream), + Tls(Box>), } -impl AsyncWrite for Stream { +impl AsyncWrite for MaybeTlsStream { fn poll_write( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -122,14 +129,12 @@ impl AsyncWrite for Stream { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), - Self::Broken => unreachable!(), } } fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll> { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), Self::Tls(stream) => Pin::new(stream).poll_flush(cx), - Self::Broken => unreachable!(), } } fn poll_shutdown( @@ -139,11 +144,10 @@ impl AsyncWrite for Stream { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), - Self::Broken => unreachable!(), } } } -impl AsyncRead for Stream { +impl AsyncRead for MaybeTlsStream { fn poll_read( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -152,18 +156,96 @@ impl AsyncRead for Stream { match self.get_mut() { Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), - Self::Broken => unreachable!(), + } 
+ } +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +pub enum AuthType { + Trust, + // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT + NeonJWT, +} + +impl FromStr for AuthType { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "Trust" => Ok(Self::Trust), + "NeonJWT" => Ok(Self::NeonJWT), + _ => anyhow::bail!("invalid value \"{s}\" for auth type"), + } + } +} + +impl fmt::Display for AuthType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + AuthType::Trust => "Trust", + AuthType::NeonJWT => "NeonJWT", + }) + } +} + +/// Either full duplex Framed or write only half; the latter is left in +/// PostgresBackend after call to `split`. In principle we could always store a +/// pair of splitted handles, but that would force to to pay splitting price +/// (Arc and kinda mutex inside polling) for all uses (e.g. pageserver). +enum MaybeWriteOnly { + Full(Framed), + WriteOnly(FramedWriter), + Broken, // temporary value palmed off during the split +} + +impl MaybeWriteOnly { + async fn read_startup_message(&mut self) -> Result, ConnectionError> { + match self { + MaybeWriteOnly::Full(framed) => framed.read_startup_message().await, + MaybeWriteOnly::WriteOnly(_) => { + Err(io::Error::new(ErrorKind::Other, "reading from write only half").into()) + } + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } + + async fn read_message(&mut self) -> Result, ConnectionError> { + match self { + MaybeWriteOnly::Full(framed) => framed.read_message().await, + MaybeWriteOnly::WriteOnly(_) => { + Err(io::Error::new(ErrorKind::Other, "reading from write only half").into()) + } + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } + + fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ConnectionError> { + match self { + MaybeWriteOnly::Full(framed) => framed.write_message(msg), + 
MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.write_message_noflush(msg), + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } + + async fn flush(&mut self) -> io::Result<()> { + match self { + MaybeWriteOnly::Full(framed) => framed.flush().await, + MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.flush().await, + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), + } + } + + async fn shutdown(&mut self) -> io::Result<()> { + match self { + MaybeWriteOnly::Full(framed) => framed.shutdown().await, + MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.shutdown().await, + MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), } } } pub struct PostgresBackend { - stream: Stream, - - // Output buffer. c.f. BeMessage::write why we are using BytesMut here. - // The data between 0 and "current position" as tracked by the bytes::Buf - // implementation of BytesMut, have already been written. - buf_out: BytesMut, + framed: MaybeWriteOnly, pub state: ProtoState, @@ -183,7 +265,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec { query_string } -// Cast a byte slice to a string slice, dropping null terminator if there's one. +/// Cast a byte slice to a string slice, dropping null terminator if there's one. fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) @@ -196,10 +278,10 @@ impl PostgresBackend { tls_config: Option>, ) -> io::Result { let peer_addr = socket.peer_addr()?; + let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { - stream: Stream::Unencrypted(BufReader::new(socket)), - buf_out: BytesMut::with_capacity(10 * 1024), + framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, tls_config, @@ -211,30 +293,52 @@ impl PostgresBackend { &self.peer_addr } - /// Read full message or return None if connection is closed. 
- pub async fn read_message(&mut self) -> Result, QueryError> { - use ProtoState::*; - match self.state { - Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, - Authentication | Established => FeMessage::read_fut(&mut self.stream).await, - Closed => Ok(None), + /// Read full message or return None if connection is cleanly closed with no + /// unprocessed data. + pub async fn read_message(&mut self) -> Result, ConnectionError> { + if let ProtoState::Closed = self.state { + Ok(None) + } else { + let m = self.framed.read_message().await?; + trace!("read msg {:?}", m); + Ok(m) } - .map_err(QueryError::from) + } + + /// Write message into internal output buffer, doesn't flush it. Technically + /// error type can be only ProtocolError here (if, unlikely, serialization + /// fails), but callers typically wrap it anyway. + pub fn write_message_noflush( + &mut self, + message: &BeMessage<'_>, + ) -> Result<&mut Self, ConnectionError> { + self.framed.write_message_noflush(message)?; + trace!("wrote msg {:?}", message); + Ok(self) } /// Flush output buffer into the socket. pub async fn flush(&mut self) -> io::Result<()> { - while self.buf_out.has_remaining() { - let bytes_written = self.stream.write(self.buf_out.chunk()).await?; - self.buf_out.advance(bytes_written); - } - self.buf_out.clear(); - Ok(()) + self.framed.flush().await } - /// Write message into internal output buffer. - pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { - BeMessage::write(&mut self.buf_out, message)?; + /// Polling version of `flush()`, saves the caller need to pin. + pub fn poll_flush( + &mut self, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let flush_fut = self.flush(); + pin_mut!(flush_fut); + flush_fut.poll(cx) + } + + /// Write message into internal output buffer and flush it to the stream. 
+ pub async fn write_message( + &mut self, + message: &BeMessage<'_>, + ) -> Result<&mut Self, ConnectionError> { + self.write_message_noflush(message)?; + self.flush().await?; Ok(self) } @@ -246,26 +350,7 @@ impl PostgresBackend { CopyDataWriter { pgb: self } } - /// A polling function that tries to write all the data from 'buf_out' to the - /// underlying stream. - fn poll_write_buf( - &mut self, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - while self.buf_out.has_remaining() { - match ready!(Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk())) { - Ok(bytes_written) => self.buf_out.advance(bytes_written), - Err(err) => return Poll::Ready(Err(err)), - } - } - Poll::Ready(Ok(())) - } - - fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - Pin::new(&mut self.stream).poll_flush(cx) - } - - // Wrapper for run_message_loop() that shuts down socket when we are done + /// Wrapper for run_message_loop() that shuts down socket when we are done pub async fn run( mut self, handler: &mut impl Handler, @@ -276,7 +361,9 @@ impl PostgresBackend { S: Future, { let ret = self.run_message_loop(handler, shutdown_watcher).await; - let _ = self.stream.shutdown(); + // socket might be already closed, e.g. if previously received error, + // so ignore result. + self.framed.shutdown().await.ok(); ret } @@ -300,30 +387,12 @@ impl PostgresBackend { return Ok(()) }, - result = async { - while self.state < ProtoState::Established { - if let Some(msg) = self.read_message().await? { - trace!("got message {msg:?} during handshake"); - - match self.process_handshake_message(handler, msg).await? 
{ - ProcessMsgResult::Continue => { - self.flush().await?; - continue; - } - ProcessMsgResult::Break => { - trace!("postgres backend to {:?} exited during handshake", self.peer_addr); - return Ok(()); - } - } - } else { - trace!("postgres backend to {:?} exited during handshake", self.peer_addr); - return Ok(()); - } - } - Ok::<(), QueryError>(()) - } => { + result = self.handshake(handler) => { // Handshake complete. result?; + if self.state == ProtoState::Closed { + return Ok(()); // EOF during handshake + } } ); @@ -355,114 +424,207 @@ impl PostgresBackend { Ok(()) } - async fn start_tls(&mut self) -> anyhow::Result<()> { - if let Stream::Unencrypted(plain_stream) = - std::mem::replace(&mut self.stream, Stream::Broken) - { - let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap()); - let tls_stream = acceptor.accept(plain_stream).await?; - - self.stream = Stream::Tls(Box::new(tls_stream)); - return Ok(()); - }; - anyhow::bail!("TLS already started"); - } - - async fn process_handshake_message( - &mut self, - handler: &mut impl Handler, - msg: FeMessage, - ) -> Result { - assert!(self.state < ProtoState::Established); - let have_tls = self.tls_config.is_some(); - match msg { - FeMessage::StartupPacket(m) => { - trace!("got startup message {m:?}"); - - match m { - FeStartupPacket::SslRequest => { - debug!("SSL requested"); - - self.write_message_noflush(&BeMessage::EncryptionResponse(have_tls))?; - if have_tls { - self.start_tls().await?; - self.state = ProtoState::Encrypted; - } - } - FeStartupPacket::GssEncRequest => { - debug!("GSS requested"); - self.write_message_noflush(&BeMessage::EncryptionResponse(false))?; - } - FeStartupPacket::StartupMessage { .. 
} => { - if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message_noflush(&BeMessage::ErrorResponse( - "must connect with TLS", - None, - ))?; - return Err(QueryError::Other(anyhow::anyhow!( - "client did not connect with TLS" - ))); - } - - // NB: startup() may change self.auth_type -- we are using that in proxy code - // to bypass auth for new users. - handler.startup(self, &m)?; - - match self.auth_type { - AuthType::Trust => { - self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeMessage::CLIENT_ENCODING)? - // The async python driver requires a valid server_version - .write_message_noflush(&BeMessage::server_version("14.1"))? - .write_message_noflush(&BeMessage::ReadyForQuery)?; - self.state = ProtoState::Established; - } - AuthType::NeonJWT => { - self.write_message_noflush( - &BeMessage::AuthenticationCleartextPassword, - )?; - self.state = ProtoState::Authentication; - } - } - } - FeStartupPacket::CancelRequest { .. } => { - self.state = ProtoState::Closed; - return Ok(ProcessMsgResult::Break); - } - } + /// Try to upgrade MaybeTlsStream into actual TLS one, performing handshake. 
+ async fn tls_upgrade( + src: MaybeTlsStream, + tls_config: Arc, + ) -> anyhow::Result { + match src { + MaybeTlsStream::Unencrypted(s) => { + let acceptor = TlsAcceptor::from(tls_config); + let tls_stream = acceptor.accept(s).await?; + Ok(MaybeTlsStream::Tls(Box::new(tls_stream))) } - - FeMessage::PasswordMessage(m) => { - trace!("got password message '{:?}'", m); - - assert!(self.state == ProtoState::Authentication); - - match self.auth_type { - AuthType::Trust => unreachable!(), - AuthType::NeonJWT => { - let (_, jwt_response) = m.split_last().context("protocol violation")?; - - if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))?; - return Err(e); - } - } - } - self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeMessage::CLIENT_ENCODING)? - .write_message_noflush(&BeMessage::ReadyForQuery)?; - self.state = ProtoState::Established; - } - - _ => { - self.state = ProtoState::Closed; - return Ok(ProcessMsgResult::Break); + MaybeTlsStream::Tls(_) => { + anyhow::bail!("TLS already started"); } } - Ok(ProcessMsgResult::Continue) + } + + async fn start_tls(&mut self) -> anyhow::Result<()> { + // temporary replace stream with fake to cook TLS one, Indiana Jones style + match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { + MaybeWriteOnly::Full(framed) => { + let tls_config = self + .tls_config + .as_ref() + .context("start_tls called without conf")? 
+ .clone(); + let tls_framed = framed + .map_stream(|s| PostgresBackend::tls_upgrade(s, tls_config)) + .await?; + // push back ready TLS stream + self.framed = MaybeWriteOnly::Full(tls_framed); + Ok(()) + } + MaybeWriteOnly::WriteOnly(_) => { + anyhow::bail!("TLS upgrade attempt in split state") + } + MaybeWriteOnly::Broken => panic!("TLS upgrade on framed in invalid state"), + } + } + + /// Split off owned read part from which messages can be read in different + /// task/thread. + pub fn split(&mut self) -> anyhow::Result { + // temporary replace stream with fake to cook split one, Indiana Jones style + match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { + MaybeWriteOnly::Full(framed) => { + let (reader, writer) = framed.split(); + self.framed = MaybeWriteOnly::WriteOnly(writer); + Ok(PostgresBackendReader(reader)) + } + MaybeWriteOnly::WriteOnly(_) => { + anyhow::bail!("PostgresBackend is already split") + } + MaybeWriteOnly::Broken => panic!("split on framed in invalid state"), + } + } + + /// Join read part back. + pub fn unsplit(&mut self, reader: PostgresBackendReader) -> anyhow::Result<()> { + // temporary replace stream with fake to cook joined one, Indiana Jones style + match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { + MaybeWriteOnly::Full(_) => { + anyhow::bail!("PostgresBackend is not split") + } + MaybeWriteOnly::WriteOnly(writer) => { + let joined = Framed::unsplit(reader.0, writer); + self.framed = MaybeWriteOnly::Full(joined); + Ok(()) + } + MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"), + } + } + + /// Perform handshake with the client, transitioning to Established. + /// In case of EOF during handshake logs this, sets state to Closed and returns Ok(()). + async fn handshake(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { + while self.state < ProtoState::Authentication { + match self.framed.read_startup_message().await? 
{ + Some(msg) => { + self.process_startup_message(handler, msg).await?; + } + None => { + trace!( + "postgres backend to {:?} received EOF during handshake", + self.peer_addr + ); + self.state = ProtoState::Closed; + return Ok(()); + } + } + } + + // Perform auth, if needed. + if self.state == ProtoState::Authentication { + match self.framed.read_message().await? { + Some(FeMessage::PasswordMessage(m)) => { + assert!(self.auth_type == AuthType::NeonJWT); + + let (_, jwt_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_jwt(self, jwt_response) { + self.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))?; + return Err(e); + } + + self.write_message_noflush(&BeMessage::AuthenticationOk)? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? + .write_message(&BeMessage::ReadyForQuery) + .await?; + self.state = ProtoState::Established; + } + Some(m) => { + return Err(QueryError::Other(anyhow::anyhow!( + "Unexpected message {:?} while waiting for handshake", + m + ))); + } + None => { + trace!( + "postgres backend to {:?} received EOF during auth", + self.peer_addr + ); + self.state = ProtoState::Closed; + return Ok(()); + } + } + } + + Ok(()) + } + + /// Process startup packet: + /// - transition to Established if auth type is trust + /// - transition to Authentication if auth type is NeonJWT. + /// - or perform TLS handshake -- then need to call this again to receive + /// actual startup packet. 
+ async fn process_startup_message( + &mut self, + handler: &mut impl Handler, + msg: FeStartupPacket, + ) -> Result<(), QueryError> { + assert!(self.state < ProtoState::Authentication); + let have_tls = self.tls_config.is_some(); + match msg { + FeStartupPacket::SslRequest => { + debug!("SSL requested"); + + self.write_message(&BeMessage::EncryptionResponse(have_tls)) + .await?; + + if have_tls { + self.start_tls().await?; + self.state = ProtoState::Encrypted; + } + } + FeStartupPacket::GssEncRequest => { + debug!("GSS requested"); + self.write_message(&BeMessage::EncryptionResponse(false)) + .await?; + } + FeStartupPacket::StartupMessage { .. } => { + if have_tls && !matches!(self.state, ProtoState::Encrypted) { + self.write_message(&BeMessage::ErrorResponse("must connect with TLS", None)) + .await?; + return Err(QueryError::Other(anyhow::anyhow!( + "client did not connect with TLS" + ))); + } + + // NB: startup() may change self.auth_type -- we are using that in proxy code + // to bypass auth for new users. + handler.startup(self, &msg)?; + + match self.auth_type { + AuthType::Trust => { + self.write_message_noflush(&BeMessage::AuthenticationOk)? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? + .write_message_noflush(&BeMessage::INTEGER_DATETIMES)? + // The async python driver requires a valid server_version + .write_message_noflush(&BeMessage::server_version("14.1"))? + .write_message(&BeMessage::ReadyForQuery) + .await?; + self.state = ProtoState::Established; + } + AuthType::NeonJWT => { + self.write_message(&BeMessage::AuthenticationCleartextPassword) + .await?; + self.state = ProtoState::Authentication; + } + } + } + FeStartupPacket::CancelRequest { .. 
} => { + return Err(QueryError::Other(anyhow::anyhow!( + "Unexpected CancelRequest message during handshake" + ))); + } + } + Ok(()) } async fn process_message( @@ -476,10 +638,6 @@ impl PostgresBackend { assert!(self.state == ProtoState::Established); match msg { - FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { - return Err(QueryError::Other(anyhow::anyhow!("protocol violation"))); - } - FeMessage::Query(body) => { // remove null terminator let query_string = cstr_to_str(&body)?; @@ -540,16 +698,114 @@ impl PostgresBackend { // We prefer explicit pattern matching to wildcards, because // this helps us spot the places where new variants are missing - FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { + FeMessage::CopyData(_) + | FeMessage::CopyDone + | FeMessage::CopyFail + | FeMessage::PasswordMessage(_) + | FeMessage::StartupPacket(_) => { return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message type: {:?}", - msg + "unexpected message type: {msg:?}", ))); } } Ok(ProcessMsgResult::Continue) } + + /// Log as info/error result of handling COPY stream and send back + /// ErrorResponse if that makes sense. Shutdown the stream if we got + /// Terminate. TODO: transition into waiting for Sync msg if we initiate the + /// close. 
+ pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) { + use CopyStreamHandlerEnd::*; + + let expected_end = match &end { + ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true, + CopyStreamHandlerEnd::Disconnected(ConnectionError::Socket(io_error)) + if is_expected_io_error(io_error) => + { + true + } + _ => false, + }; + if expected_end { + info!("terminated: {:#}", end); + } else { + error!("terminated: {:?}", end); + } + + // Note: no current usages ever send this + if let CopyDone = &end { + if let Err(e) = self.write_message(&BeMessage::CopyDone).await { + error!("failed to send CopyDone: {}", e); + } + } + + if let Terminate = &end { + self.state = ProtoState::Closed; + } + + let err_to_send_and_errcode = match &end { + ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), + Other(_) => Some((end.to_string(), SQLSTATE_INTERNAL_ERROR)), + // Note: CopyFail in duplex copy is somewhat unexpected (at least to + // PG walsender; evidently and per my docs reading client should + // finish it with CopyDone). It is not a problem to recover from it + // finishing the stream in both directions like we do, but note that + // sync rust-postgres client (which we don't use anymore) hangs if + // socket is not closed here. + // https://github.com/sfackler/rust-postgres/issues/755 + // https://github.com/neondatabase/neon/issues/935 + // + // Currently, the version of tokio_postgres replication patch we use + // sends this when it closes the stream (e.g. pageserver decided to + // switch conn to another safekeeper and client gets dropped). + // Moreover, seems like 'connection' task errors with 'unexpected + // message from server' when it receives ErrorResponse (anything but + // CopyData/CopyDone) back. 
+ CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), + _ => None, + }; + if let Some((err, errcode)) = err_to_send_and_errcode { + if let Err(ee) = self + .write_message(&BeMessage::ErrorResponse(&err, Some(errcode))) + .await + { + error!("failed to send ErrorResponse: {}", ee); + } + } + } +} + +pub struct PostgresBackendReader(FramedReader); + +impl PostgresBackendReader { + /// Read full message or return None if connection is cleanly closed with no + /// unprocessed data. + pub async fn read_message(&mut self) -> Result, ConnectionError> { + let m = self.0.read_message().await?; + trace!("read msg {:?}", m); + Ok(m) + } + + /// Get CopyData contents of the next message in COPY stream or error + /// closing it. The error type is wider than actual errors which can happen + /// here -- it includes 'Other' and 'ServerInitiated', but that's ok for + /// current callers. + pub async fn read_copy_message(&mut self) -> Result { + match self.read_message().await? { + Some(msg) => match msg { + FeMessage::CopyData(m) => Ok(m), + FeMessage::CopyDone => Err(CopyStreamHandlerEnd::CopyDone), + FeMessage::CopyFail => Err(CopyStreamHandlerEnd::CopyFail), + FeMessage::Terminate => Err(CopyStreamHandlerEnd::Terminate), + _ => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol( + format!("unexpected message in COPY stream {:?}", msg), + ))), + }, + None => Err(CopyStreamHandlerEnd::EOF), + } + } } /// @@ -572,16 +828,19 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> { // It's not strictly required to flush between each message, but makes it easier // to view in wireshark, and usually the messages that the callers write are // decently-sized anyway. - match ready!(this.pgb.poll_write_buf(cx)) { - Ok(()) => {} - Err(err) => return Poll::Ready(Err(err)), + if let Err(err) = ready!(this.pgb.poll_flush(cx)) { + return Poll::Ready(Err(err)); } // CopyData // XXX: if the input is large, we should split it into multiple messages. 
// Not sure what the threshold should be, but the ultimate hard limit is that // the length cannot exceed u32. - this.pgb.write_message_noflush(&BeMessage::CopyData(buf))?; + this.pgb + .write_message_noflush(&BeMessage::CopyData(buf)) + // write_message only writes to the buffer, so it can fail iff the + // message is invaid, but CopyData can't be invalid. + .map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?; Poll::Ready(Ok(buf.len())) } @@ -591,21 +850,14 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> { cx: &mut std::task::Context<'_>, ) -> Poll> { let this = self.get_mut(); - match ready!(this.pgb.poll_write_buf(cx)) { - Ok(()) => {} - Err(err) => return Poll::Ready(Err(err)), - } this.pgb.poll_flush(cx) } + fn poll_shutdown( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, ) -> Poll> { let this = self.get_mut(); - match ready!(this.pgb.poll_write_buf(cx)) { - Ok(()) => {} - Err(err) => return Poll::Ready(Err(err)), - } this.pgb.poll_flush(cx) } } @@ -617,7 +869,7 @@ pub fn short_error(e: &QueryError) -> String { } } -pub fn log_query_error(query: &str, e: &QueryError) { +fn log_query_error(query: &str, e: &QueryError) { match e { QueryError::Disconnected(ConnectionError::Socket(io_error)) => { if is_expected_io_error(io_error) { @@ -634,3 +886,26 @@ pub fn log_query_error(query: &str, e: &QueryError) { } } } + +/// Something finishing handling of COPY stream, see handle_copy_stream_end. +/// This is not always a real error, but it allows to use ? and thiserror impls. +#[derive(thiserror::Error, Debug)] +pub enum CopyStreamHandlerEnd { + /// Handler initiates the end of streaming. 
+ #[error("{0}")] + ServerInitiated(String), + #[error("received CopyDone")] + CopyDone, + #[error("received CopyFail")] + CopyFail, + #[error("received Terminate")] + Terminate, + #[error("EOF on COPY stream")] + EOF, + /// The connection was lost + #[error(transparent)] + Disconnected(#[from] ConnectionError), + /// Some other error + #[error(transparent)] + Other(#[from] anyhow::Error), +} diff --git a/libs/utils/tests/cert.pem b/libs/postgres_backend/tests/cert.pem similarity index 100% rename from libs/utils/tests/cert.pem rename to libs/postgres_backend/tests/cert.pem diff --git a/libs/utils/tests/key.pem b/libs/postgres_backend/tests/key.pem similarity index 100% rename from libs/utils/tests/key.pem rename to libs/postgres_backend/tests/key.pem diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs new file mode 100644 index 0000000000..a310171c70 --- /dev/null +++ b/libs/postgres_backend/tests/simple_select.rs @@ -0,0 +1,139 @@ +/// Test postgres_backend_async with tokio_postgres +use once_cell::sync::Lazy; +use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; +use pq_proto::{BeMessage, RowDescriptor}; +use std::io::Cursor; +use std::{future, sync::Arc}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; +use tokio_postgres_rustls::MakeRustlsConnect; + +// generate client, server test streams +async fn make_tcp_pair() -> (TcpStream, TcpStream) { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let client_stream = TcpStream::connect(addr).await.unwrap(); + let (server_stream, _) = listener.accept().await.unwrap(); + (client_stream, server_stream) +} + +struct TestHandler {} + +#[async_trait::async_trait] +impl Handler for TestHandler { + // return single col 'hey' for any query + async fn 
process_query( + &mut self, + pgb: &mut PostgresBackend, + _query_string: &str, + ) -> Result<(), QueryError> { + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"hey", + )]))? + .write_message_noflush(&BeMessage::DataRow(&[Some("hey".as_bytes())]))? + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Ok(()) + } +} + +// test that basic select works +#[tokio::test] +async fn simple_select() { + let (client_sock, server_sock) = make_tcp_pair().await; + + // create and run pgbackend + let pgbackend = + PostgresBackend::new(server_sock, AuthType::Trust, None).expect("pgbackend creation"); + + tokio::spawn(async move { + let mut handler = TestHandler {}; + pgbackend.run(&mut handler, future::pending::<()>).await + }); + + let conf = Config::new(); + let (client, connection) = conf.connect_raw(client_sock, NoTls).await.expect("connect"); + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0]; + if let SimpleQueryMessage::Row(row) = first_val { + let first_col = row.get(0).expect("first column"); + assert_eq!(first_col, "hey"); + } else { + panic!("expected SimpleQueryMessage::Row"); + } +} + +static KEY: Lazy = Lazy::new(|| { + let mut cursor = Cursor::new(include_bytes!("key.pem")); + rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) +}); + +static CERT: Lazy = Lazy::new(|| { + let mut cursor = Cursor::new(include_bytes!("cert.pem")); + rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) +}); + +// test that basic select with ssl works +#[tokio::test] +async fn simple_select_ssl() { + let (client_sock, server_sock) = make_tcp_pair().await; + + let server_cfg = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone()) + .unwrap(); + let tls_config = Some(Arc::new(server_cfg)); + let pgbackend = + PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation"); + + tokio::spawn(async move { + let mut handler = TestHandler {}; + pgbackend.run(&mut handler, future::pending::<()>).await + }); + + let client_cfg = rustls::ClientConfig::builder() + .with_safe_defaults() + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(&CERT).unwrap(); + store + }) + .with_no_client_auth(); + let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); + let tls_connect = >::make_tls_connect( + &mut make_tls_connect, + "localhost", + ) + .expect("make_tls_connect"); + + let mut conf = Config::new(); + conf.ssl_mode(SslMode::Require); + let (client, connection) = conf + .connect_raw(client_sock, tls_connect) + .await + .expect("connect"); + // The 
connection object performs the actual communication with the database, + // so spawn it off to run on its own. + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0]; + if let SimpleQueryMessage::Row(row) = first_val { + let first_col = row.get(0).expect("first column"); + assert_eq!(first_col, "hey"); + } else { + panic!("expected SimpleQueryMessage::Row"); + } +} diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs new file mode 100644 index 0000000000..7c33222e6e --- /dev/null +++ b/libs/pq_proto/src/framed.rs @@ -0,0 +1,175 @@ +//! Provides `Framed` -- writing/flushing and reading Postgres messages to/from +//! the async stream. +use bytes::{Buf, BytesMut}; +use std::{ + future::Future, + io::{self, ErrorKind}, +}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader, ReadHalf, WriteHalf}; + +use crate::{BeMessage, ConnectionError, FeMessage, FeStartupPacket}; + +const INITIAL_CAPACITY: usize = 8 * 1024; + +/// Wraps async io `stream`, providing messages to write/flush + read Postgres +/// messages. +pub struct Framed { + stream: BufReader, + write_buf: BytesMut, +} + +impl Framed { + pub fn new(stream: S) -> Self { + Self { + stream: BufReader::new(stream), + write_buf: BytesMut::with_capacity(INITIAL_CAPACITY), + } + } + + /// Get a shared reference to the underlying stream. + pub fn get_ref(&self) -> &S { + self.stream.get_ref() + } + + /// Extract the underlying stream. + pub fn into_inner(self) -> S { + self.stream.into_inner() + } + + /// Return new Framed with stream type transformed by async f, for TLS + /// upgrade. 
+ pub async fn map_stream(self, f: F) -> Result, E> + where + F: FnOnce(S) -> Fut, + Fut: Future>, + { + let stream = f(self.stream.into_inner()).await?; + Ok(Framed { + stream: BufReader::new(stream), + write_buf: self.write_buf, + }) + } +} + +impl Framed { + pub async fn read_startup_message( + &mut self, + ) -> Result, ConnectionError> { + let msg = FeStartupPacket::read(&mut self.stream).await?; + + match msg { + Some(FeMessage::StartupPacket(packet)) => Ok(Some(packet)), + None => Ok(None), + _ => panic!("unreachable state"), + } + } + + pub async fn read_message(&mut self) -> Result, ConnectionError> { + FeMessage::read(&mut self.stream).await + } +} + +impl Framed { + /// Write next message to the output buffer; doesn't flush. + pub fn write_message(&mut self, msg: &BeMessage<'_>) -> Result<(), ConnectionError> { + BeMessage::write(&mut self.write_buf, msg).map_err(|e| e.into()) + } + + /// Flush out the buffer. This function is cancellation safe: it can be + /// interrupted and flushing will be continued in the next call. + pub async fn flush(&mut self) -> Result<(), io::Error> { + flush(&mut self.stream, &mut self.write_buf).await + } + + /// Flush out the buffer and shutdown the stream. + pub async fn shutdown(&mut self) -> Result<(), io::Error> { + shutdown(&mut self.stream, &mut self.write_buf).await + } +} + +impl Framed { + /// Split into owned read and write parts. Beware of potential issues with + /// using halves in different tasks on TLS stream: + /// https://github.com/tokio-rs/tls/issues/40 + pub fn split(self) -> (FramedReader, FramedWriter) { + let (read_half, write_half) = tokio::io::split(self.stream); + let reader = FramedReader { stream: read_half }; + let writer = FramedWriter { + stream: write_half, + write_buf: self.write_buf, + }; + (reader, writer) + } + + /// Join read and write parts back. 
+ pub fn unsplit(reader: FramedReader, writer: FramedWriter) -> Self { + Self { + stream: reader.stream.unsplit(writer.stream), + write_buf: writer.write_buf, + } + } +} + +/// Read-only version of `Framed`. +pub struct FramedReader { + stream: ReadHalf>, +} + +impl FramedReader { + pub async fn read_message(&mut self) -> Result, ConnectionError> { + FeMessage::read(&mut self.stream).await + } +} + +/// Write-only version of `Framed`. +pub struct FramedWriter { + stream: WriteHalf>, + write_buf: BytesMut, +} + +impl FramedWriter { + /// Write next message to the output buffer; doesn't flush. + pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ConnectionError> { + BeMessage::write(&mut self.write_buf, msg).map_err(|e| e.into()) + } + + /// Flush out the buffer. This function is cancellation safe: it can be + /// interrupted and flushing will be continued in the next call. + pub async fn flush(&mut self) -> Result<(), io::Error> { + flush(&mut self.stream, &mut self.write_buf).await + } + + /// Flush out the buffer and shutdown the stream. + pub async fn shutdown(&mut self) -> Result<(), io::Error> { + shutdown(&mut self.stream, &mut self.write_buf).await + } +} + +async fn flush( + stream: &mut S, + write_buf: &mut BytesMut, +) -> Result<(), io::Error> { + while write_buf.has_remaining() { + let bytes_written = stream.write(write_buf.chunk()).await?; + if bytes_written == 0 { + return Err(io::Error::new( + ErrorKind::WriteZero, + "failed to write message", + )); + } + // The advanced part will be garbage collected, likely during shifting + // data left on next attempt to write to buffer when free space is not + // enough. 
+ write_buf.advance(bytes_written); + } + write_buf.clear(); + stream.flush().await +} + +async fn shutdown( + stream: &mut S, + write_buf: &mut BytesMut, +) -> Result<(), io::Error> { + flush(stream, write_buf).await?; + stream.shutdown().await +} diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index b7995c840c..6980c4afae 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -2,8 +2,7 @@ //! //! on message formats. -// Tools for calling certain async methods in sync contexts. -pub mod sync; +pub mod framed; use anyhow::{ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -13,12 +12,10 @@ use std::{ borrow::Cow, collections::HashMap, fmt, - future::Future, io::{self, Cursor}, str, time::{Duration, SystemTime}, }; -use sync::{AsyncishRead, SyncFuture}; use tokio::io::AsyncReadExt; use tracing::{trace, warn}; @@ -211,7 +208,7 @@ macro_rules! retry_read { pub enum ConnectionError { /// IO error during writing to or reading from the connection socket. #[error("Socket IO error: {0}")] - Socket(std::io::Error), + Socket(#[from] std::io::Error), /// Invalid packet was received from client #[error("Protocol error: {0}")] Protocol(String), @@ -238,87 +235,56 @@ impl ConnectionError { impl FeMessage { /// Read one message from the stream. /// This function returns `Ok(None)` in case of EOF. - /// One way to handle this properly: - /// - /// ``` - /// # use std::io; - /// # use pq_proto::FeMessage; - /// # - /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> { - /// # Ok(()) - /// # }; - /// # - /// fn do_the_job(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<()> { - /// while let Some(msg) = FeMessage::read(stream)? { - /// process_message(msg)?; - /// } - /// - /// Ok(()) - /// } - /// ``` - #[inline(never)] - pub fn read( - stream: &mut (impl io::Read + Unpin), - ) -> Result, ConnectionError> { - Self::read_fut(&mut AsyncishRead(stream)).wait() - } - - /// Read one message from the stream. 
- /// See documentation for `Self::read`. - pub fn read_fut( - stream: &mut Reader, - ) -> SyncFuture, ConnectionError>> + '_> + pub async fn read(stream: &mut Reader) -> Result, ConnectionError> where Reader: tokio::io::AsyncRead + Unpin, { // We return a Future that's sync (has a `wait` method) if and only if the provided stream is SyncProof. // SyncFuture contract: we are only allowed to await on sync-proof futures, the AsyncRead and // AsyncReadExt methods of the stream. - SyncFuture::new(async move { - // Each libpq message begins with a message type byte, followed by message length - // If the client closes the connection, return None. But if the client closes the - // connection in the middle of a message, we will return an error. - let tag = match retry_read!(stream.read_u8().await) { - Ok(b) => b, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(ConnectionError::Socket(e)), - }; + // Each libpq message begins with a message type byte, followed by message length + // If the client closes the connection, return None. But if the client closes the + // connection in the middle of a message, we will return an error. + let tag = match retry_read!(stream.read_u8().await) { + Ok(b) => b, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(ConnectionError::Socket(e)), + }; - // The message length includes itself, so it better be at least 4. - let len = retry_read!(stream.read_u32().await) - .map_err(ConnectionError::Socket)? - .checked_sub(4) - .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?; + // The message length includes itself, so it better be at least 4. + let len = retry_read!(stream.read_u32().await) + .map_err(ConnectionError::Socket)? 
+ .checked_sub(4) + .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?; - let body = { - let mut buffer = vec![0u8; len as usize]; - stream - .read_exact(&mut buffer) - .await - .map_err(ConnectionError::Socket)?; - Bytes::from(buffer) - }; + let body = { + let mut buffer = vec![0u8; len as usize]; + stream + .read_exact(&mut buffer) + .await + .map_err(ConnectionError::Socket)?; + Bytes::from(buffer) + }; - match tag { - b'Q' => Ok(Some(FeMessage::Query(body))), - b'P' => Ok(Some(FeParseMessage::parse(body)?)), - b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), - b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), - b'B' => Ok(Some(FeBindMessage::parse(body)?)), - b'C' => Ok(Some(FeCloseMessage::parse(body)?)), - b'S' => Ok(Some(FeMessage::Sync)), - b'X' => Ok(Some(FeMessage::Terminate)), - b'd' => Ok(Some(FeMessage::CopyData(body))), - b'c' => Ok(Some(FeMessage::CopyDone)), - b'f' => Ok(Some(FeMessage::CopyFail)), - b'p' => Ok(Some(FeMessage::PasswordMessage(body))), - tag => { - return Err(ConnectionError::Protocol(format!( - "unknown message tag: {tag},'{body:?}'" - ))) - } + match tag { + b'Q' => Ok(Some(FeMessage::Query(body))), + b'P' => Ok(Some(FeParseMessage::parse(body)?)), + b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), + b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), + b'B' => Ok(Some(FeBindMessage::parse(body)?)), + b'C' => Ok(Some(FeCloseMessage::parse(body)?)), + b'S' => Ok(Some(FeMessage::Sync)), + b'X' => Ok(Some(FeMessage::Terminate)), + b'd' => Ok(Some(FeMessage::CopyData(body))), + b'c' => Ok(Some(FeMessage::CopyDone)), + b'f' => Ok(Some(FeMessage::CopyFail)), + b'p' => Ok(Some(FeMessage::PasswordMessage(body))), + tag => { + return Err(ConnectionError::Protocol(format!( + "unknown message tag: {tag},'{body:?}'" + ))) } - }) + } } } @@ -326,18 +292,7 @@ impl FeStartupPacket { /// Read startup message from the stream. 
// XXX: It's tempting yet undesirable to accept `stream` by value, // since such a change will cause user-supplied &mut references to be consumed - pub fn read( - stream: &mut (impl io::Read + Unpin), - ) -> Result, ConnectionError> { - Self::read_fut(&mut AsyncishRead(stream)).wait() - } - - /// Read startup message from the stream. - // XXX: It's tempting yet undesirable to accept `stream` by value, - // since such a change will cause user-supplied &mut references to be consumed - pub fn read_fut( - stream: &mut Reader, - ) -> SyncFuture, ConnectionError>> + '_> + pub async fn read(stream: &mut Reader) -> Result, ConnectionError> where Reader: tokio::io::AsyncRead + Unpin, { @@ -347,99 +302,96 @@ impl FeStartupPacket { const NEGOTIATE_SSL_CODE: u32 = 5679; const NEGOTIATE_GSS_CODE: u32 = 5680; - SyncFuture::new(async move { - // Read length. If the connection is closed before reading anything (or before - // reading 4 bytes, to be precise), return None to indicate that the connection - // was closed. This matches the PostgreSQL server's behavior, which avoids noise - // in the log if the client opens connection but closes it immediately. - let len = match retry_read!(stream.read_u32().await) { - Ok(len) => len as usize, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(ConnectionError::Socket(e)), - }; + // Read length. If the connection is closed before reading anything (or before + // reading 4 bytes, to be precise), return None to indicate that the connection + // was closed. This matches the PostgreSQL server's behavior, which avoids noise + // in the log if the client opens connection but closes it immediately. 
+ let len = match retry_read!(stream.read_u32().await) { + Ok(len) => len as usize, + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(ConnectionError::Socket(e)), + }; - #[allow(clippy::manual_range_contains)] - if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { + #[allow(clippy::manual_range_contains)] + if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { + return Err(ConnectionError::Protocol(format!( + "invalid message length {len}" + ))); + } + + let request_code = retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?; + + // the rest of startup packet are params + let params_len = len - 8; + let mut params_bytes = vec![0u8; params_len]; + stream + .read_exact(params_bytes.as_mut()) + .await + .map_err(ConnectionError::Socket)?; + + // Parse params depending on request code + let req_hi = request_code >> 16; + let req_lo = request_code & ((1 << 16) - 1); + let message = match (req_hi, req_lo) { + (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { + if params_len != 8 { + return Err(ConnectionError::Protocol( + "expected 8 bytes for CancelRequest params".to_string(), + )); + } + let mut cursor = Cursor::new(params_bytes); + FeStartupPacket::CancelRequest(CancelKeyData { + backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?, + cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?, + }) + } + (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + // Requested upgrade to SSL (aka TLS) + FeStartupPacket::SslRequest + } + (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + // Requested upgrade to GSSAPI + FeStartupPacket::GssEncRequest + } + (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { return Err(ConnectionError::Protocol(format!( - "invalid message length {len}" + "Unrecognized request code {unrecognized_code}" ))); } + // TODO bail if protocol major_version is not 3? 
+ (major_version, minor_version) => { + // Parse pairs of null-terminated strings (key, value). + // See `postgres: ProcessStartupPacket, build_startup_packet`. + let mut tokens = str::from_utf8(&params_bytes) + .context("StartupMessage params: invalid utf-8")? + .strip_suffix('\0') // drop packet's own null + .ok_or_else(|| { + ConnectionError::Protocol( + "StartupMessage params: missing null terminator".to_string(), + ) + })? + .split_terminator('\0'); - let request_code = - retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?; + let mut params = HashMap::new(); + while let Some(name) = tokens.next() { + let value = tokens.next().ok_or_else(|| { + ConnectionError::Protocol( + "StartupMessage params: key without value".to_string(), + ) + })?; - // the rest of startup packet are params - let params_len = len - 8; - let mut params_bytes = vec![0u8; params_len]; - stream - .read_exact(params_bytes.as_mut()) - .await - .map_err(ConnectionError::Socket)?; - - // Parse params depending on request code - let req_hi = request_code >> 16; - let req_lo = request_code & ((1 << 16) - 1); - let message = match (req_hi, req_lo) { - (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { - if params_len != 8 { - return Err(ConnectionError::Protocol( - "expected 8 bytes for CancelRequest params".to_string(), - )); - } - let mut cursor = Cursor::new(params_bytes); - FeStartupPacket::CancelRequest(CancelKeyData { - backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?, - cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?, - }) + params.insert(name.to_owned(), value.to_owned()); } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { - // Requested upgrade to SSL (aka TLS) - FeStartupPacket::SslRequest - } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { - // Requested upgrade to GSSAPI - FeStartupPacket::GssEncRequest - } - (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { - return
Err(ConnectionError::Protocol(format!( - "Unrecognized request code {unrecognized_code}" - ))); - } - // TODO bail if protocol major_version is not 3? - (major_version, minor_version) => { - // Parse pairs of null-terminated strings (key, value). - // See `postgres: ProcessStartupPacket, build_startup_packet`. - let mut tokens = str::from_utf8(&params_bytes) - .context("StartupMessage params: invalid utf-8")? - .strip_suffix('\0') // drop packet's own null - .ok_or_else(|| { - ConnectionError::Protocol( - "StartupMessage params: missing null terminator".to_string(), - ) - })? - .split_terminator('\0'); - let mut params = HashMap::new(); - while let Some(name) = tokens.next() { - let value = tokens.next().ok_or_else(|| { - ConnectionError::Protocol( - "StartupMessage params: key without value".to_string(), - ) - })?; - - params.insert(name.to_owned(), value.to_owned()); - } - - FeStartupPacket::StartupMessage { - major_version, - minor_version, - params: StartupMessageParams { params }, - } + FeStartupPacket::StartupMessage { + major_version, + minor_version, + params: StartupMessageParams { params }, } - }; + } + }; - Ok(Some(FeMessage::StartupPacket(message))) - }) + Ok(Some(FeMessage::StartupPacket(message))) } } @@ -559,6 +511,11 @@ impl<'a> BeMessage<'a> { value: b"UTF8", }; + pub const INTEGER_DATETIMES: Self = Self::ParameterStatus { + name: b"integer_datetimes", + value: b"on", + }; + /// Build a [`BeMessage::ParameterStatus`] holding the server version. pub fn server_version(version: &'a str) -> Self { Self::ParameterStatus { @@ -698,6 +655,7 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result { } pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; +pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000"; impl<'a> BeMessage<'a> { /// Write message to the given buf.
@@ -1149,15 +1107,6 @@ mod tests { let params = make_params("foo\\ bar \\ \\\\ baz\\ lol"); assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]); } - - // Make sure that `read` is sync/async callable - async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) { - let _ = FeMessage::read(&mut [].as_ref()); - let _ = FeMessage::read_fut(stream).await; - - let _ = FeStartupPacket::read(&mut [].as_ref()); - let _ = FeStartupPacket::read_fut(stream).await; - } } fn terminate_code(code: &[u8; 5]) -> [u8; 6] { diff --git a/libs/pq_proto/src/sync.rs b/libs/pq_proto/src/sync.rs deleted file mode 100644 index b7ff1fb70b..0000000000 --- a/libs/pq_proto/src/sync.rs +++ /dev/null @@ -1,179 +0,0 @@ -use pin_project_lite::pin_project; -use std::future::Future; -use std::marker::PhantomData; -use std::pin::Pin; -use std::{io, task}; - -pin_project! { - /// We use this future to mark certain methods - /// as callable in both sync and async modes. - #[repr(transparent)] - pub struct SyncFuture { - #[pin] - inner: T, - _marker: PhantomData, - } -} - -/// This wrapper lets us synchronously wait for inner future's completion -/// (see [`SyncFuture::wait`]) **provided that `S` implements [`SyncProof`]**. -/// For instance, `S` may be substituted with types implementing -/// [`tokio::io::AsyncRead`], but it's not the only viable option. -impl SyncFuture { - /// NOTE: caller should carefully pick a type for `S`, - /// because we don't want to enable [`SyncFuture::wait`] when - /// it's in fact impossible to run the future synchronously. - /// Violation of this contract will not cause UB, but - /// panics and async event loop freezes won't please you.
- /// - /// Example: - /// - /// ``` - /// # use pq_proto::sync::SyncFuture; - /// # use std::future::Future; - /// # use tokio::io::AsyncReadExt; - /// # - /// // Parse a pair of numbers from a stream - /// pub fn parse_pair( - /// stream: &mut Reader, - /// ) -> SyncFuture> + '_> - /// where - /// Reader: tokio::io::AsyncRead + Unpin, - /// { - /// // If `Reader` is a `SyncProof`, this will give caller - /// // an opportunity to use `SyncFuture::wait`, because - /// // `.await` will always result in `Poll::Ready`. - /// SyncFuture::new(async move { - /// let x = stream.read_u32().await?; - /// let y = stream.read_u64().await?; - /// Ok((x, y)) - /// }) - /// } - /// ``` - pub fn new(inner: T) -> Self { - Self { - inner, - _marker: PhantomData, - } - } -} - -impl Future for SyncFuture { - type Output = T::Output; - - /// In async code, [`SyncFuture`] behaves like a regular wrapper. - #[inline(always)] - fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll { - self.project().inner.poll(cx) - } -} - -/// Postulates that we can call [`SyncFuture::wait`]. -/// If implementer is also a [`Future`], it should always -/// return [`task::Poll::Ready`] from [`Future::poll`]. -/// -/// Each implementation should document which futures -/// specifically are being declared sync-proof. -pub trait SyncPostulate {} - -impl SyncPostulate for &T {} -impl SyncPostulate for &mut T {} - -impl SyncFuture { - /// Synchronously wait for future completion. 
- pub fn wait(mut self) -> T::Output { - const RAW_WAKER: task::RawWaker = task::RawWaker::new( - std::ptr::null(), - &task::RawWakerVTable::new( - |_| RAW_WAKER, - |_| panic!("SyncFuture: failed to wake"), - |_| panic!("SyncFuture: failed to wake by ref"), - |_| { /* drop is no-op */ }, - ), - ); - - // SAFETY: We never move `self` during this call; - // furthermore, it will be dropped in the end regardless of panics - let this = unsafe { Pin::new_unchecked(&mut self) }; - - // SAFETY: This waker doesn't do anything apart from panicking - let waker = unsafe { task::Waker::from_raw(RAW_WAKER) }; - let context = &mut task::Context::from_waker(&waker); - - match this.poll(context) { - task::Poll::Ready(res) => res, - _ => panic!("SyncFuture: unexpected pending!"), - } - } -} - -/// This wrapper turns any [`std::io::Read`] into a blocking [`tokio::io::AsyncRead`], -/// which lets us abstract over sync & async readers in methods returning [`SyncFuture`]. -/// NOTE: you **should not** use this in async code. -#[repr(transparent)] -pub struct AsyncishRead(pub T); - -/// This lets us call [`SyncFuture, _>::wait`], -/// and allows the future to await on any of the [`AsyncRead`] -/// and [`AsyncReadExt`] methods on `AsyncishRead`. -impl SyncPostulate for AsyncishRead {} - -impl tokio::io::AsyncRead for AsyncishRead { - #[inline(always)] - fn poll_read( - mut self: Pin<&mut Self>, - _cx: &mut task::Context<'_>, - buf: &mut tokio::io::ReadBuf<'_>, - ) -> task::Poll> { - task::Poll::Ready( - // `Read::read` will block, meaning we don't need a real event loop! 
- self.0 - .read(buf.initialize_unfilled()) - .map(|sz| buf.advance(sz)), - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use tokio::io::{AsyncReadExt, AsyncWriteExt}; - - // async helper(stream: &mut impl AsyncRead) -> io::Result - fn bytes_add( - stream: &mut Reader, - ) -> SyncFuture> + '_> - where - Reader: tokio::io::AsyncRead + Unpin, - { - SyncFuture::new(async move { - let a = stream.read_u32().await?; - let b = stream.read_u32().await?; - Ok(a + b) - }) - } - - #[test] - fn test_sync() { - let bytes = [100u32.to_be_bytes(), 200u32.to_be_bytes()].concat(); - let res = bytes_add(&mut AsyncishRead(&mut &bytes[..])) - .wait() - .unwrap(); - assert_eq!(res, 300); - } - - // We need a single-threaded executor for this test - #[tokio::test(flavor = "current_thread")] - async fn test_async() { - let (mut tx, mut rx) = tokio::net::UnixStream::pair().unwrap(); - - let write = async move { - tx.write_u32(100).await?; - tx.write_u32(200).await?; - Ok(()) - }; - - let (res, ()) = tokio::try_join!(bytes_add(&mut rx), write).unwrap(); - assert_eq!(res, 300); - } -} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 1091a8bd5c..901f849801 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -111,7 +111,7 @@ pub trait RemoteStorage: Send + Sync + 'static { } pub struct Download { - pub download_stream: Pin>, + pub download_stream: Pin>, /// Extra key-value data, associated with the current remote file. 
pub metadata: Option, } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 206e40fce9..b24de57f99 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,8 +19,6 @@ jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true routerify.workspace = true -rustls.workspace = true -rustls-split.workspace = true serde.workspace = true serde_json.workspace = true signal-hook.workspace = true @@ -36,7 +34,6 @@ url.workspace = true uuid = { version = "1.2", features = ["v4", "serde"] } metrics.workspace = true -pq_proto.workspace = true workspace_hack.workspace = true [dev-dependencies] @@ -44,7 +41,6 @@ byteorder.workspace = true bytes.workspace = true criterion.workspace = true hex-literal.workspace = true -rustls-pemfile.workspace = true tempfile.workspace = true [[bench]] diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 7408eb66cd..acb5273943 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -13,7 +13,6 @@ pub mod simple_rcu; pub mod vec_map; pub mod bin_ser; -pub mod postgres_backend; // helper functions for creating and fsyncing pub mod crashsafe; @@ -26,9 +25,6 @@ pub mod id; // http endpoint utils pub mod http; -// socket splitting utils -pub mod sock_split; - // common log initialisation routine pub mod logging; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs deleted file mode 100644 index fc49aa6696..0000000000 --- a/libs/utils/src/postgres_backend.rs +++ /dev/null @@ -1,544 +0,0 @@ -//! Server-side synchronous Postgres connection, as limited as we need. -//! To use, create PostgresBackend and run() it, passing the Handler -//! implementation determining how to process the queries. Currently its API -//! is rather narrow, but we can extend it once required. 
- -use crate::sock_split::{BidiStream, ReadStream, WriteStream}; -use anyhow::Context; -use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR}; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::io::{self, Write}; -use std::net::{Shutdown, SocketAddr, TcpStream}; -use std::str::FromStr; -use std::sync::Arc; -use std::time::Duration; -use tracing::*; - -pub fn is_expected_io_error(e: &io::Error) -> bool { - use io::ErrorKind::*; - matches!( - e.kind(), - ConnectionRefused | ConnectionAborted | ConnectionReset - ) -} - -/// An error, occurred during query processing: -/// either during the connection ([`ConnectionError`]) or before/after it. -#[derive(thiserror::Error, Debug)] -pub enum QueryError { - /// The connection was lost while processing the query. - #[error(transparent)] - Disconnected(#[from] ConnectionError), - /// Some other error - #[error(transparent)] - Other(#[from] anyhow::Error), -} - -impl From for QueryError { - fn from(e: io::Error) -> Self { - Self::Disconnected(ConnectionError::Socket(e)) - } -} - -impl QueryError { - pub fn pg_error_code(&self) -> &'static [u8; 5] { - match self { - Self::Disconnected(_) => b"08006", // connection failure - Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error - } - } -} - -pub trait Handler { - /// Handle single query. - /// postgres_backend will issue ReadyForQuery after calling this (this - /// might be not what we want after CopyData streaming, but currently we don't - /// care). - fn process_query( - &mut self, - pgb: &mut PostgresBackend, - query_string: &str, - ) -> Result<(), QueryError>; - - /// Called on startup packet receival, allows to process params. - /// - /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users - /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow - /// to override whole init logic in implementations. 
- fn startup( - &mut self, - _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, - ) -> Result<(), QueryError> { - Ok(()) - } - - /// Check auth jwt - fn check_auth_jwt( - &mut self, - _pgb: &mut PostgresBackend, - _jwt_response: &[u8], - ) -> Result<(), QueryError> { - Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) - } - - fn is_shutdown_requested(&self) -> bool { - false - } -} - -/// PostgresBackend protocol state. -/// XXX: The order of the constructors matters. -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] -pub enum ProtoState { - Initialization, - Encrypted, - Authentication, - Established, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] -pub enum AuthType { - Trust, - // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT - NeonJWT, -} - -impl FromStr for AuthType { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - match s { - "Trust" => Ok(Self::Trust), - "NeonJWT" => Ok(Self::NeonJWT), - _ => anyhow::bail!("invalid value \"{s}\" for auth type"), - } - } -} - -impl fmt::Display for AuthType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(match self { - AuthType::Trust => "Trust", - AuthType::NeonJWT => "NeonJWT", - }) - } -} - -#[derive(Clone, Copy)] -pub enum ProcessMsgResult { - Continue, - Break, -} - -/// Always-writeable sock_split stream. -/// May not be readable. 
See [`PostgresBackend::take_stream_in`] -pub enum Stream { - Bidirectional(BidiStream), - WriteOnly(WriteStream), -} - -impl Stream { - fn shutdown(&mut self, how: Shutdown) -> io::Result<()> { - match self { - Self::Bidirectional(bidi_stream) => bidi_stream.shutdown(how), - Self::WriteOnly(write_stream) => write_stream.shutdown(how), - } - } -} - -impl io::Write for Stream { - fn write(&mut self, buf: &[u8]) -> io::Result { - match self { - Self::Bidirectional(bidi_stream) => bidi_stream.write(buf), - Self::WriteOnly(write_stream) => write_stream.write(buf), - } - } - - fn flush(&mut self) -> io::Result<()> { - match self { - Self::Bidirectional(bidi_stream) => bidi_stream.flush(), - Self::WriteOnly(write_stream) => write_stream.flush(), - } - } -} - -pub struct PostgresBackend { - stream: Option, - // Output buffer. c.f. BeMessage::write why we are using BytesMut here. - buf_out: BytesMut, - - pub state: ProtoState, - - auth_type: AuthType, - - peer_addr: SocketAddr, - pub tls_config: Option>, -} - -pub fn query_from_cstring(query_string: Bytes) -> Vec { - let mut query_string = query_string.to_vec(); - if let Some(ch) = query_string.last() { - if *ch == 0 { - query_string.pop(); - } - } - query_string -} - -// Helper function for socket read loops -pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool { - for cause in error.chain() { - if let Some(io_error) = cause.downcast_ref::() { - if io_error.kind() == std::io::ErrorKind::WouldBlock { - return true; - } - } - } - false -} - -// Cast a byte slice to a string slice, dropping null terminator if there's one. 
-fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { - let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); - std::str::from_utf8(without_null).map_err(|e| e.into()) -} - -impl PostgresBackend { - pub fn new( - socket: TcpStream, - auth_type: AuthType, - tls_config: Option>, - set_read_timeout: bool, - ) -> io::Result { - let peer_addr = socket.peer_addr()?; - if set_read_timeout { - socket - .set_read_timeout(Some(Duration::from_secs(5))) - .unwrap(); - } - - Ok(Self { - stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))), - buf_out: BytesMut::with_capacity(10 * 1024), - state: ProtoState::Initialization, - auth_type, - tls_config, - peer_addr, - }) - } - - pub fn into_stream(self) -> Stream { - self.stream.unwrap() - } - - /// Get direct reference (into the Option) to the read stream. - fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> { - match &mut self.stream { - Some(Stream::Bidirectional(stream)) => Ok(stream), - _ => anyhow::bail!("reader taken"), - } - } - - pub fn get_peer_addr(&self) -> &SocketAddr { - &self.peer_addr - } - - pub fn take_stream_in(&mut self) -> Option { - let stream = self.stream.take(); - match stream { - Some(Stream::Bidirectional(bidi_stream)) => { - let (read, write) = bidi_stream.split(); - self.stream = Some(Stream::WriteOnly(write)); - Some(read) - } - stream => { - self.stream = stream; - None - } - } - } - - /// Read full message or return None if connection is closed. - pub fn read_message(&mut self) -> Result, QueryError> { - let (state, stream) = (self.state, self.get_stream_in()?); - - use ProtoState::*; - match state { - Initialization | Encrypted => FeStartupPacket::read(stream), - Authentication | Established => FeMessage::read(stream), - } - .map_err(QueryError::from) - } - - /// Write message into internal output buffer. 
- pub fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<&mut Self> { - BeMessage::write(&mut self.buf_out, message)?; - Ok(self) - } - - /// Flush output buffer into the socket. - pub fn flush(&mut self) -> io::Result<&mut Self> { - let stream = self.stream.as_mut().unwrap(); - stream.write_all(&self.buf_out)?; - self.buf_out.clear(); - Ok(self) - } - - /// Write message into internal buffer and flush it. - pub fn write_message(&mut self, message: &BeMessage) -> io::Result<&mut Self> { - self.write_message_noflush(message)?; - self.flush() - } - - // Wrapper for run_message_loop() that shuts down socket when we are done - pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> { - let ret = self.run_message_loop(handler); - if let Some(stream) = self.stream.as_mut() { - let _ = stream.shutdown(Shutdown::Both); - } - ret - } - - fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { - trace!("postgres backend to {:?} started", self.peer_addr); - - let mut unnamed_query_string = Bytes::new(); - - while !handler.is_shutdown_requested() { - match self.read_message() { - Ok(message) => { - if let Some(msg) = message { - trace!("got message {msg:?}"); - - match self.process_message(handler, msg, &mut unnamed_query_string)? 
{ - ProcessMsgResult::Continue => continue, - ProcessMsgResult::Break => break, - } - } else { - break; - } - } - Err(e) => { - if let QueryError::Other(e) = &e { - if is_socket_read_timed_out(e) { - continue; - } - } - return Err(e); - } - } - } - - trace!("postgres backend to {:?} exited", self.peer_addr); - Ok(()) - } - - pub fn start_tls(&mut self) -> anyhow::Result<()> { - match self.stream.take() { - Some(Stream::Bidirectional(bidi_stream)) => { - let conn = rustls::ServerConnection::new(self.tls_config.clone().unwrap())?; - self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(conn)?)); - Ok(()) - } - stream => { - self.stream = stream; - anyhow::bail!("can't start TLs without bidi stream"); - } - } - } - - fn process_message( - &mut self, - handler: &mut impl Handler, - msg: FeMessage, - unnamed_query_string: &mut Bytes, - ) -> Result { - // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth - // TODO: change that to proper top-level match of protocol state with separate message handling for each state - if self.state < ProtoState::Established - && !matches!( - msg, - FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_) - ) - { - return Err(QueryError::Other(anyhow::anyhow!("protocol violation"))); - } - - let have_tls = self.tls_config.is_some(); - match msg { - FeMessage::StartupPacket(m) => { - trace!("got startup message {m:?}"); - - match m { - FeStartupPacket::SslRequest => { - debug!("SSL requested"); - - self.write_message(&BeMessage::EncryptionResponse(have_tls))?; - if have_tls { - self.start_tls()?; - self.state = ProtoState::Encrypted; - } - } - FeStartupPacket::GssEncRequest => { - debug!("GSS requested"); - self.write_message(&BeMessage::EncryptionResponse(false))?; - } - FeStartupPacket::StartupMessage { .. 
} => { - if have_tls && !matches!(self.state, ProtoState::Encrypted) { - self.write_message(&BeMessage::ErrorResponse( - "must connect with TLS", - None, - ))?; - return Err(QueryError::Other(anyhow::anyhow!( - "client did not connect with TLS" - ))); - } - - // NB: startup() may change self.auth_type -- we are using that in proxy code - // to bypass auth for new users. - handler.startup(self, &m)?; - - match self.auth_type { - AuthType::Trust => { - self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeMessage::CLIENT_ENCODING)? - // The async python driver requires a valid server_version - .write_message_noflush(&BeMessage::server_version("14.1"))? - .write_message(&BeMessage::ReadyForQuery)?; - self.state = ProtoState::Established; - } - AuthType::NeonJWT => { - self.write_message(&BeMessage::AuthenticationCleartextPassword)?; - self.state = ProtoState::Authentication; - } - } - } - FeStartupPacket::CancelRequest { .. } => { - return Ok(ProcessMsgResult::Break); - } - } - } - - FeMessage::PasswordMessage(m) => { - trace!("got password message '{:?}'", m); - - assert!(self.state == ProtoState::Authentication); - - match self.auth_type { - AuthType::Trust => unreachable!(), - AuthType::NeonJWT => { - let (_, jwt_response) = m.split_last().context("protocol violation")?; - - if let Err(e) = handler.check_auth_jwt(self, jwt_response) { - self.write_message(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))?; - return Err(e); - } - } - } - self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeMessage::CLIENT_ENCODING)? 
- .write_message(&BeMessage::ReadyForQuery)?; - self.state = ProtoState::Established; - } - - FeMessage::Query(body) => { - // remove null terminator - let query_string = cstr_to_str(&body)?; - - trace!("got query {query_string:?}"); - if let Err(e) = handler.process_query(self, query_string) { - log_query_error(query_string, &e); - let short_error = short_error(&e); - self.write_message_noflush(&BeMessage::ErrorResponse( - &short_error, - Some(e.pg_error_code()), - ))?; - } - self.write_message(&BeMessage::ReadyForQuery)?; - } - - FeMessage::Parse(m) => { - *unnamed_query_string = m.query_string; - self.write_message(&BeMessage::ParseComplete)?; - } - - FeMessage::Describe(_) => { - self.write_message_noflush(&BeMessage::ParameterDescription)? - .write_message(&BeMessage::NoData)?; - } - - FeMessage::Bind(_) => { - self.write_message(&BeMessage::BindComplete)?; - } - - FeMessage::Close(_) => { - self.write_message(&BeMessage::CloseComplete)?; - } - - FeMessage::Execute(_) => { - let query_string = cstr_to_str(unnamed_query_string)?; - trace!("got execute {query_string:?}"); - if let Err(e) = handler.process_query(self, query_string) { - log_query_error(query_string, &e); - self.write_message(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))?; - } - // NOTE there is no ReadyForQuery message. This handler is used - // for basebackup and it uses CopyOut which doesn't require - // ReadyForQuery message and backend just switches back to - // processing mode after sending CopyDone or ErrorResponse. 
- } - - FeMessage::Sync => { - self.write_message(&BeMessage::ReadyForQuery)?; - } - - FeMessage::Terminate => { - return Ok(ProcessMsgResult::Break); - } - - // We prefer explicit pattern matching to wildcards, because - // this helps us spot the places where new variants are missing - FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { - return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message type: {msg:?}" - ))); - } - } - - Ok(ProcessMsgResult::Continue) - } -} - -pub fn short_error(e: &QueryError) -> String { - match e { - QueryError::Disconnected(connection_error) => connection_error.to_string(), - QueryError::Other(e) => format!("{e:#}"), - } -} - -pub(super) fn log_query_error(query: &str, e: &QueryError) { - match e { - QueryError::Disconnected(ConnectionError::Socket(io_error)) => { - if is_expected_io_error(io_error) { - info!("query handler for '{query}' failed with expected io error: {io_error}"); - } else { - error!("query handler for '{query}' failed with io error: {io_error}"); - } - } - QueryError::Disconnected(other_connection_error) => { - error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") - } - QueryError::Other(e) => { - error!("query handler for '{query}' failed: {e:?}"); - } - } -} diff --git a/libs/utils/src/sock_split.rs b/libs/utils/src/sock_split.rs deleted file mode 100644 index b0e5a0bf6a..0000000000 --- a/libs/utils/src/sock_split.rs +++ /dev/null @@ -1,206 +0,0 @@ -use std::{ - io::{self, BufReader, Write}, - net::{Shutdown, TcpStream}, - sync::Arc, -}; - -use rustls::Connection; - -/// Wrapper supporting reads of a shared TcpStream. 
-pub struct ArcTcpRead(Arc); - -impl io::Read for ArcTcpRead { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - (&*self.0).read(buf) - } -} - -impl std::ops::Deref for ArcTcpRead { - type Target = TcpStream; - - fn deref(&self) -> &Self::Target { - self.0.deref() - } -} - -/// Wrapper around a TCP Stream supporting buffered reads. -pub struct BufStream(BufReader); - -impl io::Read for BufStream { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - self.0.read(buf) - } -} - -impl io::Write for BufStream { - fn write(&mut self, buf: &[u8]) -> io::Result { - self.get_ref().write(buf) - } - - fn flush(&mut self) -> io::Result<()> { - self.get_ref().flush() - } -} - -impl BufStream { - /// Unwrap into the internal BufReader. - fn into_reader(self) -> BufReader { - self.0 - } - - /// Returns a reference to the underlying TcpStream. - fn get_ref(&self) -> &TcpStream { - &self.0.get_ref().0 - } -} - -pub enum ReadStream { - Tcp(BufReader), - Tls(rustls_split::ReadHalf), -} - -impl io::Read for ReadStream { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Tcp(reader) => reader.read(buf), - Self::Tls(read_half) => read_half.read(buf), - } - } -} - -impl ReadStream { - pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.get_ref().shutdown(how), - Self::Tls(write_half) => write_half.shutdown(how), - } - } -} - -pub enum WriteStream { - Tcp(Arc), - Tls(rustls_split::WriteHalf), -} - -impl WriteStream { - pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.shutdown(how), - Self::Tls(write_half) => write_half.shutdown(how), - } - } -} - -impl io::Write for WriteStream { - fn write(&mut self, buf: &[u8]) -> io::Result { - match self { - Self::Tcp(stream) => stream.as_ref().write(buf), - Self::Tls(write_half) => write_half.write(buf), - } - } - - fn flush(&mut self) -> io::Result<()> { - match self { - Self::Tcp(stream) => 
stream.as_ref().flush(), - Self::Tls(write_half) => write_half.flush(), - } - } -} - -type TlsStream = rustls::StreamOwned; - -pub enum BidiStream { - Tcp(BufStream), - /// This variant is boxed, because [`rustls::ServerConnection`] is quite larger than [`BufStream`]. - Tls(Box>), -} - -impl BidiStream { - pub fn from_tcp(stream: TcpStream) -> Self { - Self::Tcp(BufStream(BufReader::new(ArcTcpRead(Arc::new(stream))))) - } - - pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.get_ref().shutdown(how), - Self::Tls(tls_boxed) => { - if how == Shutdown::Read { - tls_boxed.sock.get_ref().shutdown(how) - } else { - tls_boxed.conn.send_close_notify(); - let res = tls_boxed.flush(); - tls_boxed.sock.get_ref().shutdown(how)?; - res - } - } - } - } - - /// Split the bi-directional stream into two owned read and write halves. - pub fn split(self) -> (ReadStream, WriteStream) { - match self { - Self::Tcp(stream) => { - let reader = stream.into_reader(); - let stream: Arc = reader.get_ref().0.clone(); - - (ReadStream::Tcp(reader), WriteStream::Tcp(stream)) - } - Self::Tls(tls_boxed) => { - let reader = tls_boxed.sock.into_reader(); - let buffer_data = reader.buffer().to_owned(); - let read_buf_cfg = rustls_split::BufCfg::with_data(buffer_data, 8192); - let write_buf_cfg = rustls_split::BufCfg::with_capacity(8192); - - // TODO would be nice to avoid the Arc here - let socket = Arc::try_unwrap(reader.into_inner().0).unwrap(); - - let (read_half, write_half) = rustls_split::split( - socket, - Connection::Server(tls_boxed.conn), - read_buf_cfg, - write_buf_cfg, - ); - (ReadStream::Tls(read_half), WriteStream::Tls(write_half)) - } - } - } - - pub fn start_tls(self, mut conn: rustls::ServerConnection) -> io::Result { - match self { - Self::Tcp(mut stream) => { - conn.complete_io(&mut stream)?; - assert!(!conn.is_handshaking()); - Ok(Self::Tls(Box::new(TlsStream::new(conn, stream)))) - } - Self::Tls { .. 
} => Err(io::Error::new( - io::ErrorKind::InvalidInput, - "TLS is already started on this stream", - )), - } - } -} - -impl io::Read for BidiStream { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Tcp(stream) => stream.read(buf), - Self::Tls(tls_boxed) => tls_boxed.read(buf), - } - } -} - -impl io::Write for BidiStream { - fn write(&mut self, buf: &[u8]) -> io::Result { - match self { - Self::Tcp(stream) => stream.write(buf), - Self::Tls(tls_boxed) => tls_boxed.write(buf), - } - } - - fn flush(&mut self) -> io::Result<()> { - match self { - Self::Tcp(stream) => stream.flush(), - Self::Tls(tls_boxed) => tls_boxed.flush(), - } - } -} diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs deleted file mode 100644 index bf09f1b37d..0000000000 --- a/libs/utils/tests/ssl_test.rs +++ /dev/null @@ -1,238 +0,0 @@ -use std::{ - collections::HashMap, - io::{Cursor, Read, Write}, - net::{TcpListener, TcpStream}, - sync::Arc, -}; - -use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; -use once_cell::sync::Lazy; - -use utils::{ - postgres_backend::QueryError, - postgres_backend::{AuthType, Handler, PostgresBackend}, -}; - -fn make_tcp_pair() -> (TcpStream, TcpStream) { - let listener = TcpListener::bind("127.0.0.1:0").unwrap(); - let addr = listener.local_addr().unwrap(); - let client_stream = TcpStream::connect(addr).unwrap(); - let (server_stream, _) = listener.accept().unwrap(); - (server_stream, client_stream) -} - -static KEY: Lazy = Lazy::new(|| { - let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) -}); - -static CERT: Lazy = Lazy::new(|| { - let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) -}); - -#[test] -// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274), -// we resize the 
vector so doing some modifications after all -#[allow(clippy::read_zero_byte_vec)] -fn ssl() { - let (mut client_sock, server_sock) = make_tcp_pair(); - - const QUERY: &str = "hello world"; - - let client_jh = std::thread::spawn(move || { - // SSLRequest - client_sock.write_u32::(8).unwrap(); - client_sock.write_u32::(80877103).unwrap(); - - let ssl_response = client_sock.read_u8().unwrap(); - assert_eq!(b'S', ssl_response); - - let cfg = rustls::ClientConfig::builder() - .with_safe_defaults() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(&CERT).unwrap(); - store - }) - .with_no_client_auth(); - let client_config = Arc::new(cfg); - - let dns_name = "localhost".try_into().unwrap(); - let mut conn = rustls::ClientConnection::new(client_config, dns_name).unwrap(); - - conn.complete_io(&mut client_sock).unwrap(); - assert!(!conn.is_handshaking()); - - let mut stream = rustls::Stream::new(&mut conn, &mut client_sock); - - // StartupMessage - stream.write_u32::(9).unwrap(); - stream.write_u32::(196608).unwrap(); - stream.write_u8(0).unwrap(); - stream.flush().unwrap(); - - // wait for ReadyForQuery - let mut msg_buf = Vec::new(); - loop { - let msg = stream.read_u8().unwrap(); - let size = stream.read_u32::().unwrap() - 4; - msg_buf.resize(size as usize, 0); - stream.read_exact(&mut msg_buf).unwrap(); - - if msg == b'Z' { - // ReadyForQuery - break; - } - } - - // Query - stream.write_u8(b'Q').unwrap(); - stream - .write_u32::(4u32 + QUERY.len() as u32) - .unwrap(); - stream.write_all(QUERY.as_ref()).unwrap(); - stream.flush().unwrap(); - - // ReadyForQuery - let msg = stream.read_u8().unwrap(); - assert_eq!(msg, b'Z'); - }); - - struct TestHandler { - got_query: bool, - } - impl Handler for TestHandler { - fn process_query( - &mut self, - _pgb: &mut PostgresBackend, - query_string: &str, - ) -> Result<(), QueryError> { - self.got_query = query_string == QUERY; - Ok(()) - } - } - let mut handler = TestHandler { got_query: false 
}; - - let cfg = rustls::ServerConfig::builder() - .with_safe_defaults() - .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) - .unwrap(); - let tls_config = Some(Arc::new(cfg)); - - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap(); - pgb.run(&mut handler).unwrap(); - assert!(handler.got_query); - - client_jh.join().unwrap(); - - // TODO consider shutdown behavior -} - -#[test] -fn no_ssl() { - let (mut client_sock, server_sock) = make_tcp_pair(); - - let client_jh = std::thread::spawn(move || { - let mut buf = BytesMut::new(); - - // SSLRequest - buf.put_u32(8); - buf.put_u32(80877103); - client_sock.write_all(&buf).unwrap(); - buf.clear(); - - let ssl_response = client_sock.read_u8().unwrap(); - assert_eq!(b'N', ssl_response); - }); - - struct TestHandler; - - impl Handler for TestHandler { - fn process_query( - &mut self, - _pgb: &mut PostgresBackend, - _query_string: &str, - ) -> Result<(), QueryError> { - panic!() - } - } - - let mut handler = TestHandler; - - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None, true).unwrap(); - pgb.run(&mut handler).unwrap(); - - client_jh.join().unwrap(); -} - -#[test] -fn server_forces_ssl() { - let (mut client_sock, server_sock) = make_tcp_pair(); - - let client_jh = std::thread::spawn(move || { - // StartupMessage - client_sock.write_u32::(9).unwrap(); - client_sock.write_u32::(196608).unwrap(); - client_sock.write_u8(0).unwrap(); - client_sock.flush().unwrap(); - - // ErrorResponse - assert_eq!(client_sock.read_u8().unwrap(), b'E'); - let len = client_sock.read_u32::().unwrap() - 4; - - let mut body = vec![0; len as usize]; - client_sock.read_exact(&mut body).unwrap(); - let mut body = Bytes::from(body); - - let mut errors = HashMap::new(); - loop { - let field_type = body.get_u8(); - if field_type == 0u8 { - break; - } - - let end_idx = body.iter().position(|&b| b == 0u8).unwrap(); - let mut value = body.split_to(end_idx + 1); - 
assert_eq!(value[end_idx], 0u8); - value.truncate(end_idx); - let old = errors.insert(field_type, value); - assert!(old.is_none()); - } - - assert!(!body.has_remaining()); - - assert_eq!("must connect with TLS", errors.get(&b'M').unwrap()); - - // TODO read failure - }); - - struct TestHandler; - impl Handler for TestHandler { - fn process_query( - &mut self, - _pgb: &mut PostgresBackend, - _query_string: &str, - ) -> Result<(), QueryError> { - panic!() - } - } - let mut handler = TestHandler; - - let cfg = rustls::ServerConfig::builder() - .with_safe_defaults() - .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) - .unwrap(); - let tls_config = Some(Arc::new(cfg)); - - let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap(); - let res = pgb.run(&mut handler).unwrap_err(); - assert_eq!("client did not connect with TLS", format!("{}", res)); - - client_jh.join().unwrap(); - - // TODO consider shutdown behavior -} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9caab7955b..564a3de82c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -23,11 +23,10 @@ use pageserver::{ tenant::mgr, virtual_file, }; +use postgres_backend::AuthType; use utils::{ auth::JwtAuth, - logging, - postgres_backend::AuthType, - project_git_version, + logging, project_git_version, sentry_init::init_sentry, signals::{self, Signal}, tcp_listener, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 7442814c43..fde889d01a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -21,10 +21,10 @@ use std::time::Duration; use toml_edit; use toml_edit::{Document, Item}; +use postgres_backend::AuthType; use utils::{ id::{NodeId, TenantId, TimelineId}, logging::LogFormat, - postgres_backend::AuthType, }; use crate::tenant::config::TenantConf; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 
dc4be9dd65..bdcd71a20f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -20,7 +20,7 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; -use postgres_backend::{self, is_expected_io_error, PostgresBackend, QueryError}; +use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; @@ -36,7 +36,6 @@ use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, lsn::Lsn, - postgres_backend::AuthType, simple_rcu::RcuReadGuard, }; @@ -68,7 +67,7 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { msg } + msg = pgb.read_message() => { msg.map_err(QueryError::from)} }; match msg { @@ -79,14 +78,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream continue, FeMessage::Terminate => { let msg = "client terminated connection with Terminate message during COPY"; - let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; + let query_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + // error can't happen here, ErrorResponse serialization should be always ok + pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; break; } m => { let msg = format!("unexpected message {m:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None))?; + // error can't happen here, ErrorResponse serialization should be always ok + pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| 
e.into_io_error())?; Err(io::Error::new(io::ErrorKind::Other, msg))?; break; } @@ -96,8 +97,9 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { let msg = "client closed connection during COPY"; - let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?; + let query_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + // error can't happen here, ErrorResponse serialization should be always ok + pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; pgb.flush().await?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; } @@ -105,7 +107,7 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { - Err(io::Error::new(io::ErrorKind::Other, other))?; + Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?; } }; } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index f9d1e819a1..41ac61b7b6 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -435,8 +435,8 @@ fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result> = Lazy::new(Default::default); @@ -33,7 +31,7 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N /// Console management API listener task. /// It spawns console response handlers needed for the link auth. -pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> { +pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("mgmt has shut down"); } @@ -42,18 +40,12 @@ pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> let (socket, peer_addr) = listener.accept().await?; info!("accepted connection from {peer_addr}"); - let socket = socket.into_std()?; socket .set_nodelay(true) .context("failed to set client socket option")?; - socket - .set_nonblocking(false) - .context("failed to set client socket option")?; - // TODO: replace with async tasks. - thread::spawn(move || { - let tid = std::thread::current().id(); - let span = info_span!("mgmt", thread = format_args!("{tid:?}")); + tokio::task::spawn(async move { + let span = info_span!("mgmt", peer = %peer_addr); let _enter = span.enter(); info!("started a new console management API thread"); @@ -61,16 +53,16 @@ pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> info!("console management API thread is about to finish"); } - if let Err(e) = handle_connection(socket) { + if let Err(e) = handle_connection(socket).await { error!("thread failed with an error: {e}"); } }); } } -fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { - let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?; - pgbackend.run(&mut MgmtHandler) +async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { + let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; + pgbackend.run(&mut MgmtHandler, future::pending::<()>).await } /// A message received by `mgmt` when a compute node is ready. @@ -78,16 +70,21 @@ pub type ComputeReady = Result; // TODO: replace with an http-based protocol. 
struct MgmtHandler; +#[async_trait::async_trait] impl postgres_backend::Handler for MgmtHandler { - fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { - try_process_query(pgb, query).map_err(|e| { + async fn process_query( + &mut self, + pgb: &mut PostgresBackend, + query: &str, + ) -> Result<(), QueryError> { + try_process_query(pgb, query).await.map_err(|e| { error!("failed to process response: {e:?}"); e }) } } -fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { +async fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); @@ -98,11 +95,11 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), Query Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } Err(e) => { error!("failed to deliver response to per-client task"); - pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?; + pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string(), None))?; } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 02a0fabe9a..e0cf1326b9 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -51,7 +51,7 @@ impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. pub async fn read_startup_packet(&mut self) -> io::Result { // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` - let msg = FeStartupPacket::read_fut(&mut self.stream) + let msg = FeStartupPacket::read(&mut self.stream) .await .map_err(ConnectionError::into_io_error)? 
.ok_or_else(err_connection)?; @@ -73,7 +73,7 @@ impl PqStream { } async fn read_message(&mut self) -> io::Result { - FeMessage::read_fut(&mut self.stream) + FeMessage::read(&mut self.stream) .await .map_err(ConnectionError::into_io_error)? .ok_or_else(err_connection) diff --git a/run_clippy.sh b/run_clippy.sh index fe0e745d7d..0558541089 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -11,12 +11,18 @@ # Not every feature is supported in macOS builds. Avoid running regular linting # script that checks every feature. +# +# manual-range-contains wants +# !(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len) +# instead of +# len < 4 || len > MAX_STARTUP_PACKET_LENGTH +# , let's disagree. if [[ "$OSTYPE" == "darwin"* ]]; then # no extra features to test currently, add more here when needed - cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings + cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -A clippy::manual-range-contains -D warnings else # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) - cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings + cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -A clippy::manual-range-contains -D warnings fi diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 2424509477..36ee15347d 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -36,6 +36,7 @@ toml_edit.workspace = true tracing.workspace = true url.workspace = true metrics.workspace = true +postgres_backend.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 683050e9cd..d2cb9f79b9 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ 
b/safekeeper/src/bin/safekeeper.rs @@ -236,7 +236,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() - .name("safekeeper thread".into()) + .name("WAL service thread".into()) .spawn(|| wal_service::thread_main(conf_cloned, pg_listener)) .unwrap(); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index d1cd76459b..3e7bafbd2f 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -1,27 +1,23 @@ //! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres //! protocol commands. +use anyhow::Context; +use std::str; +use tracing::{info, info_span, Instrument}; + use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; -use crate::receive_wal::ReceiveWalConn; - -use crate::send_wal::ReplicationConn; use crate::{GlobalTimelines, SafeKeeperConf}; -use anyhow::Context; - +use postgres_backend::QueryError; +use postgres_backend::{self, PostgresBackend}; use postgres_ffi::PG_TLI; -use regex::Regex; - use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use std::str; -use tracing::info; +use regex::Regex; use utils::auth::{Claims, Scope}; -use utils::postgres_backend::QueryError; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, - postgres_backend::{self, PostgresBackend}, }; /// Safekeeper handler of postgres commands @@ -53,7 +49,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { let start_lsn = caps .next() .map(|cap| cap[1].parse::()) - .context("failed to parse start LSN from START_REPLICATION command")??; + .context("parse start LSN from START_REPLICATION command")??; Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { Ok(SafekeeperPostgresCommand::IdentifySystem) @@ -67,6 +63,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { } } +#[async_trait::async_trait] impl postgres_backend::Handler for 
SafekeeperPostgresHandler { // tenant_id and timeline_id are passed in connection string params fn startup( @@ -137,7 +134,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { Ok(()) } - fn process_query( + async fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, @@ -147,9 +144,10 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { .starts_with("set datestyle to ") { // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; return Ok(()); } + let cmd = parse_cmd(query_string)?; info!( @@ -161,26 +159,23 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { let timeline_id = self.timeline_id.context("timelineid is required")?; self.check_permission(Some(tenant_id))?; self.ttid = TenantTimelineId::new(tenant_id, timeline_id); + let span_ttid = self.ttid; // satisfy borrow checker - let res = match cmd { - SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), + match cmd { + SafekeeperPostgresCommand::StartWalPush => { + self.handle_start_wal_push(pgb) + .instrument(info_span!("WAL receiver", ttid = %span_ttid)) + .await + } SafekeeperPostgresCommand::StartReplication { start_lsn } => { - ReplicationConn::new(pgb).run(self, pgb, start_lsn) + self.handle_start_replication(pgb, start_lsn) + .instrument(info_span!("WAL sender", ttid = %span_ttid)) + .await } - SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), - }; - - match res { - Ok(()) => Ok(()), - Err(QueryError::Disconnected(connection_error)) => { - info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}"); - Err(QueryError::Disconnected(connection_error)) + SafekeeperPostgresCommand::IdentifySystem => 
self.handle_identify_system(pgb).await, + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { + handle_json_ctrl(self, pgb, cmd).await } - Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!( - "Failed to process query for timeline {}", - self.ttid - )))), } } } @@ -217,7 +212,10 @@ impl SafekeeperPostgresHandler { /// /// Handle IDENTIFY_SYSTEM replication command /// - fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> { + async fn handle_identify_system( + &mut self, + pgb: &mut PostgresBackend, + ) -> Result<(), QueryError> { let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; let lsn = if self.is_walproposer_recovery() { @@ -267,7 +265,7 @@ impl SafekeeperPostgresHandler { Some(lsn_bytes), None, ]))? - .write_message(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; + .write_message_noflush(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; Ok(()) } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index b157fcb076..14badebd95 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -168,12 +168,9 @@ async fn timeline_create_handler(mut request: Request) -> Result anyhow::Result> { +async fn prepare_safekeeper( + ttid: TenantTimelineId, + pg_version: u32, +) -> anyhow::Result> { GlobalTimelines::create( ttid, ServerInfo { @@ -106,6 +110,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result Lsn::INVALID, Lsn::INVALID, ) + .await } fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { @@ -128,15 +133,15 @@ fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::R } #[derive(Debug, Serialize, Deserialize)] -struct InsertedWAL { +pub struct InsertedWAL { begin_lsn: Lsn, - end_lsn: Lsn, + pub end_lsn: Lsn, append_response: AppendResponse, } /// Extend local WAL with new LogicalMessage record. 
To do that, /// create AppendRequest with new WAL and pass it to safekeeper. -fn append_logical_message( +pub fn append_logical_message( tli: &Arc, msg: &AppendLogicalMessage, ) -> anyhow::Result { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 6ab108ceb0..03df546a4d 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,8 +1,7 @@ -use storage_broker::Uri; -// use remote_storage::RemoteStorageConfig; use std::path::PathBuf; use std::time::Duration; +use storage_broker::Uri; use utils::id::{NodeId, TenantId, TenantTimelineId}; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 0cf921d97a..22c9871026 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,204 +2,284 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. -use anyhow::anyhow; -use anyhow::Context; - -use bytes::BytesMut; -use tracing::*; -use utils::lsn::Lsn; -use utils::postgres_backend::QueryError; - +use crate::handler::SafekeeperPostgresHandler; +use crate::safekeeper::AcceptorProposerMessage; +use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; use crate::GlobalTimelines; - +use anyhow::{anyhow, Context}; +use bytes::BytesMut; +use nix::unistd::gettid; +use postgres_backend::CopyStreamHandlerEnd; +use postgres_backend::PostgresBackend; +use postgres_backend::PostgresBackendReader; +use postgres_backend::QueryError; +use pq_proto::BeMessage; use std::net::SocketAddr; -use std::sync::mpsc::channel; -use std::sync::mpsc::Receiver; - use std::sync::Arc; use std::thread; +use std::thread::JoinHandle; +use tokio::sync::mpsc::channel; +use tokio::sync::mpsc::error::TryRecvError; +use tokio::sync::mpsc::Receiver; +use tokio::sync::mpsc::Sender; +use tokio::task::spawn_blocking; +use tracing::*; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; -use crate::safekeeper::AcceptorProposerMessage; -use 
crate::safekeeper::ProposerAcceptorMessage; +const MSG_QUEUE_SIZE: usize = 256; +const REPLY_QUEUE_SIZE: usize = 16; -use crate::handler::SafekeeperPostgresHandler; -use pq_proto::{BeMessage, FeMessage}; -use utils::{postgres_backend::PostgresBackend, sock_split::ReadStream}; - -pub struct ReceiveWalConn<'pg> { - /// Postgres connection - pg_backend: &'pg mut PostgresBackend, - /// The cached result of `pg_backend.socket().peer_addr()` (roughly) - peer_addr: SocketAddr, -} - -impl<'pg> ReceiveWalConn<'pg> { - pub fn new(pg: &'pg mut PostgresBackend) -> ReceiveWalConn<'pg> { - let peer_addr = *pg.get_peer_addr(); - ReceiveWalConn { - pg_backend: pg, - peer_addr, +impl SafekeeperPostgresHandler { + /// Wrapper around handle_start_wal_push_guts handling result. Error is + /// handled here while we're still in walreceiver ttid span; with API + /// extension, this can probably be moved into postgres_backend. + pub async fn handle_start_wal_push( + &mut self, + pgb: &mut PostgresBackend, + ) -> Result<(), QueryError> { + if let Err(end) = self.handle_start_wal_push_guts(pgb).await { + // Log the result and probably send it to the client, closing the stream. 
+ pgb.handle_copy_stream_end(end).await; } - } - - // Send message to the postgres - fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> { - let mut buf = BytesMut::with_capacity(128); - msg.serialize(&mut buf)?; - self.pg_backend.write_message(&BeMessage::CopyData(&buf))?; Ok(()) } - /// Receive WAL from wal_proposer - pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> { - let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered(); - + pub async fn handle_start_wal_push_guts( + &mut self, + pgb: &mut PostgresBackend, + ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages - self.pg_backend - .write_message(&BeMessage::CopyBothResponse)?; + pgb.write_message(&BeMessage::CopyBothResponse).await?; - let r = self - .pg_backend - .take_stream_in() - .ok_or_else(|| anyhow!("failed to take read stream from pgbackend"))?; - let mut poll_reader = ProposerPollStream::new(r)?; + // Experiments [1] confirm that doing network IO in one (this) thread and + // processing with disc IO in another significantly improves + // performance; we spawn off WalAcceptor thread for message processing + // to this end. + // + // [1] https://github.com/neondatabase/neon/pull/1318 + let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE); + let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE); + let mut acceptor_handle: Option>> = None; - // Receive information about server - let next_msg = poll_reader.recv_msg()?; - let tli = match next_msg { - ProposerAcceptorMessage::Greeting(ref greeting) => { - info!( - "start handshake with walproposer {} sysid {} timeline {}", - self.peer_addr, greeting.system_id, greeting.tli, - ); - let server_info = ServerInfo { - pg_version: greeting.pg_version, - system_id: greeting.system_id, - wal_seg_size: greeting.wal_seg_size, - }; - GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)? 
- } - _ => { - return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message {next_msg:?} instead of greeting" - ))) - } + // Concurrently receive and send data; replies are not synchronized with + // sends, so this avoids deadlocks. + let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; + let peer_addr = *pgb.get_peer_addr(); + let res = tokio::select! { + // todo: add read|write .context to these errors + r = read_network(self.ttid, &mut pgb_reader, peer_addr, msg_tx, &mut acceptor_handle, msg_rx, reply_tx) => r, + r = write_network(pgb, reply_rx) => r, }; - let mut next_msg = Some(next_msg); + // Join pg backend back. + pgb.unsplit(pgb_reader)?; - let mut first_time_through = true; - let mut _guard: Option = None; - loop { - if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) { - // poll AppendRequest's without blocking and write WAL to disk without flushing, - // while it's readily available - while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg { - let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - - let reply = tli.process_msg(&msg)?; - if let Some(reply) = reply { - self.write_msg(&reply)?; - } - - next_msg = poll_reader.poll_msg(); - } - - // flush all written WAL to the disk - let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?; - if let Some(reply) = reply { - self.write_msg(&reply)?; - } - } else if let Some(msg) = next_msg.take() { - // process other message - let reply = tli.process_msg(&msg)?; - if let Some(reply) = reply { - self.write_msg(&reply)?; - } - } - if first_time_through { - // Register the connection and defer unregister. Do that only - // after processing first message, as it sets wal_seg_size, - // wanted by many. - tli.on_compute_connect()?; - _guard = Some(ComputeConnectionGuard { - timeline: Arc::clone(&tli), - }); - first_time_through = false; + // Join the spawned WalAcceptor. 
At this point chans to/from it passed + // to network routines are dropped, so it will exit as soon as it + // touches them. + match acceptor_handle { + None => { + // failed even before spawning; read_network should have error + Err(res.expect_err("no error with WalAcceptor not spawn")) } + Some(handle) => { + let wal_acceptor_res = handle.join(); - // blocking wait for the next message - if next_msg.is_none() { - next_msg = Some(poll_reader.recv_msg()?); + // If there was any network error, return it. + res?; + + // Otherwise, WalAcceptor thread must have errored. + match wal_acceptor_res { + Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination + Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))), + Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!( + "WalAcceptor thread panicked", + ))), + } } } } } -struct ProposerPollStream { - msg_rx: Receiver, - read_thread: Option>>, +/// Read next message from walproposer. +/// TODO: Return Ok(None) on graceful termination. +async fn read_message( + pgb_reader: &mut PostgresBackendReader, +) -> Result { + let copy_data = pgb_reader.read_copy_message().await?; + let msg = ProposerAcceptorMessage::parse(copy_data)?; + Ok(msg) } -impl ProposerPollStream { - fn new(mut r: ReadStream) -> anyhow::Result { - let (msg_tx, msg_rx) = channel(); - - let read_thread = thread::Builder::new() - .name("Read WAL thread".into()) - .spawn(move || -> Result<(), QueryError> { - loop { - let copy_data = match FeMessage::read(&mut r)? 
{ - Some(FeMessage::CopyData(bytes)) => Ok(bytes), - Some(msg) => Err(QueryError::Other(anyhow::anyhow!( - "expected `CopyData` message, found {msg:?}" - ))), - None => Err(QueryError::from(std::io::Error::new( - std::io::ErrorKind::ConnectionAborted, - "walproposer closed the connection", - ))), - }?; - - let msg = ProposerAcceptorMessage::parse(copy_data)?; - msg_tx - .send(msg) - .context("Failed to send the proposer message")?; - } - // msg_tx will be dropped here, this will also close msg_rx - })?; - - Ok(Self { - msg_rx, - read_thread: Some(read_thread), - }) - } - - fn recv_msg(&mut self) -> Result { - self.msg_rx.recv().map_err(|_| { - // return error from the read thread - let res = match self.read_thread.take() { - Some(thread) => thread.join(), - None => return QueryError::Other(anyhow::anyhow!("read thread is gone")), +/// Read messages from socket and pass it to WalAcceptor thread. Returns Ok(()) +/// if msg_tx closed; it must mean WalAcceptor terminated, joining it should +/// tell the error. +async fn read_network( + ttid: TenantTimelineId, + pgb_reader: &mut PostgresBackendReader, + peer_addr: SocketAddr, + msg_tx: Sender, + // WalAcceptor is spawned when we learn server info from walproposer and + // create timeline; handle is put here. + acceptor_handle: &mut Option>>, + msg_rx: Receiver, + reply_tx: Sender, +) -> Result<(), CopyStreamHandlerEnd> { + // Receive information about server to create timeline, if not yet. + let next_msg = read_message(pgb_reader).await?; + let tli = match next_msg { + ProposerAcceptorMessage::Greeting(ref greeting) => { + info!( + "start handshake with walproposer {} sysid {} timeline {}", + peer_addr, greeting.system_id, greeting.tli, + ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, }; + GlobalTimelines::create(ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? 
+ } + _ => { + return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( + "unexpected message {next_msg:?} instead of greeting" + ))) + } + }; - match res { - Ok(Ok(())) => { - QueryError::Other(anyhow::anyhow!("unexpected result from read thread")) - } - Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")), - Ok(Err(err)) => err, + *acceptor_handle = Some( + WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx).context("spawn WalAcceptor thread")?, + ); + + // Forward all messages to WalAcceptor + read_network_loop(pgb_reader, msg_tx, next_msg).await +} + +async fn read_network_loop( + pgb_reader: &mut PostgresBackendReader, + msg_tx: Sender, + mut next_msg: ProposerAcceptorMessage, +) -> Result<(), CopyStreamHandlerEnd> { + loop { + if msg_tx.send(next_msg).await.is_err() { + return Ok(()); // chan closed, WalAcceptor terminated + } + next_msg = read_message(pgb_reader).await?; + } +} + +/// Read replies from WalAcceptor and pass them back to socket. Returns Ok(()) +/// if reply_rx closed; it must mean WalAcceptor terminated, joining it should +/// tell the error. +async fn write_network( + pgb_writer: &mut PostgresBackend, + mut reply_rx: Receiver, +) -> Result<(), CopyStreamHandlerEnd> { + let mut buf = BytesMut::with_capacity(128); + + loop { + match reply_rx.recv().await { + Some(msg) => { + buf.clear(); + msg.serialize(&mut buf)?; + pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } - }) + None => return Ok(()), // chan closed, WalAcceptor terminated + } + } +} + +/// Takes messages from msg_rx, processes and pushes replies to reply_tx. +struct WalAcceptor { + tli: Arc, + msg_rx: Receiver, + reply_tx: Sender, +} + +impl WalAcceptor { + /// Spawn thread with WalAcceptor running, return handle to it. 
+ fn spawn( + tli: Arc, + msg_rx: Receiver, + reply_tx: Sender, + ) -> anyhow::Result>> { + let thread_name = format!("WAL acceptor {}", tli.ttid); + thread::Builder::new() + .name(thread_name) + .spawn(move || -> anyhow::Result<()> { + let mut wa = WalAcceptor { + tli, + msg_rx, + reply_tx, + }; + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + let span_ttid = wa.tli.ttid; // satisfy borrow checker + runtime.block_on( + wa.run() + .instrument(info_span!("WAL acceptor", tid = %gettid(), ttid = %span_ttid)), + ) + }) + .map_err(anyhow::Error::from) } - fn poll_msg(&mut self) -> Option { - let res = self.msg_rx.try_recv(); + /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; + /// it must mean that network thread terminated. + async fn run(&mut self) -> anyhow::Result<()> { + // Register the connection and defer unregister. + self.tli.on_compute_connect().await?; + let _guard = ComputeConnectionGuard { + timeline: Arc::clone(&self.tli), + }; - match res { - Err(_) => None, - Ok(msg) => Some(msg), + let mut next_msg: ProposerAcceptorMessage; + + loop { + let opt_msg = self.msg_rx.recv().await; + if opt_msg.is_none() { + return Ok(()); // chan closed, streaming terminated + } + next_msg = opt_msg.unwrap(); + + if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { + // loop through AppendRequest's while it's readily available to + // write as many WAL as possible without fsyncing + while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg { + let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); + + if let Some(reply) = self.tli.process_msg(&noflush_msg)? 
{ + if self.reply_tx.send(reply).await.is_err() { + return Ok(()); // chan closed, streaming terminated + } + } + + match self.msg_rx.try_recv() { + Ok(msg) => next_msg = msg, + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Disconnected) => return Ok(()), // chan closed, streaming terminated + } + } + + // flush all written WAL to the disk + if let Some(reply) = self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)? { + if self.reply_tx.send(reply).await.is_err() { + return Ok(()); // chan closed, streaming terminated + } + } + } else { + // process message other than AppendRequest + if let Some(reply) = self.tli.process_msg(&next_msg)? { + if self.reply_tx.send(reply).await.is_err() { + return Ok(()); // chan closed, streaming terminated + } + } + } } } } @@ -210,8 +290,13 @@ struct ComputeConnectionGuard { impl Drop for ComputeConnectionGuard { fn drop(&mut self) { - if let Err(e) = self.timeline.on_compute_disconnect() { - error!("failed to unregister compute connection: {}", e); - } + let tli = self.timeline.clone(); + // tokio forbids to call blocking_send inside the runtime, and see + // comments in on_compute_disconnect why we call blocking_send. + spawn_blocking(move || { + if let Err(e) = tli.on_compute_disconnect() { + error!("failed to unregister compute connection: {}", e); + } + }); } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 7df347427e..4a046cb048 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -488,7 +488,7 @@ impl AcceptorProposerMessage { buf.put_u64_le(msg.hs_feedback.xmin); buf.put_u64_le(msg.hs_feedback.catalog_xmin); - msg.pageserver_feedback.serialize(buf)? 
+ msg.pageserver_feedback.serialize(buf)?; } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 169ab03f0a..e8c1b4c02e 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,24 +5,22 @@ use crate::handler::SafekeeperPostgresHandler; use crate::timeline::{ReplicaState, Timeline}; use crate::wal_storage::WalReader; use crate::GlobalTimelines; -use anyhow::Context; - +use anyhow::Context as AnyhowContext; use bytes::Bytes; +use postgres_backend::PostgresBackend; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; +use pq_proto::{BeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use std::cmp::min; -use std::net::Shutdown; +use std::str; use std::sync::Arc; use std::time::Duration; -use std::{io, str, thread}; -use utils::postgres_backend::QueryError; - -use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; use tokio::sync::watch::Receiver; use tokio::time::timeout; use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn, postgres_backend::PostgresBackend, sock_split::ReadStream}; +use utils::{bin_ser::BeSer, lsn::Lsn}; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; @@ -60,13 +58,6 @@ pub struct StandbyReply { pub reply_requested: bool, } -/// A network connection that's speaking the replication protocol. -pub struct ReplicationConn { - /// This is an `Option` because we will spawn a background thread that will - /// `take` it from us. 
- stream_in: Option, -} - /// Scope guard to unregister replication connection from timeline struct ReplicationConnGuard { replica: usize, // replica internal ID assigned by timeline @@ -79,230 +70,275 @@ impl Drop for ReplicationConnGuard { } } -impl ReplicationConn { - /// Create a new `ReplicationConn` - pub fn new(pgb: &mut PostgresBackend) -> Self { - Self { - stream_in: pgb.take_stream_in(), +impl SafekeeperPostgresHandler { + /// Wrapper around handle_start_replication_guts handling result. Error is + /// handled here while we're still in walsender ttid span; with API + /// extension, this can probably be moved into postgres_backend. + pub async fn handle_start_replication( + &mut self, + pgb: &mut PostgresBackend, + start_pos: Lsn, + ) -> Result<(), QueryError> { + if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await { + // Log the result and probably send it to the client, closing the stream. + pgb.handle_copy_stream_end(end).await; } - } - - /// Handle incoming messages from the network. - /// This is spawned into the background by `handle_start_replication`. - fn background_thread( - mut stream_in: ReadStream, - replica_guard: Arc, - ) -> anyhow::Result<()> { - let replica_id = replica_guard.replica; - let timeline = &replica_guard.timeline; - - let mut state = ReplicaState::new(); - // Wait for replica's feedback. - while let Some(msg) = FeMessage::read(&mut stream_in)? { - match &msg { - FeMessage::CopyData(m) => { - // There's three possible data messages that the client is supposed to send here: - // `HotStandbyFeedback` and `StandbyStatusUpdate` and `NeonStandbyFeedback`. - - match m.first().cloned() { - Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { - // Note: deserializing is on m[1..] because we skip the tag byte. 
- state.hs_feedback = HotStandbyFeedback::des(&m[1..]) - .context("failed to deserialize HotStandbyFeedback")?; - timeline.update_replica_state(replica_id, state); - } - Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => { - let _reply = StandbyReply::des(&m[1..]) - .context("failed to deserialize StandbyReply")?; - // This must be a regular postgres replica, - // because pageserver doesn't send this type of messages to safekeeper. - // Currently this is not implemented, so this message is ignored. - - warn!("unexpected StandbyReply. Read-only postgres replicas are not supported in safekeepers yet."); - // timeline.update_replica_state(replica_id, Some(state)); - } - Some(NEON_STATUS_UPDATE_TAG_BYTE) => { - // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. - let buf = Bytes::copy_from_slice(&m[9..]); - let reply = ReplicationFeedback::parse(buf); - - trace!("ReplicationFeedback is {:?}", reply); - // Only pageserver sends ReplicationFeedback, so set the flag. - // This replica is the source of information to resend to compute. - state.pageserver_feedback = Some(reply); - - timeline.update_replica_state(replica_id, state); - } - _ => warn!("unexpected message {:?}", msg), - } - } - FeMessage::Sync => {} - FeMessage::CopyFail => { - // Shutdown the connection, because rust-postgres client cannot be dropped - // when connection is alive. - let _ = stream_in.shutdown(Shutdown::Both); - anyhow::bail!("Copy failed"); - } - _ => { - // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored. 
- info!("unexpected message {:?}", msg); - } - } - } - Ok(()) } - /// - /// Handle START_REPLICATION replication command - /// - pub fn run( + pub async fn handle_start_replication_guts( &mut self, - spg: &mut SafekeeperPostgresHandler, pgb: &mut PostgresBackend, - mut start_pos: Lsn, - ) -> Result<(), QueryError> { - let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered(); - - let tli = GlobalTimelines::get(spg.ttid).map_err(|e| QueryError::Other(e.into()))?; - - // spawn the background thread which receives HotStandbyFeedback messages. - let bg_timeline = Arc::clone(&tli); - let bg_stream_in = self.stream_in.take().unwrap(); - let bg_timeline_id = spg.timeline_id.unwrap(); + start_pos: Lsn, + ) -> Result<(), CopyStreamHandlerEnd> { + let appname = self.appname.clone(); + let tli = + GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?; let state = ReplicaState::new(); // This replica_id is used below to check if it's time to stop replication. - let replica_id = bg_timeline.add_replica(state); + let replica_id = tli.add_replica(state); // Use a guard object to remove our entry from the timeline, when the background // thread and us have both finished using it. - let replica_guard = Arc::new(ReplicationConnGuard { + let _guard = Arc::new(ReplicationConnGuard { replica: replica_id, - timeline: bg_timeline, + timeline: tli.clone(), }); - let bg_replica_guard = Arc::clone(&replica_guard); - // TODO: here we got two threads, one for writing WAL and one for receiving - // feedback. If one of them fails, we should shutdown the other one too. 
- let _ = thread::Builder::new() - .name("HotStandbyFeedback thread".into()) - .spawn(move || { - let _enter = - info_span!("HotStandbyFeedback thread", timeline = %bg_timeline_id).entered(); - if let Err(err) = Self::background_thread(bg_stream_in, bg_replica_guard) { - error!("Replication background thread failed: {}", err); + // Walproposer gets special handling: safekeeper must give proposer all + // local WAL till the end, whether committed or not (walproposer will + // hang otherwise). That's because walproposer runs the consensus and + // synchronizes safekeepers on the most advanced one. + // + // There is a small risk of this WAL getting concurrently garbaged if + // another compute rises which collects majority and starts fixing log + // on this safekeeper itself. That's ok as (old) proposer will never be + // able to commit such WAL. + let stop_pos: Option = if self.is_walproposer_recovery() { + let wal_end = tli.get_flush_lsn(); + Some(wal_end) + } else { + None + }; + let end_pos = stop_pos.unwrap_or(Lsn::INVALID); + + info!( + "starting streaming from {:?} till {:?}", + start_pos, stop_pos + ); + + // switch to copy + pgb.write_message(&BeMessage::CopyBothResponse).await?; + + let (_, persisted_state) = tli.get_state(); + let wal_reader = WalReader::new( + self.conf.workdir.clone(), + self.conf.timeline_dir(&tli.ttid), + &persisted_state, + start_pos, + self.conf.wal_backup_enabled, + )?; + + // Split to concurrently receive and send data; replies are generally + // not synchronized with sends, so this avoids deadlocks. + let reader = pgb.split().context("START_REPLICATION split")?; + + let mut sender = WalSender { + pgb, + tli: tli.clone(), + appname, + start_pos, + end_pos, + stop_pos, + commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + replica_id, + wal_reader, + send_buf: [0; MAX_SEND_SIZE], + }; + let mut reply_reader = ReplyReader { + reader, + tli, + replica_id, + feedback: ReplicaState::new(), + }; + + let res = tokio::select! 
{ + // todo: add read|write .context to these errors + r = sender.run() => r, + r = reply_reader.run() => r, + }; + // Join pg backend back. + pgb.unsplit(reply_reader.reader)?; + + res + } +} + +/// A half driving sending WAL. +struct WalSender<'a> { + pgb: &'a mut PostgresBackend, + tli: Arc, + appname: Option, + // Position since which we are sending next chunk. + start_pos: Lsn, + // WAL up to this position is known to be locally available. + end_pos: Lsn, + // If present, terminate after reaching this position; used by walproposer + // in recovery. + stop_pos: Option, + commit_lsn_watch_rx: Receiver, + replica_id: usize, + wal_reader: WalReader, + // buffer for readling WAL into to send it + send_buf: [u8; MAX_SEND_SIZE], +} + +impl WalSender<'_> { + /// Send WAL until + /// - an error occurs + /// - if we are streaming to walproposer, we've streamed until stop_pos + /// (recovery finished) + /// - receiver is caughtup and there is no computes + /// + /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? + /// convenience. + async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> { + loop { + // If we are streaming to walproposer, check it is time to stop. + if let Some(stop_pos) = self.stop_pos { + if self.start_pos >= stop_pos { + // recovery finished + return Err(CopyStreamHandlerEnd::ServerInitiated(format!( + "ending streaming to walproposer at {}, recovery finished", + self.start_pos + ))); } - })?; - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - runtime.block_on(async move { - let (inmem_state, persisted_state) = tli.get_state(); - // add persisted_state.timeline_start_lsn == Lsn(0) check - - // Walproposer gets special handling: safekeeper must give proposer all - // local WAL till the end, whether committed or not (walproposer will - // hang otherwise). That's because walproposer runs the consensus and - // synchronizes safekeepers on the most advanced one. 
- // - // There is a small risk of this WAL getting concurrently garbaged if - // another compute rises which collects majority and starts fixing log - // on this safekeeper itself. That's ok as (old) proposer will never be - // able to commit such WAL. - let stop_pos: Option = if spg.is_walproposer_recovery() { - let wal_end = tli.get_flush_lsn(); - Some(wal_end) } else { - None - }; + // Wait for the next portion if it is not there yet, or just + // update our end of WAL available for sending value, we + // communicate it to the receiver. + self.wait_wal().await?; + } - info!("Start replication from {:?} till {:?}", start_pos, stop_pos); + // try to send as much as available, capped by MAX_SEND_SIZE + let mut send_size = self + .end_pos + .checked_sub(self.start_pos) + .context("reading wal without waiting for it first")? + .0 as usize; + send_size = min(send_size, self.send_buf.len()); + let send_buf = &mut self.send_buf[..send_size]; + // read wal into buffer + send_size = self.wal_reader.read(send_buf).await?; + let send_buf = &send_buf[..send_size]; - // switch to copy - pgb.write_message(&BeMessage::CopyBothResponse)?; - - let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); - - let mut wal_reader = WalReader::new( - spg.conf.workdir.clone(), - spg.conf.timeline_dir(&tli.ttid), - &persisted_state, - start_pos, - spg.conf.wal_backup_enabled, - )?; - - // buffer for wal sending, limited by MAX_SEND_SIZE - let mut send_buf = vec![0u8; MAX_SEND_SIZE]; - - // watcher for commit_lsn updates - let mut commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); - - loop { - if let Some(stop_pos) = stop_pos { - if start_pos >= stop_pos { - break; /* recovery finished */ - } - end_pos = stop_pos; - } else { - /* Wait until we have some data to stream */ - let lsn = wait_for_lsn(&mut commit_lsn_watch_rx, start_pos).await?; - - if let Some(lsn) = lsn { - end_pos = lsn; - } else { - // TODO: also check once in a while whether we are walsender - // to right pageserver. 
- if tli.should_walsender_stop(replica_id) { - // Shut down, timeline is suspended. - return Err(QueryError::from(io::Error::new( - io::ErrorKind::ConnectionAborted, - format!("end streaming to {:?}", spg.appname), - ))); - } - - // timeout expired: request pageserver status - pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - sent_ptr: end_pos.0, - timestamp: get_current_timestamp(), - request_reply: true, - }))?; - continue; - } - } - - let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize; - let send_size = min(send_size, send_buf.len()); - - let send_buf = &mut send_buf[..send_size]; - - // read wal into buffer - let send_size = wal_reader.read(send_buf).await?; - let send_buf = &send_buf[..send_size]; - - // Write some data to the network socket. - pgb.write_message(&BeMessage::XLogData(XLogDataBody { - wal_start: start_pos.0, - wal_end: end_pos.0, + // and send it + self.pgb + .write_message(&BeMessage::XLogData(XLogDataBody { + wal_start: self.start_pos.0, + wal_end: self.end_pos.0, timestamp: get_current_timestamp(), data: send_buf, })) - .context("Failed to send XLogData")?; + .await?; - start_pos += send_size as u64; - trace!("sent WAL up to {}", start_pos); + trace!( + "sent {} bytes of WAL {}-{}", + send_size, + self.start_pos, + self.start_pos + send_size as u64 + ); + self.start_pos += send_size as u64; + } + } + + /// wait until we have WAL to stream, sending keepalives and checking for + /// exit in the meanwhile + async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> { + loop { + if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? { + self.end_pos = lsn; + return Ok(()); } + // Timed out waiting for WAL, check for termination and send KA + if self.tli.should_walsender_stop(self.replica_id) { + // Terminate if there is nothing more to send. 
+ // TODO close the stream properly + return Err(CopyStreamHandlerEnd::ServerInitiated(format!( + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); + } + self.pgb + .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { + sent_ptr: self.end_pos.0, + timestamp: get_current_timestamp(), + request_reply: true, + })) + .await?; + } + } +} - Ok(()) - }) +/// A half driving receiving replies. +struct ReplyReader { + reader: PostgresBackendReader, + tli: Arc, + replica_id: usize, + feedback: ReplicaState, +} + +impl ReplyReader { + async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> { + loop { + let msg = self.reader.read_copy_message().await?; + self.handle_feedback(&msg)? + } + } + + fn handle_feedback(&mut self, msg: &Bytes) -> anyhow::Result<()> { + match msg.first().cloned() { + Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { + // Note: deserializing is on m[1..] because we skip the tag byte. + self.feedback.hs_feedback = HotStandbyFeedback::des(&msg[1..]) + .context("failed to deserialize HotStandbyFeedback")?; + self.tli + .update_replica_state(self.replica_id, self.feedback); + } + Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => { + let _reply = + StandbyReply::des(&msg[1..]).context("failed to deserialize StandbyReply")?; + // This must be a regular postgres replica, + // because pageserver doesn't send this type of messages to safekeeper. + // Currently we just ignore this, tracking progress for them is not supported. + } + Some(NEON_STATUS_UPDATE_TAG_BYTE) => { + // pageserver sends this. + // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. + let buf = Bytes::copy_from_slice(&msg[9..]); + let reply = ReplicationFeedback::parse(buf); + + trace!("ReplicationFeedback is {:?}", reply); + // Only pageserver sends ReplicationFeedback, so set the flag. + // This replica is the source of information to resend to compute. 
+ self.feedback.pageserver_feedback = Some(reply); + + self.tli + .update_replica_state(self.replica_id, self.feedback); + } + _ => warn!("unexpected message {:?}", msg), + } + Ok(()) } } const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); -// Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn. +/// Wait until we have commit_lsn > lsn or timeout expires. Returns +/// - Ok(Some(commit_lsn)) if needed lsn is successfully observed; +/// - Ok(None) if timeout expired; +/// - Err in case of error (if watch channel is in trouble, shouldn't happen). async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result> { let commit_lsn: Lsn = *rx.borrow(); if commit_lsn > lsn { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 98c565cde4..fca460d998 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,4 +1,4 @@ -//! This module implements Timeline lifecycle management and has all neccessary code +//! This module implements Timeline lifecycle management and has all necessary code //! to glue together SafeKeeper and all other background services. use anyhow::{anyhow, bail, Result}; @@ -532,7 +532,7 @@ impl Timeline { /// Register compute connection, starting timeline-related activity if it is /// not running yet. - pub fn on_compute_connect(&self) -> Result<()> { + pub async fn on_compute_connect(&self) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -546,7 +546,7 @@ impl Timeline { // Wake up wal backup launcher, if offloading not started yet. if is_wal_backup_action_pending { // Can fail only if channel to a static thread got closed, which is not normal at all. - self.wal_backup_launcher_tx.blocking_send(self.ttid)?; + self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } @@ -563,6 +563,11 @@ impl Timeline { // Wake up wal backup launcher, if it is time to stop the offloading. 
if is_wal_backup_action_pending { // Can fail only if channel to a static thread got closed, which is not normal at all. + // + // Note: this is blocking_send because on_compute_disconnect is called in Drop, there is + // no async Drop and we use current thread runtimes. With current thread rt spawning + // task in drop impl is racy, as thread along with runtime might finish before the task. + // This should be switched send.await when/if we go to full async. self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index c99ca0a51a..868ee97645 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -171,7 +171,7 @@ impl GlobalTimelines { /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. - pub fn create( + pub async fn create( ttid: TenantTimelineId, server_info: ServerInfo, commit_lsn: Lsn, @@ -199,28 +199,20 @@ impl GlobalTimelines { // Take a lock and finish the initialization holding this mutex. No other threads // can interfere with creation after we will insert timeline into the map. - let mut shared_state = timeline.write_shared_state(); + { + let mut shared_state = timeline.write_shared_state(); - // We can get a race condition here in case of concurrent create calls, but only - // in theory. create() will return valid timeline on the next try. - TIMELINES_STATE - .lock() - .unwrap() - .try_insert(timeline.clone())?; + // We can get a race condition here in case of concurrent create calls, but only + // in theory. create() will return valid timeline on the next try. + TIMELINES_STATE + .lock() + .unwrap() + .try_insert(timeline.clone())?; - // Write the new timeline to the disk and start background workers. - // Bootstrap is transactional, so if it fails, the timeline will be deleted, - // and the state on disk should remain unchanged. 
- match timeline.bootstrap(&mut shared_state) { - Ok(_) => { - // We are done with bootstrap, release the lock, return the timeline. - drop(shared_state); - timeline - .wal_backup_launcher_tx - .blocking_send(timeline.ttid)?; - Ok(timeline) - } - Err(e) => { + // Write the new timeline to the disk and start background workers. + // Bootstrap is transactional, so if it fails, the timeline will be deleted, + // and the state on disk should remain unchanged. + if let Err(e) = timeline.bootstrap(&mut shared_state) { // Note: the most likely reason for bootstrap failure is that the timeline // directory already exists on disk. This happens when timeline is corrupted // and wasn't loaded from disk on startup because of that. We want to preserve @@ -232,9 +224,13 @@ impl GlobalTimelines { // Timeline failed to bootstrap, it cannot be used. Remove it from the map. TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid); - Err(e) + return Err(e); } + // We are done with bootstrap, release the lock, return the timeline. + // {} block forces release before .await } + timeline.wal_backup_launcher_tx.send(timeline.ttid).await?; + Ok(timeline) } /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, @@ -254,7 +250,7 @@ impl GlobalTimelines { } } - /// Returns all timelines. This is used for background timeline proccesses. + /// Returns all timelines. This is used for background timeline processes. 
pub fn get_all() -> Vec> { let global_lock = TIMELINES_STATE.lock().unwrap(); global_lock diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index fc971ca753..798b9abaf3 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -191,7 +191,7 @@ async fn wal_backup_launcher_main_loop( .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) }); - // Presense in this map means launcher is aware s3 offloading is needed for + // Presence in this map means launcher is aware s3 offloading is needed for // the timeline, but task is started only if it makes sense for to offload // from this safekeeper. let mut tasks: HashMap = HashMap::new(); @@ -467,7 +467,7 @@ async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize pub async fn read_object( file_path: &RemotePath, offset: u64, -) -> anyhow::Result>> { +) -> anyhow::Result>> { let storage = REMOTE_STORAGE .get() .context("Failed to get remote storage")? diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 40448be949..8d63d604ad 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -2,50 +2,65 @@ //! WAL service listens for client connections and //! receive WAL from wal_proposer and send it to WAL receivers //! -use regex::Regex; -use std::net::{TcpListener, TcpStream}; -use std::thread; +use anyhow::{Context, Result}; +use nix::unistd::gettid; +use postgres_backend::QueryError; +use std::{future, thread}; +use tokio::net::TcpStream; use tracing::*; -use utils::postgres_backend::QueryError; use crate::handler::SafekeeperPostgresHandler; use crate::SafeKeeperConf; -use utils::postgres_backend::{AuthType, PostgresBackend}; +use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. -pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> ! 
{ - loop { - match listener.accept() { - Ok((socket, peer_addr)) => { - debug!("accepted connection from {}", peer_addr); - let conf = conf.clone(); +pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .context("create runtime") + // todo catch error in main thread + .expect("failed to create runtime"); - let _ = thread::Builder::new() - .name("WAL service thread".into()) - .spawn(move || { - if let Err(err) = handle_socket(socket, conf) { - error!("connection handler exited: {}", err); - } - }) - .unwrap(); + runtime + .block_on(async move { + // Tokio's from_std won't do this for us, per its comment. + pg_listener.set_nonblocking(true)?; + let listener = tokio::net::TcpListener::from_std(pg_listener)?; + + loop { + match listener.accept().await { + Ok((socket, peer_addr)) => { + debug!("accepted connection from {}", peer_addr); + let conf = conf.clone(); + + let _ = thread::Builder::new() + .name("WAL service thread".into()) + .spawn(move || { + if let Err(err) = handle_socket(socket, conf) { + error!("connection handler exited: {}", err); + } + }) + .unwrap(); + } + Err(e) => error!("Failed to accept connection: {}", e), + } } - Err(e) => error!("Failed to accept connection: {}", e), - } - } -} - -// Get unique thread id (Rust internal), with ThreadId removed for shorter printing -fn get_tid() -> u64 { - let tids = format!("{:?}", thread::current().id()); - let r = Regex::new(r"ThreadId\((\d+)\)").unwrap(); - let caps = r.captures(&tids).unwrap(); - caps.get(1).unwrap().as_str().parse().unwrap() + #[allow(unreachable_code)] // hint compiler the closure return type + Ok::<(), anyhow::Error>(()) + }) + .expect("listener failed") } /// This is run by `thread_main` above, inside a background thread. 
/// fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> { - let _enter = info_span!("", tid = ?get_tid()).entered(); + let _enter = info_span!("", tid = %gettid()).entered(); + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + let local = tokio::task::LocalSet::new(); socket.set_nodelay(true)?; @@ -54,9 +69,13 @@ fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryErr Some(_) => AuthType::NeonJWT, }; let mut conn_handler = SafekeeperPostgresHandler::new(conf); - let pgbackend = PostgresBackend::new(socket, auth_type, None, false)?; - // libpq replication protocol between safekeeper and replicas/pagers - pgbackend.run(&mut conn_handler)?; + let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + // libpq protocol between safekeeper and walproposer / pageserver + // We don't use shutdown. + local.block_on( + &runtime, + pgbackend.run(&mut conn_handler, future::pending::<()>), + )?; Ok(()) } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ae02b3c7bc..9b385630c2 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -471,7 +471,7 @@ pub struct WalReader { timeline_dir: PathBuf, wal_seg_size: usize, pos: Lsn, - wal_segment: Option>>, + wal_segment: Option>>, // S3 will be used to read WAL if LSN is not available locally enable_remote_read: bool, @@ -538,7 +538,7 @@ impl WalReader { } /// Open WAL segment at the current position of the reader. 
- async fn open_segment(&self) -> Result>> { + async fn open_segment(&self) -> Result>> { let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ba98563693..70a6f1809e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2068,8 +2068,10 @@ class NeonPageserver(PgProtocol): ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*", ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*", ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*", + # FIXME: replication patch for tokio_postgres regards any but CopyDone/CopyData message in CopyBoth stream as unexpected + ".*Connection aborted: connection error: unexpected message from server*", ".*kill_and_wait_impl.*: wait successful.*", - ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*", + ".*Replication stream finished: db error:.*ending streaming to Some*", ".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down ".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down # safekeeper connection can fail with this, in the window between timeline creation diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0ac9127c6b..489afb7b93 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1138,8 +1138,8 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # FIXME: are these expected? 
env.pageserver.allowed_errors.extend( [ - ".*Failed to process query for timeline .*: Timeline .* was not found in global map.*", - ".*Failed to process query for timeline .*: Timeline .* was cancelled and cannot be used anymore.*", + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", ] ) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index bd21095fff..f885f4a94d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -14,14 +14,19 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] anyhow = { version = "1", features = ["backtrace"] } +byteorder = { version = "1" } bytes = { version = "1", features = ["serde"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } clap = { version = "4", features = ["derive", "string"] } crossbeam-utils = { version = "0.8" } +digest = { version = "0.10", features = ["mac", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures = { version = "0.3" } +futures-channel = { version = "0.3", features = ["sink"] } +futures-core = { version = "0.3" } futures-executor = { version = "0.3" } +futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } hashbrown = { version = "0.12", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -45,6 +50,7 @@ serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] } +tokio-rustls = { version = "0.23" } tokio-util = { version = "0.7", features = ["codec", "io"] } tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", 
features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } From b80fe41af3e672f242ce4aab6699649f517d6e33 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 Feb 2023 12:03:45 +0400 Subject: [PATCH 122/426] Refactor postgres protocol parsing. 1) Remove allocation and data copy during each message read. Instead, parsing functions now accept BytesMut from which data they form messages, with pointers (e.g. in CopyData) pointing directly into BytesMut buffer. Accordingly, move ConnectionError containing IO error subtype into framed.rs providing this and leave in pq_proto only ProtocolError. 2) Remove anyhow from pq_proto. 3) Move FeStartupPacket out of FeMessage. Now FeStartupPacket::parse returns it directly, eliminating dead code where user wants startup packet but has to match for others. proxy stream.rs is adapted to framed.rs with minimal changes. It also benefits from framed.rs improvements described above. --- Cargo.lock | 2 +- libs/postgres_backend/src/lib.rs | 17 +- libs/pq_proto/Cargo.toml | 2 +- libs/pq_proto/src/framed.rs | 127 +++++-- libs/pq_proto/src/lib.rs | 330 +++++++++--------- pageserver/src/page_service.rs | 12 +- .../walreceiver/walreceiver_connection.rs | 2 +- proxy/src/stream.rs | 61 ++-- safekeeper/src/safekeeper.rs | 2 +- 9 files changed, 316 insertions(+), 239 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e380e72dc0..b96f7dbc99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2747,7 +2747,7 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" name = "pq_proto" version = "0.1.0" dependencies = [ - "anyhow", + "byteorder", "bytes", "pin-project-lite", "postgres-protocol", diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index ba28add9f9..ce46899779 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -17,9 +17,9 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; use tracing::{debug, error, info, trace}; 
-use pq_proto::framed::{Framed, FramedReader, FramedWriter}; +use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::{ - BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR, + BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION, }; @@ -37,7 +37,7 @@ pub enum QueryError { impl From for QueryError { fn from(e: io::Error) -> Self { - Self::Disconnected(ConnectionError::Socket(e)) + Self::Disconnected(ConnectionError::Io(e)) } } @@ -219,7 +219,7 @@ impl MaybeWriteOnly { } } - fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ConnectionError> { + fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> { match self { MaybeWriteOnly::Full(framed) => framed.write_message(msg), MaybeWriteOnly::WriteOnly(framed_writer) => framed_writer.write_message_noflush(msg), @@ -701,8 +701,7 @@ impl PostgresBackend { FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail - | FeMessage::PasswordMessage(_) - | FeMessage::StartupPacket(_) => { + | FeMessage::PasswordMessage(_) => { return Err(QueryError::Other(anyhow::anyhow!( "unexpected message type: {msg:?}", ))); @@ -721,7 +720,7 @@ impl PostgresBackend { let expected_end = match &end { ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true, - CopyStreamHandlerEnd::Disconnected(ConnectionError::Socket(io_error)) + CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { true @@ -800,7 +799,7 @@ impl PostgresBackendReader { FeMessage::CopyFail => Err(CopyStreamHandlerEnd::CopyFail), FeMessage::Terminate => Err(CopyStreamHandlerEnd::Terminate), _ => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol( - format!("unexpected message in COPY stream {:?}", msg), + ProtocolError::Protocol(format!("unexpected message in COPY stream {:?}", msg)), ))), }, None => Err(CopyStreamHandlerEnd::EOF), @@ 
-871,7 +870,7 @@ pub fn short_error(e: &QueryError) -> String { fn log_query_error(query: &str, e: &QueryError) { match e { - QueryError::Disconnected(ConnectionError::Socket(io_error)) => { + QueryError::Disconnected(ConnectionError::Io(io_error)) => { if is_expected_io_error(io_error) { info!("query handler for '{query}' failed with expected io error: {io_error}"); } else { diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index bc90a7a2c1..76b71729ed 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -5,8 +5,8 @@ edition.workspace = true license.workspace = true [dependencies] -anyhow.workspace = true bytes.workspace = true +byteorder.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 7c33222e6e..972730cbab 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -1,51 +1,84 @@ //! Provides `Framed` -- writing/flushing and reading Postgres messages to/from -//! the async stream. +//! the async stream based on (and buffered with) BytesMut. All functions are +//! cancellation safe. +//! +//! It is similar to what tokio_util::codec::Framed with appropriate codec +//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used +//! separately without using split from futures::stream::StreamExt (which +//! allocates box[1] in polling internally). tokio::io::split is used for splitting +//! instead. Plus we customize error messages more than a single type for all io +//! calls. +//! +//! 
[1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 use bytes::{Buf, BytesMut}; use std::{ future::Future, io::{self, ErrorKind}, }; -use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader, ReadHalf, WriteHalf}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf}; -use crate::{BeMessage, ConnectionError, FeMessage, FeStartupPacket}; +use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; const INITIAL_CAPACITY: usize = 8 * 1024; +/// Error on postgres connection: either IO (physical transport error) or +/// protocol violation. +#[derive(thiserror::Error, Debug)] +pub enum ConnectionError { + #[error(transparent)] + Io(#[from] io::Error), + #[error(transparent)] + Protocol(#[from] ProtocolError), +} + +impl ConnectionError { + /// Proxy stream.rs uses only io::Error; provide it. + pub fn into_io_error(self) -> io::Error { + match self { + ConnectionError::Io(io) => io, + ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()), + } + } +} + /// Wraps async io `stream`, providing messages to write/flush + read Postgres /// messages. pub struct Framed { - stream: BufReader, + stream: S, + read_buf: BytesMut, write_buf: BytesMut, } -impl Framed { +impl Framed { pub fn new(stream: S) -> Self { Self { - stream: BufReader::new(stream), + stream, + read_buf: BytesMut::with_capacity(INITIAL_CAPACITY), write_buf: BytesMut::with_capacity(INITIAL_CAPACITY), } } /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { - self.stream.get_ref() + &self.stream } /// Extract the underlying stream. pub fn into_inner(self) -> S { - self.stream.into_inner() + self.stream } /// Return new Framed with stream type transformed by async f, for TLS /// upgrade. 
- pub async fn map_stream(self, f: F) -> Result, E> + pub async fn map_stream(self, f: F) -> Result, E> where F: FnOnce(S) -> Fut, Fut: Future>, { - let stream = f(self.stream.into_inner()).await?; + let stream = f(self.stream).await?; Ok(Framed { - stream: BufReader::new(stream), + stream, + read_buf: self.read_buf, write_buf: self.write_buf, }) } @@ -55,24 +88,18 @@ impl Framed { pub async fn read_startup_message( &mut self, ) -> Result, ConnectionError> { - let msg = FeStartupPacket::read(&mut self.stream).await?; - - match msg { - Some(FeMessage::StartupPacket(packet)) => Ok(Some(packet)), - None => Ok(None), - _ => panic!("unreachable state"), - } + read_message(&mut self.stream, &mut self.read_buf, FeStartupPacket::parse).await } pub async fn read_message(&mut self) -> Result, ConnectionError> { - FeMessage::read(&mut self.stream).await + read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await } } -impl Framed { +impl Framed { /// Write next message to the output buffer; doesn't flush. - pub fn write_message(&mut self, msg: &BeMessage<'_>) -> Result<(), ConnectionError> { - BeMessage::write(&mut self.write_buf, msg).map_err(|e| e.into()) + pub fn write_message(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> { + BeMessage::write(&mut self.write_buf, msg) } /// Flush out the buffer. 
This function is cancellation safe: it can be @@ -93,7 +120,10 @@ impl Framed { /// https://github.com/tokio-rs/tls/issues/40 pub fn split(self) -> (FramedReader, FramedWriter) { let (read_half, write_half) = tokio::io::split(self.stream); - let reader = FramedReader { stream: read_half }; + let reader = FramedReader { + stream: read_half, + read_buf: self.read_buf, + }; let writer = FramedWriter { stream: write_half, write_buf: self.write_buf, @@ -105,6 +135,7 @@ impl Framed { pub fn unsplit(reader: FramedReader, writer: FramedWriter) -> Self { Self { stream: reader.stream.unsplit(writer.stream), + read_buf: reader.read_buf, write_buf: writer.write_buf, } } @@ -112,25 +143,26 @@ impl Framed { /// Read-only version of `Framed`. pub struct FramedReader { - stream: ReadHalf>, + stream: ReadHalf, + read_buf: BytesMut, } impl FramedReader { pub async fn read_message(&mut self) -> Result, ConnectionError> { - FeMessage::read(&mut self.stream).await + read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await } } /// Write-only version of `Framed`. pub struct FramedWriter { - stream: WriteHalf>, + stream: WriteHalf, write_buf: BytesMut, } -impl FramedWriter { +impl FramedWriter { /// Write next message to the output buffer; doesn't flush. - pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ConnectionError> { - BeMessage::write(&mut self.write_buf, msg).map_err(|e| e.into()) + pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> { + BeMessage::write(&mut self.write_buf, msg) } /// Flush out the buffer. This function is cancellation safe: it can be @@ -145,6 +177,43 @@ impl FramedWriter { } } +/// Read next message from the stream. Returns Ok(None), if EOF happened and we +/// don't have remaining data in the buffer. This function is cancellation safe: +/// you can drop future which is not yet complete and finalize reading message +/// with the next call. 
+/// +/// Parametrized to allow reading startup or usual message, having different +/// format. +async fn read_message( + stream: &mut S, + read_buf: &mut BytesMut, + parse: P, +) -> Result, ConnectionError> +where + P: Fn(&mut BytesMut) -> Result, ProtocolError>, +{ + loop { + if let Some(msg) = parse(read_buf)? { + return Ok(Some(msg)); + } + // If we can't build a frame yet, try to read more data and try again. + // Make sure we've got room for at least one byte to read to ensure + // that we don't get a spurious 0 that looks like EOF. + read_buf.reserve(1); + if stream.read_buf(read_buf).await? == 0 { + if read_buf.has_remaining() { + return Err(io::Error::new( + ErrorKind::UnexpectedEof, + "EOF with unprocessed data in the buffer", + ) + .into()); + } else { + return Ok(None); // clean EOF + } + } + } +} + async fn flush( stream: &mut S, write_buf: &mut BytesMut, diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 6980c4afae..46d531239a 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -4,19 +4,16 @@ pub mod framed; -use anyhow::{ensure, Context, Result}; +use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_protocol::PG_EPOCH; use serde::{Deserialize, Serialize}; use std::{ borrow::Cow, collections::HashMap, - fmt, - io::{self, Cursor}, - str, + fmt, io, str, time::{Duration, SystemTime}, }; -use tokio::io::AsyncReadExt; use tracing::{trace, warn}; pub type Oid = u32; @@ -28,7 +25,6 @@ pub const TEXT_OID: Oid = 25; #[derive(Debug)] pub enum FeMessage { - StartupPacket(FeStartupPacket), // Simple query. Query(Bytes), // Extended query protocol. @@ -188,100 +184,90 @@ pub struct FeExecuteMessage { #[derive(Debug)] pub struct FeCloseMessage; -/// Retry a read on EINTR -/// -/// This runs the enclosed expression, and if it returns -/// Err(io::ErrorKind::Interrupted), retries it. -macro_rules! 
retry_read { - ( $x:expr ) => { - loop { - match $x { - Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, - res => break res, - } - } - }; -} - -/// An error occured during connection being open. +/// An error occured while parsing or serializing raw stream into Postgres +/// messages. #[derive(thiserror::Error, Debug)] -pub enum ConnectionError { - /// IO error during writing to or reading from the connection socket. - #[error("Socket IO error: {0}")] - Socket(#[from] std::io::Error), - /// Invalid packet was received from client +pub enum ProtocolError { + /// Invalid packet was received from the client (e.g. unexpected message + /// type or broken len). #[error("Protocol error: {0}")] Protocol(String), - /// Failed to parse a protocol mesage + /// Failed to parse or, (unlikely), serialize a protocol message. #[error("Message parse error: {0}")] - MessageParse(anyhow::Error), + BadMessage(String), } -impl From for ConnectionError { - fn from(e: anyhow::Error) -> Self { - Self::MessageParse(e) - } -} - -impl ConnectionError { +impl ProtocolError { + /// Proxy stream.rs uses only io::Error; provide it. pub fn into_io_error(self) -> io::Error { - match self { - ConnectionError::Socket(io) => io, - other => io::Error::new(io::ErrorKind::Other, other.to_string()), - } + io::Error::new(io::ErrorKind::Other, self.to_string()) } } impl FeMessage { - /// Read one message from the stream. - /// This function returns `Ok(None)` in case of EOF. - pub async fn read(stream: &mut Reader) -> Result, ConnectionError> - where - Reader: tokio::io::AsyncRead + Unpin, - { - // We return a Future that's sync (has a `wait` method) if and only if the provided stream is SyncProof. - // SyncFuture contract: we are only allowed to await on sync-proof futures, the AsyncRead and - // AsyncReadExt methods of the stream. - // Each libpq message begins with a message type byte, followed by message length - // If the client closes the connection, return None. 
But if the client closes the - // connection in the middle of a message, we will return an error. - let tag = match retry_read!(stream.read_u8().await) { - Ok(b) => b, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(ConnectionError::Socket(e)), - }; + /// Read and parse one message from the `buf` input buffer. If there is at + /// least one valid message, returns it, advancing `buf`; redundant copies + /// are avoided, as thanks to `bytes` crate ptrs in parsed message point + /// directly into the `buf` (processed data is garbage collected after + /// parsed message is dropped). + /// + /// Returns None if `buf` doesn't contain enough data for a single message. + /// For efficiency, tries to reserve large enough space in `buf` for the + /// next message in this case to save the repeated calls. + /// + /// Returns Error if message is malformed, the only possible ErrorKind is + /// InvalidInput. + // + // Inspired by rust-postgres Message::parse. + pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { + // Every message contains message type byte and 4 bytes len; can't do + // much without them. + if buf.len() < 5 { + let to_read = 5 - buf.len(); + buf.reserve(to_read); + return Ok(None); + } - // The message length includes itself, so it better be at least 4. - let len = retry_read!(stream.read_u32().await) - .map_err(ConnectionError::Socket)? - .checked_sub(4) - .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?; + // We shouldn't advance `buf` as probably full message is not there yet, + // so can't directly use Bytes::get_u32 etc. 
+ let tag = buf[0]; + let len = (&buf[1..5]).read_u32::().unwrap(); + if len < 4 { + return Err(ProtocolError::Protocol(format!( + "invalid message length {}", + len + ))); + } - let body = { - let mut buffer = vec![0u8; len as usize]; - stream - .read_exact(&mut buffer) - .await - .map_err(ConnectionError::Socket)?; - Bytes::from(buffer) - }; + // length field includes itself, but not message type. + let total_len = len as usize + 1; + if buf.len() < total_len { + // Don't have full message yet. + let to_read = total_len - buf.len(); + buf.reserve(to_read); + return Ok(None); + } + + // got the message, advance buffer + let mut msg = buf.split_to(total_len).freeze(); + msg.advance(5); // consume message type and len match tag { - b'Q' => Ok(Some(FeMessage::Query(body))), - b'P' => Ok(Some(FeParseMessage::parse(body)?)), - b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), - b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), - b'B' => Ok(Some(FeBindMessage::parse(body)?)), - b'C' => Ok(Some(FeCloseMessage::parse(body)?)), + b'Q' => Ok(Some(FeMessage::Query(msg))), + b'P' => Ok(Some(FeParseMessage::parse(msg)?)), + b'D' => Ok(Some(FeDescribeMessage::parse(msg)?)), + b'E' => Ok(Some(FeExecuteMessage::parse(msg)?)), + b'B' => Ok(Some(FeBindMessage::parse(msg)?)), + b'C' => Ok(Some(FeCloseMessage::parse(msg)?)), b'S' => Ok(Some(FeMessage::Sync)), b'X' => Ok(Some(FeMessage::Terminate)), - b'd' => Ok(Some(FeMessage::CopyData(body))), + b'd' => Ok(Some(FeMessage::CopyData(msg))), b'c' => Ok(Some(FeMessage::CopyDone)), b'f' => Ok(Some(FeMessage::CopyFail)), - b'p' => Ok(Some(FeMessage::PasswordMessage(body))), + b'p' => Ok(Some(FeMessage::PasswordMessage(msg))), tag => { - return Err(ConnectionError::Protocol(format!( - "unknown message tag: {tag},'{body:?}'" + return Err(ProtocolError::Protocol(format!( + "unknown message tag: {tag},'{msg:?}'" ))) } } @@ -289,60 +275,59 @@ impl FeMessage { } impl FeStartupPacket { - /// Read startup message from the stream. 
- // XXX: It's tempting yet undesirable to accept `stream` by value, - // since such a change will cause user-supplied &mut references to be consumed - pub async fn read(stream: &mut Reader) -> Result, ConnectionError> - where - Reader: tokio::io::AsyncRead + Unpin, - { + /// Read and parse startup message from the `buf` input buffer. It is + /// different from [`FeMessage::parse`] because startup messages don't have + /// message type byte; otherwise, its comments apply. + pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { const MAX_STARTUP_PACKET_LENGTH: usize = 10000; const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234; const CANCEL_REQUEST_CODE: u32 = 5678; const NEGOTIATE_SSL_CODE: u32 = 5679; const NEGOTIATE_GSS_CODE: u32 = 5680; - // Read length. If the connection is closed before reading anything (or before - // reading 4 bytes, to be precise), return None to indicate that the connection - // was closed. This matches the PostgreSQL server's behavior, which avoids noise - // in the log if the client opens connection but closes it immediately. - let len = match retry_read!(stream.read_u32().await) { - Ok(len) => len as usize, - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(ConnectionError::Socket(e)), - }; + // need at least 4 bytes with packet len + if buf.len() < 4 { + let to_read = 4 - buf.len(); + buf.reserve(to_read); + return Ok(None); + } - #[allow(clippy::manual_range_contains)] + // We shouldn't advance `buf` as probably full message is not there yet, + // so can't directly use Bytes::get_u32 etc. 
+ let len = (&buf[0..4]).read_u32::().unwrap() as usize; if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { - return Err(ConnectionError::Protocol(format!( - "invalid message length {len}" + return Err(ProtocolError::Protocol(format!( + "invalid startup packet message length {}", + len ))); } - let request_code = retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?; + if buf.len() < len { + // Don't have full message yet. + let to_read = len - buf.len(); + buf.reserve(to_read); + return Ok(None); + } - // the rest of startup packet are params - let params_len = len - 8; - let mut params_bytes = vec![0u8; params_len]; - stream - .read_exact(params_bytes.as_mut()) - .await - .map_err(ConnectionError::Socket)?; + // got the message, advance buffer + let mut msg = buf.split_to(len).freeze(); + msg.advance(4); // consume len - // Parse params depending on request code + let request_code = msg.get_u32(); let req_hi = request_code >> 16; let req_lo = request_code & ((1 << 16) - 1); + // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. 
let message = match (req_hi, req_lo) { (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { - if params_len != 8 { - return Err(ConnectionError::Protocol( - "expected 8 bytes for CancelRequest params".to_string(), + if msg.remaining() != 8 { + return Err(ProtocolError::BadMessage( + "CancelRequest message is malformed, backend PID / secret key missing" + .to_owned(), )); } - let mut cursor = Cursor::new(params_bytes); FeStartupPacket::CancelRequest(CancelKeyData { - backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?, - cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?, + backend_pid: msg.get_i32(), + cancel_key: msg.get_i32(), }) } (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { @@ -354,19 +339,23 @@ impl FeStartupPacket { FeStartupPacket::GssEncRequest } (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { - return Err(ConnectionError::Protocol(format!( + return Err(ProtocolError::Protocol(format!( "Unrecognized request code {unrecognized_code}" ))); } // TODO bail if protocol major_version is not 3? (major_version, minor_version) => { + // StartupMessage + // Parse pairs of null-terminated strings (key, value). // See `postgres: ProcessStartupPacket, build_startup_packet`. - let mut tokens = str::from_utf8(¶ms_bytes) - .context("StartupMessage params: invalid utf-8")? + let mut tokens = str::from_utf8(&msg) + .map_err(|_e| { + ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) + })? .strip_suffix('\0') // drop packet's own null .ok_or_else(|| { - ConnectionError::Protocol( + ProtocolError::Protocol( "StartupMessage params: missing null terminator".to_string(), ) })? 
@@ -375,7 +364,7 @@ impl FeStartupPacket { let mut params = HashMap::new(); while let Some(name) = tokens.next() { let value = tokens.next().ok_or_else(|| { - ConnectionError::Protocol( + ProtocolError::Protocol( "StartupMessage params: key without value".to_string(), ) })?; @@ -390,13 +379,12 @@ impl FeStartupPacket { } } }; - - Ok(Some(FeMessage::StartupPacket(message))) + Ok(Some(message)) } } impl FeParseMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { // FIXME: the rust-postgres driver uses a named prepared statement // for copy_out(). We're not prepared to handle that correctly. For // now, just ignore the statement name, assuming that the client never @@ -404,55 +392,82 @@ impl FeParseMessage { let _pstmt_name = read_cstr(&mut buf)?; let query_string = read_cstr(&mut buf)?; + if buf.remaining() < 2 { + return Err(ProtocolError::BadMessage( + "Parse message is malformed, nparams missing".to_string(), + )); + } let nparams = buf.get_i16(); - ensure!(nparams == 0, "query params not implemented"); + if nparams != 0 { + return Err(ProtocolError::BadMessage( + "query params not implemented".to_string(), + )); + } Ok(FeMessage::Parse(FeParseMessage { query_string })) } } impl FeDescribeMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { let kind = buf.get_u8(); let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - ensure!( - kind == b'S', - "only prepared statemement Describe is implemented" - ); + if kind != b'S' { + return Err(ProtocolError::BadMessage( + "only prepared statemement Describe is implemented".to_string(), + )); + } Ok(FeMessage::Describe(FeDescribeMessage { kind })) } } impl FeExecuteMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { let portal_name = read_cstr(&mut buf)?; + if buf.remaining() < 4 { + return Err(ProtocolError::BadMessage( + "FeExecuteMessage message is malformed, maxrows 
missing".to_string(), + )); + } let maxrows = buf.get_i32(); - ensure!(portal_name.is_empty(), "named portals not implemented"); - ensure!(maxrows == 0, "row limit in Execute message not implemented"); + if !portal_name.is_empty() { + return Err(ProtocolError::BadMessage( + "named portals not implemented".to_string(), + )); + } + if maxrows != 0 { + return Err(ProtocolError::BadMessage( + "row limit in Execute message not implemented".to_string(), + )); + } Ok(FeMessage::Execute(FeExecuteMessage { maxrows })) } } impl FeBindMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { let portal_name = read_cstr(&mut buf)?; let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - ensure!(portal_name.is_empty(), "named portals not implemented"); + if !portal_name.is_empty() { + return Err(ProtocolError::BadMessage( + "named portals not implemented".to_string(), + )); + } Ok(FeMessage::Bind(FeBindMessage)) } } impl FeCloseMessage { - fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> Result { let _kind = buf.get_u8(); let _pstmt_or_portal_name = read_cstr(&mut buf)?; @@ -481,6 +496,7 @@ pub enum BeMessage<'a> { CloseComplete, // None means column is NULL DataRow(&'a [Option<&'a [u8]>]), + // None errcode means internal_error will be sent. ErrorResponse(&'a str, Option<&'a [u8; 5]>), /// Single byte - used in response to SSLRequest/GSSENCRequest. EncryptionResponse(bool), @@ -594,7 +610,7 @@ impl RowDescriptor<'_> { #[derive(Debug)] pub struct XLogDataBody<'a> { pub wal_start: u64, - pub wal_end: u64, + pub wal_end: u64, // current end of WAL on the server pub timestamp: i64, pub data: &'a [u8], } @@ -634,12 +650,11 @@ fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { } /// Safe write of s into buf as cstring (String in the protocol). 
-fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> { +fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolError> { let bytes = s.as_ref(); if bytes.contains(&0) { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "string contains embedded null", + return Err(ProtocolError::BadMessage( + "string contains embedded null".to_owned(), )); } buf.put_slice(bytes); @@ -647,9 +662,13 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> { Ok(()) } -fn read_cstr(buf: &mut Bytes) -> anyhow::Result { - let pos = buf.iter().position(|x| *x == 0); - let result = buf.split_to(pos.context("missing terminator")?); +/// Read cstring from buf, advancing it. +fn read_cstr(buf: &mut Bytes) -> Result { + let pos = buf + .iter() + .position(|x| *x == 0) + .ok_or_else(|| ProtocolError::BadMessage("missing cstring terminator".to_owned()))?; + let result = buf.split_to(pos); buf.advance(1); // drop the null terminator Ok(result) } @@ -658,12 +677,12 @@ pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000"; impl<'a> BeMessage<'a> { - /// Write message to the given buf. - // Unlike the reading side, we use BytesMut - // here as msg len precedes its body and it is handy to write it down first - // and then fill the length. With Write we would have to either calc it - // manually or have one more buffer. - pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> { + /// Serialize `message` to the given `buf`. + /// Apart from smart memory managemet, BytesMut is good here as msg len + /// precedes its body and it is handy to write it down first and then fill + /// the length. With Write we would have to either calc it manually or have + /// one more buffer. 
+ pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> { match message { BeMessage::AuthenticationOk => { buf.put_u8(b'R'); @@ -708,7 +727,7 @@ impl<'a> BeMessage<'a> { buf.put_slice(extra); } } - Ok::<_, io::Error>(()) + Ok(()) })?; } @@ -812,7 +831,7 @@ impl<'a> BeMessage<'a> { write_cstr(error_msg, buf)?; buf.put_u8(0); // terminator - Ok::<_, io::Error>(()) + Ok(()) })?; } @@ -835,7 +854,7 @@ impl<'a> BeMessage<'a> { write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator - Ok::<_, io::Error>(()) + Ok(()) })?; } @@ -890,7 +909,7 @@ impl<'a> BeMessage<'a> { buf.put_i32(-1); /* typmod */ buf.put_i16(0); /* format code */ } - Ok::<_, io::Error>(()) + Ok(()) })?; } @@ -957,7 +976,7 @@ impl ReplicationFeedback { // null-terminated string - key, // uint32 - value length in bytes // value itself - pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { + pub fn serialize(&self, buf: &mut BytesMut) { buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); @@ -982,7 +1001,6 @@ impl ReplicationFeedback { buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); - Ok(()) } // Deserialize ReplicationFeedback message @@ -1050,7 +1068,7 @@ mod tests { // because it is rounded up to microseconds during serialization. rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - rf.serialize(&mut data).unwrap(); + rf.serialize(&mut data); let rf_parsed = ReplicationFeedback::parse(data.freeze()); assert_eq!(rf, rf_parsed); @@ -1065,7 +1083,7 @@ mod tests { // because it is rounded up to microseconds during serialization. 
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - rf.serialize(&mut data).unwrap(); + rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bdcd71a20f..40e11a70b7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -21,7 +21,7 @@ use pageserver_api::models::{ PagestreamNblocksRequest, PagestreamNblocksResponse, }; use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; -use pq_proto::ConnectionError; +use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; @@ -78,7 +78,7 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream continue, FeMessage::Terminate => { let msg = "client terminated connection with Terminate message during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); // error can't happen here, ErrorResponse serialization should be always ok pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; @@ -97,13 +97,13 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream { let msg = "client closed connection during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg))); + let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); // error can't happen here, ErrorResponse serialization should be always ok 
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; pgb.flush().await?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; } - Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { + Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { Err(io_error)?; } Err(other) => { @@ -214,7 +214,7 @@ async fn page_service_conn_main( // we've been requested to shut down Ok(()) } - Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { + Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { if is_expected_io_error(&io_error) { info!("Postgres client disconnected ({io_error})"); Ok(()) @@ -1057,7 +1057,7 @@ impl From for QueryError { fn from(e: GetActiveTenantError) -> Self { match e { GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( - ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), + ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), GetActiveTenantError::Other(e) => QueryError::Other(e), } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 41ac61b7b6..7194a4f3ed 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -354,7 +354,7 @@ pub async fn handle_walreceiver_connection( debug!("neon_status_update {status_update:?}"); let mut data = BytesMut::new(); - status_update.serialize(&mut data)?; + status_update.serialize(&mut data); physical_stream .as_mut() .zenith_status_update(data.len() as u64, &data) diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index e0cf1326b9..5a802dafb2 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,45 +1,40 @@ use crate::error::UserFacingError; use anyhow::bail; -use bytes::BytesMut; use 
pin_project_lite::pin_project; -use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket}; +use pq_proto::framed::{ConnectionError, Framed}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; -pin_project! { - /// Stream wrapper which implements libpq's protocol. - /// NOTE: This object deliberately doesn't implement [`AsyncRead`] - /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying - /// to pass random malformed bytes through the connection). - pub struct PqStream { - #[pin] - stream: S, - buffer: BytesMut, - } +/// Stream wrapper which implements libpq's protocol. +/// NOTE: This object deliberately doesn't implement [`AsyncRead`] +/// or [`AsyncWrite`] to prevent subtle errors (e.g. trying +/// to pass random malformed bytes through the connection). +pub struct PqStream { + framed: Framed, } impl PqStream { /// Construct a new libpq protocol wrapper. pub fn new(stream: S) -> Self { Self { - stream, - buffer: Default::default(), + framed: Framed::new(stream), } } /// Extract the underlying stream. pub fn into_inner(self) -> S { - self.stream + self.framed.into_inner() } /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { - &self.stream + self.framed.get_ref() } } @@ -50,16 +45,19 @@ fn err_connection() -> io::Error { impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. pub async fn read_startup_packet(&mut self) -> io::Result { - // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` - let msg = FeStartupPacket::read(&mut self.stream) + self.framed + .read_startup_message() .await .map_err(ConnectionError::into_io_error)? 
- .ok_or_else(err_connection)?; + .ok_or_else(err_connection) + } - match msg { - FeMessage::StartupPacket(packet) => Ok(packet), - _ => panic!("unreachable state"), - } + async fn read_message(&mut self) -> io::Result { + self.framed + .read_message() + .await + .map_err(ConnectionError::into_io_error)? + .ok_or_else(err_connection) } pub async fn read_password_message(&mut self) -> io::Result { @@ -71,19 +69,14 @@ impl PqStream { )), } } - - async fn read_message(&mut self) -> io::Result { - FeMessage::read(&mut self.stream) - .await - .map_err(ConnectionError::into_io_error)? - .ok_or_else(err_connection) - } } impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { - BeMessage::write(&mut self.buffer, message)?; + self.framed + .write_message(message) + .map_err(ProtocolError::into_io_error)?; Ok(self) } @@ -96,9 +89,7 @@ impl PqStream { /// Flush the output buffer into the underlying stream. 
pub async fn flush(&mut self) -> io::Result<&mut Self> { - self.stream.write_all(&self.buffer).await?; - self.buffer.clear(); - self.stream.flush().await?; + self.framed.flush().await?; Ok(self) } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4a046cb048..d8fe36d7f8 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -488,7 +488,7 @@ impl AcceptorProposerMessage { buf.put_u64_le(msg.hs_feedback.xmin); buf.put_u64_le(msg.hs_feedback.catalog_xmin); - msg.pageserver_feedback.serialize(buf)?; + msg.pageserver_feedback.serialize(buf); } } From a34e78d0841d68b2f3eddfd4065e8b5f24052919 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 9 Mar 2023 22:15:46 +0200 Subject: [PATCH 123/426] Retry attempt to connect to pageserver in order to make pageserver restart transparent for clients (#3700) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …start transparent for clients ## Describe your changes Try to reestablish connection with pageserver if send is failed to be able to make pageserver restart transparent for client ## Issue ticket number and link https://github.com/neondatabase/neon/issues/1138 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--------- Co-authored-by: Heikki Linnakangas --- pgxn/neon/libpagestore.c | 77 ++++++++++++++----- .../regress/test_pageserver_restart.py | 18 ----- ...test_pageserver_restarts_under_workload.py | 35 +++++++++ 3 files changed, 94 insertions(+), 36 deletions(-) create mode 100644 test_runner/regress/test_pageserver_restarts_under_workload.py diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 88e3a12d96..3fe6d38251 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -32,6 +32,9 @@ #define PageStoreTrace DEBUG5 +#define MAX_RECONNECT_ATTEMPTS 5 +#define RECONNECT_INTERVAL_USEC 1000000 + bool connected = false; PGconn *pageserver_conn = NULL; @@ -52,8 +55,8 @@ int readahead_buffer_size = 128; static void pageserver_flush(void); -static void -pageserver_connect() +static bool +pageserver_connect(int elevel) { char *query; int ret; @@ -69,10 +72,11 @@ pageserver_connect() PQfinish(pageserver_conn); pageserver_conn = NULL; - ereport(ERROR, + ereport(elevel, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg(NEON_TAG "could not establish connection to pageserver"), errdetail_internal("%s", msg))); + return false; } query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); @@ -81,7 +85,8 @@ pageserver_connect() { PQfinish(pageserver_conn); pageserver_conn = NULL; - neon_log(ERROR, "could not send pagestream command to pageserver"); + neon_log(elevel, "could not send pagestream command to pageserver"); + return false; } pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3); @@ -113,8 +118,9 @@ pageserver_connect() FreeWaitEventSet(pageserver_conn_wes); pageserver_conn_wes = NULL; - neon_log(ERROR, "could not complete handshake with pageserver: %s", + neon_log(elevel, "could not complete handshake with pageserver: %s", msg); + return false; } } } @@ -122,6 +128,7 @@ pageserver_connect() neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); connected = true; + return true; } /* 
@@ -149,8 +156,11 @@ retry: if (event.events & WL_SOCKET_READABLE) { if (!PQconsumeInput(pageserver_conn)) - neon_log(ERROR, "could not get response from pageserver: %s", + { + neon_log(LOG, "could not get response from pageserver: %s", PQerrorMessage(pageserver_conn)); + return -1; + } } goto retry; @@ -190,31 +200,62 @@ static void pageserver_send(NeonRequest * request) { StringInfoData req_buff; + int n_reconnect_attempts = 0; /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) pageserver_disconnect(); - if (!connected) - pageserver_connect(); req_buff = nm_pack_request(request); /* - * Send request. - * - * In principle, this could block if the output buffer is full, and we - * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output and - * TCP buffer. + * If pageserver is stopped, the connections from compute node are broken. + * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query. + * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another. + * See https://github.com/neondatabase/neon/issues/1138 + * So try to reestablish connection in case of failure. */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + while (true) { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + if (!connected) + { + if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR)) + { + n_reconnect_attempts += 1; + pg_usleep(RECONNECT_INTERVAL_USEC); + continue; + } + } - pageserver_disconnect(); - neon_log(ERROR, "failed to send page request: %s", msg); + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. 
In + * practice, our requests are small enough to always fit in the output and + * TCP buffer. + */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + if (n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS) + { + neon_log(LOG, "failed to send page request (try to reconnect): %s", msg); + if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */ + pg_usleep(RECONNECT_INTERVAL_USEC); + n_reconnect_attempts += 1; + continue; + } + else + { + pageserver_disconnect(); + neon_log(ERROR, "failed to send page request: %s", msg); + } + } + break; } + pfree(req_buff.data); n_unflushed_requests++; diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 6388e979e5..453ddec0d4 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -45,14 +45,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - # Stopping the pageserver breaks the connection from the postgres backend to - # the page server, and causes the next query on the connection to fail. Start a new - # postgres connection too, to avoid that error. (Ideally, the compute node would - # handle that and retry internally, without propagating the error to the user, but - # currently it doesn't...) - pg_conn = pg.connect() - cur = pg_conn.cursor() - cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) @@ -70,8 +62,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): assert tenant_status["state"] == "Loading" # Try to read. This waits until the loading finishes, and then return normally. 
- pg_conn = pg.connect() - cur = pg_conn.cursor() cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) @@ -132,14 +122,6 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): env.pageserver.stop(immediate=True) env.pageserver.start() - # Stopping the pageserver breaks the connection from the postgres backend to - # the page server, and causes the next query on the connection to fail. Start a new - # postgres connection too, to avoid that error. (Ideally, the compute node would - # handle that and retry internally, without propagating the error to the user, but - # currently it doesn't...) - pg_conn = pg.connect() - cur = pg_conn.cursor() - # Check that all the updates are visible num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0] assert num_updates == i * 100000 diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py new file mode 100644 index 0000000000..28159778fe --- /dev/null +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -0,0 +1,35 @@ +# This test spawns pgbench in a thread in the background and concurrently restarts pageserver, +# checking how client is able to transparently restore connection to pageserver +# +import threading +import time + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres + + +# Test restarting page server, while safekeeper and compute node keep +# running. 
+def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgBin): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_restarts") + pg = env.postgres.create_start("test_pageserver_restarts") + n_restarts = 10 + scale = 10 + + def run_pgbench(pg: Postgres): + connstr = pg.connstr() + log.info(f"Start a pgbench workload on pg {connstr}") + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr]) + + thread = threading.Thread(target=run_pgbench, args=(pg,), daemon=True) + thread.start() + + for i in range(n_restarts): + # Stop the pageserver gracefully and restart it. + time.sleep(1) + env.pageserver.stop() + env.pageserver.start() + + thread.join() From d1a0f2f0eb8a567d9a867b1b480f147210efdd10 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 Mar 2023 00:37:11 +0400 Subject: [PATCH 124/426] Fix example why manual-range-contains is disabled. --- run_clippy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_clippy.sh b/run_clippy.sh index 0558541089..be07a0110a 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -13,7 +13,7 @@ # script that checks every feature. # # manual-range-contains wants -# !(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len) +# !(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len) # instead of # len < 4 || len > MAX_STARTUP_PACKET_LENGTH # , let's disagree. From 965837df53c945be91e914d7fadb032f4d9e4371 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 Mar 2023 00:09:28 +0400 Subject: [PATCH 125/426] Log connection ids in safekeeper instead of thread ids. Fixes build on macOS (which doesn't have nix gettid) after 0d8ced85341102. 
--- Cargo.lock | 1 - safekeeper/Cargo.toml | 1 - safekeeper/src/handler.rs | 6 +++++- safekeeper/src/lib.rs | 1 + safekeeper/src/receive_wal.rs | 11 +++++++---- safekeeper/src/wal_service.rs | 24 +++++++++++++++++++----- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b96f7dbc99..2e3ea2842d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3333,7 +3333,6 @@ dependencies = [ "humantime", "hyper", "metrics", - "nix", "once_cell", "parking_lot", "postgres", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 36ee15347d..8b0733832a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -19,7 +19,6 @@ git-version.workspace = true hex.workspace = true humantime.workspace = true hyper.workspace = true -nix.workspace = true once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 3e7bafbd2f..7d788fe3b9 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -8,6 +8,7 @@ use tracing::{info, info_span, Instrument}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; +use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::QueryError; use postgres_backend::{self, PostgresBackend}; @@ -28,6 +29,8 @@ pub struct SafekeeperPostgresHandler { pub tenant_id: Option, pub timeline_id: Option, pub ttid: TenantTimelineId, + /// Unique connection id is logged in spans for observability. 
+ pub conn_id: ConnectionId, claims: Option, } @@ -181,13 +184,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { } impl SafekeeperPostgresHandler { - pub fn new(conf: SafeKeeperConf) -> Self { + pub fn new(conf: SafeKeeperConf, conn_id: u32) -> Self { SafekeeperPostgresHandler { conf, appname: None, tenant_id: None, timeline_id: None, ttid: TenantTimelineId::empty(), + conn_id, claims: None, } } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 03df546a4d..f4e753cdbf 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,4 +1,5 @@ use remote_storage::RemoteStorageConfig; + use std::path::PathBuf; use std::time::Duration; use storage_broker::Uri; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 22c9871026..b7cf5a7310 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -7,10 +7,10 @@ use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; +use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; use bytes::BytesMut; -use nix::unistd::gettid; use postgres_backend::CopyStreamHandlerEnd; use postgres_backend::PostgresBackend; use postgres_backend::PostgresBackendReader; @@ -70,7 +70,7 @@ impl SafekeeperPostgresHandler { let peer_addr = *pgb.get_peer_addr(); let res = tokio::select! { // todo: add read|write .context to these errors - r = read_network(self.ttid, &mut pgb_reader, peer_addr, msg_tx, &mut acceptor_handle, msg_rx, reply_tx) => r, + r = read_network(self.ttid, self.conn_id, &mut pgb_reader, peer_addr, msg_tx, &mut acceptor_handle, msg_rx, reply_tx) => r, r = write_network(pgb, reply_rx) => r, }; @@ -119,6 +119,7 @@ async fn read_message( /// tell the error. 
async fn read_network( ttid: TenantTimelineId, + conn_id: ConnectionId, pgb_reader: &mut PostgresBackendReader, peer_addr: SocketAddr, msg_tx: Sender, @@ -151,7 +152,8 @@ async fn read_network( }; *acceptor_handle = Some( - WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx).context("spawn WalAcceptor thread")?, + WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, conn_id) + .context("spawn WalAcceptor thread")?, ); // Forward all messages to WalAcceptor @@ -205,6 +207,7 @@ impl WalAcceptor { tli: Arc, msg_rx: Receiver, reply_tx: Sender, + conn_id: ConnectionId, ) -> anyhow::Result>> { let thread_name = format!("WAL acceptor {}", tli.ttid); thread::Builder::new() @@ -223,7 +226,7 @@ impl WalAcceptor { let span_ttid = wa.tli.ttid; // satisfy borrow checker runtime.block_on( wa.run() - .instrument(info_span!("WAL acceptor", tid = %gettid(), ttid = %span_ttid)), + .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)), ) }) .map_err(anyhow::Error::from) diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 8d63d604ad..96f063d686 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -3,7 +3,6 @@ //! receive WAL from wal_proposer and send it to WAL receivers //! use anyhow::{Context, Result}; -use nix::unistd::gettid; use postgres_backend::QueryError; use std::{future, thread}; use tokio::net::TcpStream; @@ -27,17 +26,19 @@ pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) { // Tokio's from_std won't do this for us, per its comment. 
pg_listener.set_nonblocking(true)?; let listener = tokio::net::TcpListener::from_std(pg_listener)?; + let mut connection_count: ConnectionCount = 0; loop { match listener.accept().await { Ok((socket, peer_addr)) => { debug!("accepted connection from {}", peer_addr); let conf = conf.clone(); + let conn_id = issue_connection_id(&mut connection_count); let _ = thread::Builder::new() .name("WAL service thread".into()) .spawn(move || { - if let Err(err) = handle_socket(socket, conf) { + if let Err(err) = handle_socket(socket, conf, conn_id) { error!("connection handler exited: {}", err); } }) @@ -54,8 +55,12 @@ pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) { /// This is run by `thread_main` above, inside a background thread. /// -fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> { - let _enter = info_span!("", tid = %gettid()).entered(); +fn handle_socket( + socket: TcpStream, + conf: SafeKeeperConf, + conn_id: ConnectionId, +) -> Result<(), QueryError> { + let _enter = info_span!("", cid = %conn_id).entered(); let runtime = tokio::runtime::Builder::new_current_thread() .enable_all() @@ -68,7 +73,7 @@ fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryErr None => AuthType::Trust, Some(_) => AuthType::NeonJWT, }; - let mut conn_handler = SafekeeperPostgresHandler::new(conf); + let mut conn_handler = SafekeeperPostgresHandler::new(conf, conn_id); let pgbackend = PostgresBackend::new(socket, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. @@ -79,3 +84,12 @@ fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryErr Ok(()) } + +/// Unique WAL service connection ids are logged in spans for observability. 
+pub type ConnectionId = u32; +pub type ConnectionCount = u32; + +pub fn issue_connection_id(count: &mut ConnectionCount) -> ConnectionId { + *count = count.wrapping_add(1); + *count +} From 290884ea3b87fa7657d69465634e82897ee2b5d3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 Mar 2023 10:13:31 +0400 Subject: [PATCH 126/426] Fix too many arguments in read_network clippy complain. --- safekeeper/src/receive_wal.rs | 108 +++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 49 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index b7cf5a7310..0652ad0676 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -68,10 +68,17 @@ impl SafekeeperPostgresHandler { // sends, so this avoids deadlocks. let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; let peer_addr = *pgb.get_peer_addr(); + let network_reader = NetworkReader { + ttid: self.ttid, + conn_id: self.conn_id, + pgb_reader: &mut pgb_reader, + peer_addr, + acceptor_handle: &mut acceptor_handle, + }; let res = tokio::select! { // todo: add read|write .context to these errors - r = read_network(self.ttid, self.conn_id, &mut pgb_reader, peer_addr, msg_tx, &mut acceptor_handle, msg_rx, reply_tx) => r, - r = write_network(pgb, reply_rx) => r, + r = network_reader.run(msg_tx, msg_rx, reply_tx) => r, + r = network_write(pgb, reply_rx) => r, }; // Join pg backend back. @@ -104,6 +111,55 @@ impl SafekeeperPostgresHandler { } } +struct NetworkReader<'a> { + ttid: TenantTimelineId, + conn_id: ConnectionId, + pgb_reader: &'a mut PostgresBackendReader, + peer_addr: SocketAddr, + // WalAcceptor is spawned when we learn server info from walproposer and + // create timeline; handle is put here. 
+ acceptor_handle: &'a mut Option>>, +} + +impl<'a> NetworkReader<'a> { + async fn run( + self, + msg_tx: Sender, + msg_rx: Receiver, + reply_tx: Sender, + ) -> Result<(), CopyStreamHandlerEnd> { + // Receive information about server to create timeline, if not yet. + let next_msg = read_message(self.pgb_reader).await?; + let tli = match next_msg { + ProposerAcceptorMessage::Greeting(ref greeting) => { + info!( + "start handshake with walproposer {} sysid {} timeline {}", + self.peer_addr, greeting.system_id, greeting.tli, + ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? + } + _ => { + return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( + "unexpected message {next_msg:?} instead of greeting" + ))) + } + }; + + *self.acceptor_handle = Some( + WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id) + .context("spawn WalAcceptor thread")?, + ); + + // Forward all messages to WalAcceptor + read_network_loop(self.pgb_reader, msg_tx, next_msg).await + } +} + /// Read next message from walproposer. /// TODO: Return Ok(None) on graceful termination. async fn read_message( @@ -114,52 +170,6 @@ async fn read_message( Ok(msg) } -/// Read messages from socket and pass it to WalAcceptor thread. Returns Ok(()) -/// if msg_tx closed; it must mean WalAcceptor terminated, joining it should -/// tell the error. -async fn read_network( - ttid: TenantTimelineId, - conn_id: ConnectionId, - pgb_reader: &mut PostgresBackendReader, - peer_addr: SocketAddr, - msg_tx: Sender, - // WalAcceptor is spawned when we learn server info from walproposer and - // create timeline; handle is put here. - acceptor_handle: &mut Option>>, - msg_rx: Receiver, - reply_tx: Sender, -) -> Result<(), CopyStreamHandlerEnd> { - // Receive information about server to create timeline, if not yet. 
- let next_msg = read_message(pgb_reader).await?; - let tli = match next_msg { - ProposerAcceptorMessage::Greeting(ref greeting) => { - info!( - "start handshake with walproposer {} sysid {} timeline {}", - peer_addr, greeting.system_id, greeting.tli, - ); - let server_info = ServerInfo { - pg_version: greeting.pg_version, - system_id: greeting.system_id, - wal_seg_size: greeting.wal_seg_size, - }; - GlobalTimelines::create(ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? - } - _ => { - return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( - "unexpected message {next_msg:?} instead of greeting" - ))) - } - }; - - *acceptor_handle = Some( - WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, conn_id) - .context("spawn WalAcceptor thread")?, - ); - - // Forward all messages to WalAcceptor - read_network_loop(pgb_reader, msg_tx, next_msg).await -} - async fn read_network_loop( pgb_reader: &mut PostgresBackendReader, msg_tx: Sender, @@ -176,7 +186,7 @@ async fn read_network_loop( /// Read replies from WalAcceptor and pass them back to socket. Returns Ok(()) /// if reply_rx closed; it must mean WalAcceptor terminated, joining it should /// tell the error. -async fn write_network( +async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, ) -> Result<(), CopyStreamHandlerEnd> { From 3c4f5af1b928b0d2fe25de05ce90de4ec620aa5c Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Fri, 10 Mar 2023 11:11:39 +0100 Subject: [PATCH 127/426] Try depot.dev for image building (#3768) To see if it is faster. Run side-by-side for a while so we can gather enough data. 
--- .github/workflows/build_and_test.yml | 42 ++++++++++++++++++++++++++++ Dockerfile | 2 +- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d16d221cc4..d479201305 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -551,6 +551,48 @@ jobs: - name: Cleanup ECR folder run: rm -rf ~/.ecr + + neon-image-depot: + # For testing this will run side-by-side for a few merges. + # This action is not really optimized yet, but gets the job done + runs-on: [ self-hosted, gen3, small ] + needs: [ tag ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + permissions: + contents: read + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Setup go + uses: actions/setup-go@v3 + with: + go-version: '1.19' + + - name: Set up Depot CLI + uses: depot/setup-action@v1 + + - name: Install Crane & ECR helper + run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Build and push + uses: depot/build-push-action@v1 + with: + # if no depot.json file is at the root of your repo, you must specify the project id + project: nrdv0s4kcs + push: true + tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}} + compute-tools-image: runs-on: [ self-hosted, gen3, large ] needs: [ tag ] diff --git a/Dockerfile b/Dockerfile index 0d5ba73456..6f7d2c32a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY 
--from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server -COPY . . +COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. From 42ec79fb0df114ceb88fbaa8a4e4d48f37ab5a03 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 9 Mar 2023 15:20:26 +0200 Subject: [PATCH 128/426] Make expected test output nicer to read. By using Rust raw string literal. --- compute_tools/tests/pg_helpers_tests.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 431d9794bc..dc1bbbdc8a 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,28 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" + r#"fsync = off +wal_level = replica +hot_standby = on +neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' +wal_log_hints = on +log_connections = on +shared_buffers = 32768 +port = 55432 +max_connections = 100 +max_wal_senders = 10 +listen_addresses = '0.0.0.0' +wal_sender_timeout = 0 
+password_encryption = md5 +maintenance_work_mem = 65536 +max_parallel_workers = 8 +max_worker_processes = 8 +neon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8' +max_replication_slots = 10 +neon.timeline_id = '2414a61ffc94e428f14b5758fe308e13' +shared_preload_libraries = 'neon' +synchronous_standby_names = 'walproposer' +neon.pageserver_connstring = 'host=127.0.0.1 port=6400'"# ); } From 856d01ff68716e543fed90799ea83c05b81fd71a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 9 Mar 2023 15:28:53 +0200 Subject: [PATCH 129/426] Add newline at end of postgresql.conf --- compute_tools/src/pg_helpers.rs | 1 + compute_tools/tests/pg_helpers_tests.rs | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 6a1377b6aa..47f64f581d 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -109,6 +109,7 @@ impl PgOptionsSerialize for GenericOptions { .map(|op| op.to_pg_setting()) .collect::>() .join("\n") + + "\n" // newline after last setting } else { "".to_string() } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index dc1bbbdc8a..c92bb13668 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -49,7 +49,8 @@ max_replication_slots = 10 neon.timeline_id = '2414a61ffc94e428f14b5758fe308e13' shared_preload_libraries = 'neon' synchronous_standby_names = 'walproposer' -neon.pageserver_connstring = 'host=127.0.0.1 port=6400'"# +neon.pageserver_connstring = 'host=127.0.0.1 port=6400' +"# ); } From d1537a49fab1635766adccffeb764290b202361b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 10 Mar 2023 00:52:35 +0200 Subject: [PATCH 130/426] Fix escaping in postgresql.conf that we generate at compute startup If there are any config options that contain single quotes or backslashes, they need to be escaped --- compute_tools/src/pg_helpers.rs | 15 
+++++++++++++-- compute_tools/tests/cluster_spec.json | 5 +++++ compute_tools/tests/pg_helpers_tests.rs | 1 + 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 47f64f581d..79f851ed13 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -47,12 +47,23 @@ pub struct GenericOption { /// declare a `trait` on it. pub type GenericOptions = Option>; +/// Escape a string for including it in a SQL literal +fn escape_literal(s: &str) -> String { + s.replace('\'', "''").replace('\\', "\\\\") +} + +/// Escape a string so that it can be used in postgresql.conf. +/// Same as escape_literal, currently. +fn escape_conf_value(s: &str) -> String { + s.replace('\'', "''").replace('\\', "\\\\") +} + impl GenericOption { /// Represent `GenericOption` as SQL statement parameter. pub fn to_pg_option(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { - "string" => format!("{} '{}'", self.name, val), + "string" => format!("{} '{}'", self.name, escape_literal(val)), _ => format!("{} {}", self.name, val), } } else { @@ -73,7 +84,7 @@ impl GenericOption { }; match self.vartype.as_ref() { - "string" => format!("{} = '{}'", name, val), + "string" => format!("{} = '{}'", name, escape_conf_value(val)), _ => format!("{} = {}", name, val), } } else { diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index c29416d9c4..8f81e7b3bd 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -178,6 +178,11 @@ "name": "neon.pageserver_connstring", "value": "host=127.0.0.1 port=6400", "vartype": "string" + }, + { + "name": "test.escaping", + "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray", + "vartype": "string" } ] }, diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index c92bb13668..f48211f7ed 100644 --- 
a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -50,6 +50,7 @@ neon.timeline_id = '2414a61ffc94e428f14b5758fe308e13' shared_preload_libraries = 'neon' synchronous_standby_names = 'walproposer' neon.pageserver_connstring = 'host=127.0.0.1 port=6400' +test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hooray' "# ); } From b7fddfa70de2adb6d4b43acde49f6db22fc2350a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 7 Mar 2023 18:47:06 +0200 Subject: [PATCH 131/426] Add branch_id field to proxy_io_bytes_per_client metric. Since we allow switching endpoints between different branches, it is important to use composite key. Otherwise, we may try to calculate delta between metric values for two different branches. --- proxy/src/metrics.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 3b28346872..be22c45836 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -21,6 +21,7 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; #[derive(Eq, Hash, PartialEq, Serialize, Debug)] pub struct Ids { pub endpoint_id: String, + pub branch_id: String, } pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> { @@ -74,12 +75,23 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { .find(|l| l.get_name() == "endpoint_id") .unwrap() .get_value(); + let branch_id = ms + .get_label() + .iter() + .find(|l| l.get_name() == "branch_id") + .unwrap() + .get_value(); + let value = ms.get_counter().get_value() as u64; - debug!("endpoint_id:val - {}: {}", endpoint_id, value); + debug!( + "branch_id {} endpoint_id {} val: {}", + branch_id, endpoint_id, value + ); current_metrics.push(( Ids { endpoint_id: endpoint_id.to_string(), + branch_id: "".to_string(), }, (value, Utc::now()), )); @@ -131,6 +143,7 @@ async fn collect_metrics_iteration( value, extra: Ids { 
endpoint_id: curr_key.endpoint_id.clone(), + branch_id: curr_key.branch_id.clone(), }, }) }) @@ -172,6 +185,7 @@ async fn collect_metrics_iteration( cached_metrics .entry(Ids { endpoint_id: send_metric.extra.endpoint_id.clone(), + branch_id: send_metric.extra.branch_id.clone(), }) // update cached value (add delta) and time .and_modify(|e| { From 2ceef91da11905acf0dd9adef64bb9c877fad006 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 10 Mar 2023 10:49:41 +0100 Subject: [PATCH 132/426] Compile `pg_tiktoken` extension --- Dockerfile.compute-node | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 06c820009b..12cd4fbbd1 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -360,6 +360,20 @@ RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github. sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control +######################################################################################### +# +# Layer "pg-tiktoken-build" +# Compile "pg_tiktoken" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-tiktoken-pg-build + +RUN git clone --depth=1 --single-branch --branch neon_abi https://github.com/vadim2404/pg_tiktoken && \ + cd pg_tiktoken && \ + cargo pgx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -377,6 +391,7 @@ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-graphql-pg-build /usr/local/pgsql/ 
/usr/local/pgsql/ +COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ From bebf76c4618dca24a500af2da3b1860a5eec203b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 10 Mar 2023 12:06:17 +0200 Subject: [PATCH 133/426] Accept RS384 and RS512 JWT tokens. Previously, we only accepted RS256. Seems like a pointless limitation, and when I was testing it with RS512 tokens, it took me a while to understand why it wasn't working. --- libs/utils/src/auth.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 4fa85346ad..a57dbff5de 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -9,14 +9,26 @@ use std::path::Path; use anyhow::Result; use jsonwebtoken::{ - decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, + decode, encode, Algorithm, Algorithm::*, DecodingKey, EncodingKey, Header, TokenData, + Validation, }; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use crate::id::TenantId; -const JWT_ALGORITHM: Algorithm = Algorithm::RS256; +/// Algorithms accepted during validation. +/// +/// Accept all RSA-based algorithms. We pass this list to jsonwebtoken::decode, +/// which checks that the algorithm in the token is one of these. +/// +/// XXX: It also fails the validation if there are any algorithms in this list that belong +/// to different family than the token's algorithm. In other words, we can *not* list any +/// non-RSA algorithms here, or the validation always fails with InvalidAlgorithm error. 
+const ACCEPTED_ALGORITHMS: &[Algorithm] = &[RS256, RS384, RS512]; + +/// Algorithm to use when generating a new token in [`encode_from_key_file`] +const ENCODE_ALGORITHM: Algorithm = Algorithm::RS256; #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "lowercase")] @@ -33,6 +45,7 @@ pub enum Scope { SafekeeperData, } +/// JWT payload. See docs/authentication.md for the format #[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Claims { @@ -55,7 +68,8 @@ pub struct JwtAuth { impl JwtAuth { pub fn new(decoding_key: DecodingKey) -> Self { - let mut validation = Validation::new(JWT_ALGORITHM); + let mut validation = Validation::default(); + validation.algorithms = ACCEPTED_ALGORITHMS.into(); // The default 'required_spec_claims' is 'exp'. But we don't want to require // expiration. validation.required_spec_claims = [].into(); @@ -86,5 +100,5 @@ impl std::fmt::Debug for JwtAuth { // this function is used only for testing purposes in CLI e g generate tokens during init pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result { let key = EncodingKey::from_rsa_pem(key_data)?; - Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?) + Ok(encode(&Header::new(ENCODE_ALGORITHM), claims, &key)?) } From b00530df2abfd6d76cba9bf2d486252255a467f3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 10 Mar 2023 12:15:54 +0200 Subject: [PATCH 134/426] Add section in internal docs on the JWT payload. Just copied from the code comments. Could be improved, but this is a start. --- docs/authentication.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/authentication.md b/docs/authentication.md index e22d7b700f..1637519211 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -29,6 +29,41 @@ These components should not have access to the private key and may only get toke The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. 
by `neon_local init`. There is currently no way to rotate the key without bringing down all components. +### Token format + +The JWT tokens in Neon use RSA as the algorithm. Example: + +Header: + +``` +{ + "alg": "RS512", # RS256, RS384, or RS512 + "typ": "JWT" +} +``` + +Payload: + +``` +{ + "scope": "tenant", # "tenant", "pageserverapi", or "safekeeperdata" + "tenant_id": "5204921ff44f09de8094a1390a6a50f6", +} +``` + + +Meanings of scope: + +"tenant": Provides access to all data for a specific tenant + +"pageserverapi": Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs. +Should only be used e.g. for status check/tenant creation/list. + +"safekeeperdata": Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs. +Should only be used e.g. for status check. +Currently also used for connection from any pageserver to any safekeeper. + + ### CLI CLI generates a key pair during call to `neon_local init` with the following commands: From 34d3385b2e03d8d8a9c234b4319eafe365c118b3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 10 Mar 2023 14:08:29 +0200 Subject: [PATCH 135/426] Add unit tests for JWT encoding and decoding. --- libs/utils/src/auth.rs | 112 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index a57dbff5de..027950cb39 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -30,7 +30,7 @@ const ACCEPTED_ALGORITHMS: &[Algorithm] = &[RS256, RS384, RS512]; /// Algorithm to use when generating a new token in [`encode_from_key_file`] const ENCODE_ALGORITHM: Algorithm = Algorithm::RS256; -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Scope { // Provides access to all data for a specific tenant (specified in `struct Claims` below) @@ -47,7 +47,7 @@ pub enum Scope { /// JWT payload. 
See docs/authentication.md for the format #[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] pub struct Claims { #[serde(default)] #[serde_as(as = "Option")] @@ -102,3 +102,111 @@ pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result let key = EncodingKey::from_rsa_pem(key_data)?; Ok(encode(&Header::new(ENCODE_ALGORITHM), claims, &key)?) } + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + // generated with: + // + // openssl genpkey -algorithm rsa -out storage-auth-priv.pem + // openssl pkey -in storage-auth-priv.pem -pubout -out storage-auth-pub.pem + const TEST_PUB_KEY_RSA: &[u8] = br#" +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy6OZ+/kQXcueVJA/KTzO +v4ljxylc/Kcb0sXWuXg1GB8k3nDA1gK66LFYToH0aTnqrnqG32Vu6wrhwuvqsZA7 +jQvP0ZePAbWhpEqho7EpNunDPcxZ/XDy5TQlB1P58F9I3lkJXDC+DsHYLuuzwhAv +vo2MtWRdYlVHblCVLyZtANHhUMp2HUhgjHnJh5UrLIKOl4doCBxkM3rK0wjKsNCt +M92PCR6S9rvYzldfeAYFNppBkEQrXt2CgUqZ4KaS4LXtjTRUJxljijA4HWffhxsr +euRu3ufq8kVqie7fum0rdZZSkONmce0V0LesQ4aE2jB+2Sn48h6jb4dLXGWdq8TV +wQIDAQAB +-----END PUBLIC KEY----- +"#; + const TEST_PRIV_KEY_RSA: &[u8] = br#" +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDLo5n7+RBdy55U +kD8pPM6/iWPHKVz8pxvSxda5eDUYHyTecMDWArrosVhOgfRpOequeobfZW7rCuHC +6+qxkDuNC8/Rl48BtaGkSqGjsSk26cM9zFn9cPLlNCUHU/nwX0jeWQlcML4Owdgu +67PCEC++jYy1ZF1iVUduUJUvJm0A0eFQynYdSGCMecmHlSssgo6Xh2gIHGQzesrT +CMqw0K0z3Y8JHpL2u9jOV194BgU2mkGQRCte3YKBSpngppLgte2NNFQnGWOKMDgd +Z9+HGyt65G7e5+ryRWqJ7t+6bSt1llKQ42Zx7RXQt6xDhoTaMH7ZKfjyHqNvh0tc +ZZ2rxNXBAgMBAAECggEAVz3u4Wlx3o02dsoZlSQs+xf0PEX3RXKeU+1YMbtTG9Nz +6yxpIQaoZrpbt76rJE2gwkFR+PEu1NmjoOuLb6j4KlQuI4AHz1auOoGSwFtM6e66 +K4aZ4x95oEJ3vqz2fkmEIWYJwYpMUmwvnuJx76kZm0xvROMLsu4QHS2+zCVtO5Tr +hvS05IMVuZ2TdQBZw0+JaFdwXbgDjQnQGY5n9MoTWSx1a4s/FF4Eby65BbDutcpn +Vt3jQAOmO1X2kbPeWSGuPJRzyUs7Kg8qfeglBIR3ppGP3vPYAdWX+ho00bmsVkSp 
+Q8vjul6C3WiM+kjwDxotHSDgbl/xldAl7OqPh0bfAQKBgQDnycXuq14Vg8nZvyn9 +rTnvucO8RBz5P6G+FZ+44cAS2x79+85onARmMnm+9MKYLSMo8fOvsK034NDI68XM +04QQ/vlfouvFklMTGJIurgEImTZbGCmlMYCvFyIxaEWixon8OpeI4rFe4Hmbiijh +PxhxWg221AwvBS2sco8J/ylEkQKBgQDg6Rh2QYb/j0Wou1rJPbuy3NhHofd5Rq35 +4YV3f2lfVYcPrgRhwe3T9SVII7Dx8LfwzsX5TAlf48ESlI3Dzv40uOCDM+xdtBRI +r96SfSm+jup6gsXU3AsdNkrRK3HoOG9Z/TkrUp213QAIlVnvIx65l4ckFMlpnPJ0 +lo1LDXZWMQKBgFArzjZ7N5OhfdO+9zszC3MLgdRAivT7OWqR+CjujIz5FYMr8Xzl +WfAvTUTrS9Nu6VZkObFvHrrRG+YjBsuN7YQjbQXTSFGSBwH34bgbn2fl9pMTjHQC +50uoaL9GHa/rlBaV/YvvPQJgCi/uXa1rMX0jdNLkDULGO8IF7cu7Yf7BAoGBAIUU +J29BkpmAst0GDs/ogTlyR18LTR0rXyHt+UUd1MGeH859TwZw80JpWWf4BmkB4DTS +hH3gKePdJY7S65ci0XNsuRupC4DeXuorde0DtkGU2tUmr9wlX0Ynq9lcdYfMbMa4 +eK1TsxG69JwfkxlWlIWITWRiEFM3lJa7xlrUWmLhAoGAFpKWF/hn4zYg3seU9gai +EYHKSbhxA4mRb+F0/9IlCBPMCqFrL5yftUsYIh2XFKn8+QhO97Nmk8wJSK6TzQ5t +ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp +8ismApXVGHpOCstzikV9W7k= +-----END PRIVATE KEY----- +"#; + + #[test] + fn test_decode() -> Result<(), anyhow::Error> { + let expected_claims = Claims { + tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?), + scope: Scope::Tenant, + }; + + // Here are tokens containing the following payload, signed using TEST_PRIV_KEY_RSA + // using RS512, RS384 and RS256 algorithms: + // + // ``` + // { + // "scope": "tenant", + // "tenant_id": "3d1f7595b468230304e0b73cecbcb081", + // "iss": "neon.controlplane", + // "exp": 1709200879, + // "iat": 1678442479 + // } + // ``` + // + // These were encoded with the online debugger at https://jwt.io + // + let encoded_rs512 = 
"eyJhbGciOiJSUzUxMiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.QmqfteDQmDGoxQ5EFkasbt35Lx0W0Nh63muQnYZvFq93DSh4ZbOG9Mc4yaiXZoiS5HgeKtFKv3mbWkDqjz3En06aY17hWwguBtAsGASX48lYeCPADYGlGAuaWnOnVRwe3iiOC7tvPFvwX_45S84X73sNUXyUiXv6nLdcDqVXudtNrGST_DnZDnjuUJX11w7sebtKqQQ8l9-iGHiXOl5yevpMCoB1OcTWcT6DfDtffoNuMHDC3fyhmEGG5oKAt1qBybqAIiyC9-UBAowRZXhdfxrzUl-I9jzKWvk85c5ulhVRwbPeP6TTTlPKwFzBNHg1i2U-1GONew5osQ3aoptwsA"; + + let encoded_rs384 = "eyJhbGciOiJSUzM4NCIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.qqk4nkxKzOJP38c_g57_w_SfdQVmCsDT_bsLmdFj_N6LIB22gr6U6_P_5mvk3pIAsp0VCTDwPrCU908TxqjibEkwvQoJwbogHamSGHpD7eJBxGblSnA-Nr3MlEMxpFtec8QokSm6C5mH7DoBYjB2xzeOlxAmpR2GAzInKiMkU4kZ_OcqqrmVcMXY_6VnbxZWMekuw56zE1-PP_qNF1HvYOH-P08ONP8qdo5UPtBG7QBEFlCqZXJZCFihQaI4Vzil9rDuZGCm3I7xQJ8-yh1PX3BTbGo8EzqLdRyBeTpr08UTuRbp_MJDWevHpP3afvJetAItqZXIoZQrbJjcByHqKw"; + + let encoded_rs256 = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.dF2N9KXG8ftFKHYbd5jQtXMQqv0Ej8FISGp1b_dmqOCotXj5S1y2AWjwyB_EXHM77JXfbEoJPAPrFFBNfd8cWtkCSTvpxWoHaecGzegDFGv5ZSc5AECFV1Daahc3PI3jii9wEiGkFOiwiBNfZ5INomOAsV--XXxlqIwKbTcgSYI7lrOTfecXAbAHiMKQlQYiIBSGnytRCgafhRkyGzPAL8ismthFJ9RHfeejyskht-9GbVHURw02bUyijuHEulpf9eEY3ZiB28de6jnCdU7ftIYaUMaYWt0nZQGkzxKPSfSLZNy14DTOYLDS04DVstWQPqnCUW_ojg0wJETOOfo9Zw"; + + // Check that RS512, RS384 and RS256 tokens can all be validated + let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?); + + for encoded in [encoded_rs512, encoded_rs384, encoded_rs256] { + let claims_from_token = auth.decode(encoded)?.claims; + assert_eq!(claims_from_token, 
expected_claims); + } + Ok(()) + } + + #[test] + fn test_encode() -> Result<(), anyhow::Error> { + let claims = Claims { + tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?), + scope: Scope::Tenant, + }; + + let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_RSA)?; + + // decode it back + let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?); + let decoded = auth.decode(&encoded)?; + + assert_eq!(decoded.claims, claims); + + Ok(()) + } +} From 252b3685a27a0f4c31a0f91e983c6314838e89e8 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 10 Mar 2023 12:39:35 +0200 Subject: [PATCH 136/426] Use `unsafe-postgres` feature to build pgx extension Recently added `unsafe-postgres` feature allows to build pgx extensions against postgres forks that decided to change their ABI name (like us). With that we can build extensions without forking them and using stock pgx. As this feature is new few manual version bumps were required. --- Dockerfile.compute-node | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 12cd4fbbd1..fc65448323 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -323,7 +323,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --git https://github.com/vadim2404/pgx --branch neon_abi_v0.6.1 --locked cargo-pgx && \ + cargo install --locked --version 0.7.3 cargo-pgx && \ /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -337,11 +337,11 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build -RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \ - cd pg_jsonschema && \ +# there is no release tag yet, but we need it due to 
the superuser fix in the control file +RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgx install --release && \ - # it's needed to enable extension because it uses untrusted C language - sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_jsonschema.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control ######################################################################################### @@ -353,8 +353,13 @@ RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github. FROM rust-extensions-build AS pg-graphql-pg-build -RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \ - cd pg_graphql && \ +# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in +# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the +# same 1.1 version we've used before. +RUN git clone -b remove-pgx-contrib-spiext --single-branch https://github.com/yrashk/pg_graphql && \ + cd pg_graphql && \ + sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \ cargo pgx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -369,7 +374,7 @@ RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github. 
FROM rust-extensions-build AS pg-tiktoken-pg-build -RUN git clone --depth=1 --single-branch --branch neon_abi https://github.com/vadim2404/pg_tiktoken && \ +RUN git clone --depth=1 --single-branch https://github.com/kelvich/pg_tiktoken && \ cd pg_tiktoken && \ cargo pgx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control From 1401021b21c6338951b6c8eb2d3d2c4c5729918f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 10 Mar 2023 18:00:20 +0100 Subject: [PATCH 137/426] Be able to get number of CPUs (#3774) After enabling autoscaling, we faced the issue that customers are not able to get the number of CPUs they use at this moment. Therefore I've added these two options: 1. Postgresql function to allow customers to call it whenever they want 2. `compute_ctl` endpoint to show these number in console --- Cargo.lock | 1 + Cargo.toml | 1 + Dockerfile.compute-node | 4 +++ Makefile | 8 ++++++ compute_tools/Cargo.toml | 1 + compute_tools/src/http/api.rs | 12 ++++++++ compute_tools/src/http/openapi_spec.yaml | 24 ++++++++++++++++ pgxn/neon_utils/Makefile | 15 ++++++++++ pgxn/neon_utils/neon_utils--1.0.sql | 6 ++++ pgxn/neon_utils/neon_utils.c | 35 ++++++++++++++++++++++++ pgxn/neon_utils/neon_utils.control | 6 ++++ 11 files changed, 113 insertions(+) create mode 100644 pgxn/neon_utils/Makefile create mode 100644 pgxn/neon_utils/neon_utils--1.0.sql create mode 100644 pgxn/neon_utils/neon_utils.c create mode 100644 pgxn/neon_utils/neon_utils.control diff --git a/Cargo.lock b/Cargo.lock index 2e3ea2842d..9721d487c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -851,6 +851,7 @@ dependencies = [ "futures", "hyper", "notify", + "num_cpus", "opentelemetry", "postgres", "regex", diff --git a/Cargo.toml b/Cargo.toml index bbd4975603..e27a50a1cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,6 +64,7 @@ md5 = "0.7.0" memoffset = "0.8" nix = "0.26" notify = "5.0.0" +num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" 
opentelemetry = "0.18.0" diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index fc65448323..ef861b15be 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -409,6 +409,10 @@ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon_utils \ -s install ######################################################################################### diff --git a/Makefile b/Makefile index e04a82c7c9..9d78c5d0fc 100644 --- a/Makefile +++ b/Makefile @@ -133,6 +133,11 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install + +@echo "Compiling neon_utils $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: @@ -145,6 +150,9 @@ neon-pg-ext-clean-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 46b0e80896..59433535f1 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,6 +11,7 @@ clap.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true +num_cpus.workspace = true 
opentelemetry.workspace = true postgres.workspace = true regex.workspace = true diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 2392863303..199e0f3bd0 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -7,6 +7,7 @@ use crate::compute::ComputeNode; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use num_cpus; use serde_json; use tracing::{error, info}; use tracing_utils::http::OtelName; @@ -49,6 +50,17 @@ async fn routes(req: Request, compute: &Arc) -> Response { + let num_cpus = num_cpus::get_physical(); + info!("serving /info GET request. num_cpus: {}", num_cpus); + Response::new(Body::from( + serde_json::json!({ + "num_cpus": num_cpus, + }) + .to_string(), + )) + } + // Return the `404 Not Found` for any other routes. _ => { let mut not_found = Response::new(Body::from("404 Not Found")); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 3a8e9fc1dc..5c74dfd2d2 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -53,6 +53,21 @@ paths: schema: $ref: "#/components/schemas/ComputeInsights" + /info: + get: + tags: + - "info" + summary: Get info about the compute Pod/VM + description: "" + operationId: getInfo + responses: + "200": + description: Info + content: + application/json: + schema: + $ref: "#/components/schemas/Info" + /check_writability: post: tags: @@ -96,6 +111,15 @@ components: total_startup_ms: type: integer + Info: + type: object + description: Information about VM/Pod + required: + - num_cpus + properties: + num_cpus: + type: integer + ComputeState: type: object required: diff --git a/pgxn/neon_utils/Makefile b/pgxn/neon_utils/Makefile new file mode 100644 index 0000000000..852a437713 --- /dev/null +++ b/pgxn/neon_utils/Makefile @@ -0,0 +1,15 @@ +# pgxs/neon_utils/Makefile + + +MODULE_big = 
neon_utils +OBJS = \ + $(WIN32RES) \ + neon_utils.o + +EXTENSION = neon_utils +DATA = neon_utils--1.0.sql +PGFILEDESC = "neon_utils - small useful functions" + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon_utils/neon_utils--1.0.sql b/pgxn/neon_utils/neon_utils--1.0.sql new file mode 100644 index 0000000000..d4652e91ad --- /dev/null +++ b/pgxn/neon_utils/neon_utils--1.0.sql @@ -0,0 +1,6 @@ +CREATE FUNCTION num_cpus() +RETURNS int +AS 'MODULE_PATHNAME', 'num_cpus' +LANGUAGE C STRICT +PARALLEL UNSAFE +VOLATILE; diff --git a/pgxn/neon_utils/neon_utils.c b/pgxn/neon_utils/neon_utils.c new file mode 100644 index 0000000000..8b9dfa24f4 --- /dev/null +++ b/pgxn/neon_utils/neon_utils.c @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * neon_utils.c + * neon_utils - small useful functions + * + * IDENTIFICATION + * contrib/neon_utils/neon_utils.c + * + *------------------------------------------------------------------------- + */ +#ifdef _WIN32 +#include +#else +#include +#endif + +#include "postgres.h" +#include "fmgr.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(num_cpus); + +Datum +num_cpus(PG_FUNCTION_ARGS) +{ +#ifdef _WIN32 + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + uint32 num_cpus = (uint32) sysinfo.dwNumberOfProcessors; +#else + uint32 num_cpus = (uint32) sysconf(_SC_NPROCESSORS_ONLN); +#endif + PG_RETURN_UINT32(num_cpus); +} diff --git a/pgxn/neon_utils/neon_utils.control b/pgxn/neon_utils/neon_utils.control new file mode 100644 index 0000000000..ff402efb31 --- /dev/null +++ b/pgxn/neon_utils/neon_utils.control @@ -0,0 +1,6 @@ +# neon_utils extension +comment = 'neon_utils - small useful functions' +default_version = '1.0' +module_pathname = '$libdir/neon_utils' +relocatable = true +trusted = true From ce8fbbd910eabc1adf1982164f97bd8e2b108f8f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 10 Mar 2023 19:44:12 +0200 Subject: [PATCH 138/426] 
Fix allowed error again (#3790) Fixes #3360 again, this time checking all other "Error processing HTTP request" messages and aligning the regex with the two others. --- test_runner/regress/test_remote_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 24db80c7cc..dd0b576c5e 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -619,7 +619,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( log.info("sending delete request") checkpoint_allowed_to_fail.set() env.pageserver.allowed_errors.append( - ".+ERROR Error processing HTTP request: InternalServerError\\(timeline is Stopping" + ".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping" ) client.timeline_delete(tenant_id, timeline_id) From 8699342249b1af38a30bd497f95794d3a865b608 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 13 Mar 2023 09:26:49 +0200 Subject: [PATCH 139/426] Ondemand rx bytes and layer count (#3777) Adds two new *global* metrics: - pageserver_remote_ondemand_downloaded_layers_total - pageserver_remote_ondemand_downloaded_bytes_total An existing test is repurposed once more to check that we do get some reasonable counts. These are to replace guessing from the nic RX bytes metric how much was on-demand downloaded. First part of #3745: This does not add the "(un)?avoidable" metric, which I plan to add as a new metric, which will be a subset of the counts of the metrics added here. 
--- pageserver/src/metrics.rs | 16 ++++++++++++ .../src/tenant/remote_timeline_client.rs | 11 +++++--- test_runner/regress/test_ondemand_download.py | 25 +++++++++++++++++-- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9d3d11eba8..4826a0b7ae 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -123,6 +123,22 @@ static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_remote_ondemand_downloaded_layers_total", + "Total on-demand downloaded layers" + ) + .unwrap() +}); + +pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_remote_ondemand_downloaded_bytes_total", + "Total bytes of layers on-demand downloaded", + ) + .unwrap() +}); + static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_current_logical_size", diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 7049a0bd66..f3943298f2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -218,9 +218,10 @@ use tracing::{debug, info, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; -use crate::metrics::RemoteOpFileKind; -use crate::metrics::RemoteOpKind; -use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics}; +use crate::metrics::{ + MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, + REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, +}; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::{ config::PageServerConf, @@ -446,6 +447,10 @@ impl RemoteTimelineClient { ); } } + + REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc(); + 
REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size); + Ok(downloaded_size) } diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index e6c580c37c..12088c3353 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -4,7 +4,7 @@ import time from collections import defaultdict from pathlib import Path -from typing import Any, DefaultDict, Dict +from typing import Any, DefaultDict, Dict, Tuple import pytest from fixtures.log_helper import log @@ -497,6 +497,17 @@ def test_compaction_downloads_on_demand_without_image_creation( # pitr_interval and gc_horizon are not interesting because we dont run gc } + def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]: + m = pageserver_http.get_metrics() + # these are global counters + total_bytes = m.query_one("pageserver_remote_ondemand_downloaded_bytes_total").value + assert ( + total_bytes < 2**53 and total_bytes.is_integer() + ), "bytes should still be safe integer-in-f64" + count = m.query_one("pageserver_remote_ondemand_downloaded_layers_total").value + assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64" + return (int(total_bytes), int(count)) + # Override defaults, to create more layers tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) env.initial_tenant = tenant_id @@ -517,10 +528,14 @@ def test_compaction_downloads_on_demand_without_image_creation( layers = pageserver_http.layer_map_info(tenant_id, timeline_id) assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" - assert len(layers.historic_layers) == 1 + 2, "should have inidb layer and 2 deltas" + assert len(layers.historic_layers) == 1 + 2, "should have initdb layer and 2 deltas" + + layer_sizes = 0 for layer in layers.historic_layers: log.info(f"pre-compact: {layer}") + assert layer.layer_file_size is not None, "we 
must know layer file sizes" + layer_sizes += layer.layer_file_size pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"}) @@ -531,6 +546,12 @@ def test_compaction_downloads_on_demand_without_image_creation( log.info(f"post compact: {layer}") assert len(layers.historic_layers) == 1, "should have compacted to single layer" + post_compact = downloaded_bytes_and_count(pageserver_http) + + # use gte to allow pageserver to do other random stuff; this test could be run on a shared pageserver + assert post_compact[0] >= layer_sizes + assert post_compact[1] >= 3, "should had downloaded the three layers" + @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) def test_compaction_downloads_on_demand_with_image_creation( From d9a1329834ce7e666e51db2c87fde66488bd0d73 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 13 Mar 2023 12:18:10 +0300 Subject: [PATCH 140/426] Make postgres_backend use generic IO type (#3789) - Support measuring inbound and outbound traffic in MeasuredStream - Start using MeasuredStream in safekeepers code --- Cargo.lock | 1 + libs/postgres_backend/src/lib.rs | 83 ++++++++++++-------- libs/postgres_backend/tests/simple_select.rs | 5 +- libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 2 + libs/utils/src/measured_stream.rs | 77 ++++++++++++++++++ pageserver/src/page_service.rs | 19 ++--- proxy/src/console/mgmt.rs | 8 +- proxy/src/proxy.rs | 27 ++++--- proxy/src/stream.rs | 65 --------------- safekeeper/src/handler.rs | 15 ++-- safekeeper/src/json_ctrl.rs | 5 +- safekeeper/src/metrics.rs | 10 ++- safekeeper/src/receive_wal.rs | 28 ++++--- safekeeper/src/send_wal.rs | 22 +++--- safekeeper/src/wal_service.rs | 20 ++++- 16 files changed, 234 insertions(+), 154 deletions(-) create mode 100644 libs/utils/src/measured_stream.rs diff --git a/Cargo.lock b/Cargo.lock index 9721d487c8..17aacd8ee7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ 
-4532,6 +4532,7 @@ dependencies = [ "metrics", "nix", "once_cell", + "pin-project-lite", "rand", "routerify", "sentry", diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index ce46899779..4d88b958f0 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -59,14 +59,14 @@ pub fn is_expected_io_error(e: &io::Error) -> bool { } #[async_trait::async_trait] -pub trait Handler { +pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). It will also flush out the output buffer. async fn process_query( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError>; @@ -77,7 +77,7 @@ pub trait Handler { /// to override whole init logic in implementations. fn startup( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { Ok(()) @@ -86,7 +86,7 @@ pub trait Handler { /// Check auth jwt fn check_auth_jwt( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackend, _jwt_response: &[u8], ) -> Result<(), QueryError> { Err(QueryError::Other(anyhow::anyhow!("JWT auth failed"))) @@ -115,12 +115,12 @@ pub enum ProcessMsgResult { } /// Either plain TCP stream or encrypted one, implementing AsyncRead + AsyncWrite. 
-pub enum MaybeTlsStream { - Unencrypted(tokio::net::TcpStream), - Tls(Box>), +pub enum MaybeTlsStream { + Unencrypted(IO), + Tls(Box>), } -impl AsyncWrite for MaybeTlsStream { +impl AsyncWrite for MaybeTlsStream { fn poll_write( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -147,7 +147,7 @@ impl AsyncWrite for MaybeTlsStream { } } } -impl AsyncRead for MaybeTlsStream { +impl AsyncRead for MaybeTlsStream { fn poll_read( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -192,13 +192,13 @@ impl fmt::Display for AuthType { /// PostgresBackend after call to `split`. In principle we could always store a /// pair of splitted handles, but that would force to to pay splitting price /// (Arc and kinda mutex inside polling) for all uses (e.g. pageserver). -enum MaybeWriteOnly { - Full(Framed), - WriteOnly(FramedWriter), +enum MaybeWriteOnly { + Full(Framed>), + WriteOnly(FramedWriter>), Broken, // temporary value palmed off during the split } -impl MaybeWriteOnly { +impl MaybeWriteOnly { async fn read_startup_message(&mut self) -> Result, ConnectionError> { match self { MaybeWriteOnly::Full(framed) => framed.read_startup_message().await, @@ -244,8 +244,8 @@ impl MaybeWriteOnly { } } -pub struct PostgresBackend { - framed: MaybeWriteOnly, +pub struct PostgresBackend { + framed: MaybeWriteOnly, pub state: ProtoState, @@ -255,6 +255,8 @@ pub struct PostgresBackend { pub tls_config: Option>, } +pub type PostgresBackendTCP = PostgresBackend; + pub fn query_from_cstring(query_string: Bytes) -> Vec { let mut query_string = query_string.to_vec(); if let Some(ch) = query_string.last() { @@ -271,7 +273,7 @@ fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { std::str::from_utf8(without_null).map_err(|e| e.into()) } -impl PostgresBackend { +impl PostgresBackend { pub fn new( socket: tokio::net::TcpStream, auth_type: AuthType, @@ -288,6 +290,25 @@ impl PostgresBackend { peer_addr, }) } +} + +impl PostgresBackend { + pub fn new_from_io( + socket: IO, + peer_addr: 
SocketAddr, + auth_type: AuthType, + tls_config: Option>, + ) -> io::Result { + let stream = MaybeTlsStream::Unencrypted(socket); + + Ok(Self { + framed: MaybeWriteOnly::Full(Framed::new(stream)), + state: ProtoState::Initialization, + auth_type, + tls_config, + peer_addr, + }) + } pub fn get_peer_addr(&self) -> &SocketAddr { &self.peer_addr @@ -346,14 +367,14 @@ impl PostgresBackend { /// to it in CopyData messages, and writes them to the connection /// /// The caller is responsible for sending CopyOutResponse and CopyDone messages. - pub fn copyout_writer(&mut self) -> CopyDataWriter { + pub fn copyout_writer(&mut self) -> CopyDataWriter { CopyDataWriter { pgb: self } } /// Wrapper for run_message_loop() that shuts down socket when we are done pub async fn run( mut self, - handler: &mut impl Handler, + handler: &mut impl Handler, shutdown_watcher: F, ) -> Result<(), QueryError> where @@ -369,7 +390,7 @@ impl PostgresBackend { async fn run_message_loop( &mut self, - handler: &mut impl Handler, + handler: &mut impl Handler, shutdown_watcher: F, ) -> Result<(), QueryError> where @@ -426,9 +447,9 @@ impl PostgresBackend { /// Try to upgrade MaybeTlsStream into actual TLS one, performing handshake. async fn tls_upgrade( - src: MaybeTlsStream, + src: MaybeTlsStream, tls_config: Arc, - ) -> anyhow::Result { + ) -> anyhow::Result> { match src { MaybeTlsStream::Unencrypted(s) => { let acceptor = TlsAcceptor::from(tls_config); @@ -466,7 +487,7 @@ impl PostgresBackend { /// Split off owned read part from which messages can be read in different /// task/thread. - pub fn split(&mut self) -> anyhow::Result { + pub fn split(&mut self) -> anyhow::Result> { // temporary replace stream with fake to cook split one, Indiana Jones style match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { MaybeWriteOnly::Full(framed) => { @@ -482,7 +503,7 @@ impl PostgresBackend { } /// Join read part back. 
- pub fn unsplit(&mut self, reader: PostgresBackendReader) -> anyhow::Result<()> { + pub fn unsplit(&mut self, reader: PostgresBackendReader) -> anyhow::Result<()> { // temporary replace stream with fake to cook joined one, Indiana Jones style match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) { MaybeWriteOnly::Full(_) => { @@ -499,7 +520,7 @@ impl PostgresBackend { /// Perform handshake with the client, transitioning to Established. /// In case of EOF during handshake logs this, sets state to Closed and returns Ok(()). - async fn handshake(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { + async fn handshake(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> { while self.state < ProtoState::Authentication { match self.framed.read_startup_message().await? { Some(msg) => { @@ -565,7 +586,7 @@ impl PostgresBackend { /// actual startup packet. async fn process_startup_message( &mut self, - handler: &mut impl Handler, + handler: &mut impl Handler, msg: FeStartupPacket, ) -> Result<(), QueryError> { assert!(self.state < ProtoState::Authentication); @@ -629,7 +650,7 @@ impl PostgresBackend { async fn process_message( &mut self, - handler: &mut impl Handler, + handler: &mut impl Handler, msg: FeMessage, unnamed_query_string: &mut Bytes, ) -> Result { @@ -776,9 +797,9 @@ impl PostgresBackend { } } -pub struct PostgresBackendReader(FramedReader); +pub struct PostgresBackendReader(FramedReader>); -impl PostgresBackendReader { +impl PostgresBackendReader { /// Read full message or return None if connection is cleanly closed with no /// unprocessed data. pub async fn read_message(&mut self) -> Result, ConnectionError> { @@ -812,11 +833,11 @@ impl PostgresBackendReader { /// messages. 
/// -pub struct CopyDataWriter<'a> { - pgb: &'a mut PostgresBackend, +pub struct CopyDataWriter<'a, IO> { + pgb: &'a mut PostgresBackend, } -impl<'a> AsyncWrite for CopyDataWriter<'a> { +impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> { fn poll_write( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index a310171c70..e046fa5260 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -4,6 +4,7 @@ use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use std::io::Cursor; use std::{future, sync::Arc}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; use tokio_postgres::tls::MakeTlsConnect; @@ -22,11 +23,11 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) { struct TestHandler {} #[async_trait::async_trait] -impl Handler for TestHandler { +impl Handler for TestHandler { // return single col 'hey' for any query async fn process_query( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, _query_string: &str, ) -> Result<(), QueryError> { pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index b24de57f99..b9f67e82f8 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -18,6 +18,7 @@ futures = { workspace = true} jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true +pin-project-lite.workspace = true routerify.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index acb5273943..766d759ab4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -49,6 +49,8 @@ pub mod fs_ext; pub mod history_buffer; +pub mod 
measured_stream; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/measured_stream.rs b/libs/utils/src/measured_stream.rs new file mode 100644 index 0000000000..c37d686a1d --- /dev/null +++ b/libs/utils/src/measured_stream.rs @@ -0,0 +1,77 @@ +use pin_project_lite::pin_project; +use std::pin::Pin; +use std::{io, task}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +pin_project! { + /// This stream tracks all writes and calls user provided + /// callback when the underlying stream is flushed. + pub struct MeasuredStream { + #[pin] + stream: S, + write_count: usize, + inc_read_count: R, + inc_write_count: W, + } +} + +impl MeasuredStream { + pub fn new(stream: S, inc_read_count: R, inc_write_count: W) -> Self { + Self { + stream, + write_count: 0, + inc_read_count, + inc_write_count, + } + } +} + +impl AsyncRead for MeasuredStream { + fn poll_read( + self: Pin<&mut Self>, + context: &mut task::Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> task::Poll> { + let this = self.project(); + let filled = buf.filled().len(); + this.stream.poll_read(context, buf).map_ok(|()| { + let cnt = buf.filled().len() - filled; + // Increment the read count. + (this.inc_read_count)(cnt); + }) + } +} + +impl AsyncWrite for MeasuredStream { + fn poll_write( + self: Pin<&mut Self>, + context: &mut task::Context<'_>, + buf: &[u8], + ) -> task::Poll> { + let this = self.project(); + this.stream.poll_write(context, buf).map_ok(|cnt| { + // Increment the write count. + *this.write_count += cnt; + cnt + }) + } + + fn poll_flush( + self: Pin<&mut Self>, + context: &mut task::Context<'_>, + ) -> task::Poll> { + let this = self.project(); + this.stream.poll_flush(context).map_ok(|()| { + // Call the user provided callback and reset the write count. 
+ (this.inc_write_count)(*this.write_count); + *this.write_count = 0; + }) + } + + fn poll_shutdown( + self: Pin<&mut Self>, + context: &mut task::Context<'_>, + ) -> task::Poll> { + self.project().stream.poll_shutdown(context) + } +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 40e11a70b7..dc9bf955f7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -20,6 +20,7 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; +use postgres_backend::PostgresBackendTCP; use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; @@ -54,7 +55,7 @@ use crate::trace::Tracer; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { +fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream> + '_ { async_stream::try_stream! { loop { let msg = tokio::select! 
{ @@ -288,7 +289,7 @@ impl PageServerHandler { #[instrument(skip(self, pgb, ctx))] async fn handle_pagerequests( &self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackendTCP, tenant_id: TenantId, timeline_id: TimelineId, ctx: RequestContext, @@ -392,7 +393,7 @@ impl PageServerHandler { #[instrument(skip(self, pgb, ctx))] async fn handle_import_basebackup( &self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackendTCP, tenant_id: TenantId, timeline_id: TimelineId, base_lsn: Lsn, @@ -448,7 +449,7 @@ impl PageServerHandler { #[instrument(skip(self, pgb, ctx))] async fn handle_import_wal( &self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackendTCP, tenant_id: TenantId, timeline_id: TimelineId, start_lsn: Lsn, @@ -659,7 +660,7 @@ impl PageServerHandler { #[instrument(skip(self, pgb, ctx))] async fn handle_basebackup_request( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackendTCP, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, @@ -723,10 +724,10 @@ impl PageServerHandler { } #[async_trait::async_trait] -impl postgres_backend::Handler for PageServerHandler { +impl postgres_backend::Handler for PageServerHandler { fn check_auth_jwt( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackendTCP, jwt_response: &[u8], ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT @@ -754,7 +755,7 @@ impl postgres_backend::Handler for PageServerHandler { fn startup( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackendTCP, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { Ok(()) @@ -762,7 +763,7 @@ impl postgres_backend::Handler for PageServerHandler { async fn process_query( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackendTCP, query_string: &str, ) -> Result<(), QueryError> { let ctx = self.connection_ctx.attached_child(); diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 8f3bc72407..30364be6f4 
100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -4,7 +4,7 @@ use crate::{ }; use anyhow::Context; use once_cell::sync::Lazy; -use postgres_backend::{self, AuthType, PostgresBackend, QueryError}; +use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use std::future; use tokio::net::{TcpListener, TcpStream}; @@ -71,10 +71,10 @@ pub type ComputeReady = Result; // TODO: replace with an http-based protocol. struct MgmtHandler; #[async_trait::async_trait] -impl postgres_backend::Handler for MgmtHandler { +impl postgres_backend::Handler for MgmtHandler { async fn process_query( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackendTCP, query: &str, ) -> Result<(), QueryError> { try_process_query(pgb, query).await.map_err(|e| { @@ -84,7 +84,7 @@ impl postgres_backend::Handler for MgmtHandler { } } -async fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> { +async fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 9642047812..abeff6a33b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -8,7 +8,7 @@ use crate::{ config::{ProxyConfig, TlsConfig}, console::{self, messages::MetricsAuxInfo}, error::io_error, - stream::{MeasuredStream, PqStream, Stream}, + stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -18,6 +18,7 @@ use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, warn}; +use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. 
const NUM_RETRIES_WAKE_COMPUTE: usize = 1; @@ -353,16 +354,24 @@ async fn proxy_pass( aux: &MetricsAuxInfo, ) -> anyhow::Result<()> { let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx")); - let mut client = MeasuredStream::new(client, |cnt| { - // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); - }); + let mut client = MeasuredStream::new( + client, + |_| {}, + |cnt| { + // Number of bytes we sent to the client (outbound). + m_sent.inc_by(cnt as u64); + }, + ); let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("rx")); - let mut compute = MeasuredStream::new(compute, |cnt| { - // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); - }); + let mut compute = MeasuredStream::new( + compute, + |_| {}, + |cnt| { + // Number of bytes the client sent to the compute node (inbound). + m_recv.inc_by(cnt as u64); + }, + ); // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 5a802dafb2..9dfc435e39 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -217,68 +217,3 @@ impl AsyncWrite for Stream { } } } - -pin_project! { - /// This stream tracks all writes and calls user provided - /// callback when the underlying stream is flushed. 
- pub struct MeasuredStream { - #[pin] - stream: S, - write_count: usize, - inc_write_count: W, - } -} - -impl MeasuredStream { - pub fn new(stream: S, inc_write_count: W) -> Self { - Self { - stream, - write_count: 0, - inc_write_count, - } - } -} - -impl AsyncRead for MeasuredStream { - fn poll_read( - self: Pin<&mut Self>, - context: &mut task::Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> task::Poll> { - self.project().stream.poll_read(context, buf) - } -} - -impl AsyncWrite for MeasuredStream { - fn poll_write( - self: Pin<&mut Self>, - context: &mut task::Context<'_>, - buf: &[u8], - ) -> task::Poll> { - let this = self.project(); - this.stream.poll_write(context, buf).map_ok(|cnt| { - // Increment the write count. - *this.write_count += cnt; - cnt - }) - } - - fn poll_flush( - self: Pin<&mut Self>, - context: &mut task::Context<'_>, - ) -> task::Poll> { - let this = self.project(); - this.stream.poll_flush(context).map_ok(|()| { - // Call the user provided callback and reset the write count. 
- (this.inc_write_count)(*this.write_count); - *this.write_count = 0; - }) - } - - fn poll_shutdown( - self: Pin<&mut Self>, - context: &mut task::Context<'_>, - ) -> task::Poll> { - self.project().stream.poll_shutdown(context) - } -} diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 7d788fe3b9..3b8434b2de 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -3,6 +3,7 @@ use anyhow::Context; use std::str; +use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span, Instrument}; use crate::auth::check_permission; @@ -67,11 +68,13 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { } #[async_trait::async_trait] -impl postgres_backend::Handler for SafekeeperPostgresHandler { +impl postgres_backend::Handler + for SafekeeperPostgresHandler +{ // tenant_id and timeline_id are passed in connection string params fn startup( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackend, sm: &FeStartupPacket, ) -> Result<(), QueryError> { if let FeStartupPacket::StartupMessage { params, .. 
} = sm { @@ -110,7 +113,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { fn check_auth_jwt( &mut self, - _pgb: &mut PostgresBackend, + _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT @@ -139,7 +142,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { async fn process_query( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { if query_string @@ -216,9 +219,9 @@ impl SafekeeperPostgresHandler { /// /// Handle IDENTIFY_SYSTEM replication command /// - async fn handle_identify_system( + async fn handle_identify_system( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 769ec8d409..2841cd195f 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -12,6 +12,7 @@ use anyhow::Context; use bytes::Bytes; use postgres_backend::QueryError; use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; use tracing::*; use utils::id::TenantTimelineId; @@ -60,9 +61,9 @@ struct AppendResult { /// Handles command to craft logical message WAL record with given /// content, and then append it with specified term and lsn. This /// function is used to test safekeepers in different scenarios. 
-pub async fn handle_json_ctrl( +pub async fn handle_json_ctrl( spg: &SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, append_request: &AppendLogicalMessage, ) -> Result<(), QueryError> { info!("JSON_CTRL request: {append_request:?}"); diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index b21770686c..16aca24927 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -7,7 +7,7 @@ use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, proto::MetricFamily, - Gauge, IntGaugeVec, + register_int_counter_vec, Gauge, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; @@ -61,6 +61,14 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") }); +pub static PG_IO_BYTES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_pg_io_bytes", + "Bytes read from or written to any PostgreSQL connection", + &["direction"] + ) + .expect("Failed to register safekeeper_pg_io_bytes gauge") +}); /// Metrics for WalStorage in a single timeline. #[derive(Clone, Default)] diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 0652ad0676..61e4c5f0fa 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -20,6 +20,8 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; use std::thread::JoinHandle; +use tokio::io::AsyncRead; +use tokio::io::AsyncWrite; use tokio::sync::mpsc::channel; use tokio::sync::mpsc::error::TryRecvError; use tokio::sync::mpsc::Receiver; @@ -36,9 +38,9 @@ impl SafekeeperPostgresHandler { /// Wrapper around handle_start_wal_push_guts handling result. Error is /// handled here while we're still in walreceiver ttid span; with API /// extension, this can probably be moved into postgres_backend. 
- pub async fn handle_start_wal_push( + pub async fn handle_start_wal_push( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { if let Err(end) = self.handle_start_wal_push_guts(pgb).await { // Log the result and probably send it to the client, closing the stream. @@ -47,9 +49,9 @@ impl SafekeeperPostgresHandler { Ok(()) } - pub async fn handle_start_wal_push_guts( + pub async fn handle_start_wal_push_guts( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -111,17 +113,17 @@ impl SafekeeperPostgresHandler { } } -struct NetworkReader<'a> { +struct NetworkReader<'a, IO> { ttid: TenantTimelineId, conn_id: ConnectionId, - pgb_reader: &'a mut PostgresBackendReader, + pgb_reader: &'a mut PostgresBackendReader, peer_addr: SocketAddr, // WalAcceptor is spawned when we learn server info from walproposer and // create timeline; handle is put here. acceptor_handle: &'a mut Option>>, } -impl<'a> NetworkReader<'a> { +impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { async fn run( self, msg_tx: Sender, @@ -162,16 +164,16 @@ impl<'a> NetworkReader<'a> { /// Read next message from walproposer. /// TODO: Return Ok(None) on graceful termination. 
-async fn read_message( - pgb_reader: &mut PostgresBackendReader, +async fn read_message( + pgb_reader: &mut PostgresBackendReader, ) -> Result { let copy_data = pgb_reader.read_copy_message().await?; let msg = ProposerAcceptorMessage::parse(copy_data)?; Ok(msg) } -async fn read_network_loop( - pgb_reader: &mut PostgresBackendReader, +async fn read_network_loop( + pgb_reader: &mut PostgresBackendReader, msg_tx: Sender, mut next_msg: ProposerAcceptorMessage, ) -> Result<(), CopyStreamHandlerEnd> { @@ -186,8 +188,8 @@ async fn read_network_loop( /// Read replies from WalAcceptor and pass them back to socket. Returns Ok(()) /// if reply_rx closed; it must mean WalAcceptor terminated, joining it should /// tell the error. -async fn network_write( - pgb_writer: &mut PostgresBackend, +async fn network_write( + pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index e8c1b4c02e..b533e87c5b 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -13,6 +13,8 @@ use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; + use std::cmp::min; use std::str; use std::sync::Arc; @@ -74,9 +76,9 @@ impl SafekeeperPostgresHandler { /// Wrapper around handle_start_replication_guts handling result. Error is /// handled here while we're still in walsender ttid span; with API /// extension, this can probably be moved into postgres_backend. 
- pub async fn handle_start_replication( + pub async fn handle_start_replication( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, start_pos: Lsn, ) -> Result<(), QueryError> { if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await { @@ -86,9 +88,9 @@ impl SafekeeperPostgresHandler { Ok(()) } - pub async fn handle_start_replication_guts( + pub async fn handle_start_replication_guts( &mut self, - pgb: &mut PostgresBackend, + pgb: &mut PostgresBackend, start_pos: Lsn, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); @@ -176,8 +178,8 @@ impl SafekeeperPostgresHandler { } /// A half driving sending WAL. -struct WalSender<'a> { - pgb: &'a mut PostgresBackend, +struct WalSender<'a, IO> { + pgb: &'a mut PostgresBackend, tli: Arc, appname: Option, // Position since which we are sending next chunk. @@ -194,7 +196,7 @@ struct WalSender<'a> { send_buf: [u8; MAX_SEND_SIZE], } -impl WalSender<'_> { +impl WalSender<'_, IO> { /// Send WAL until /// - an error occurs /// - if we are streaming to walproposer, we've streamed until stop_pos @@ -282,14 +284,14 @@ impl WalSender<'_> { } /// A half driving receiving replies. 
-struct ReplyReader { - reader: PostgresBackendReader, +struct ReplyReader { + reader: PostgresBackendReader, tli: Arc, replica_id: usize, feedback: ReplicaState, } -impl ReplyReader { +impl ReplyReader { async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> { loop { let msg = self.reader.read_copy_message().await?; diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 96f063d686..5f58c4f7fc 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -7,9 +7,10 @@ use postgres_backend::QueryError; use std::{future, thread}; use tokio::net::TcpStream; use tracing::*; +use utils::measured_stream::MeasuredStream; -use crate::handler::SafekeeperPostgresHandler; use crate::SafeKeeperConf; +use crate::{handler::SafekeeperPostgresHandler, metrics::PG_IO_BYTES}; use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. @@ -67,14 +68,29 @@ fn handle_socket( .build()?; let local = tokio::task::LocalSet::new(); + let read_metrics = PG_IO_BYTES.with_label_values(&["read"]); + let write_metrics = PG_IO_BYTES.with_label_values(&["write"]); + socket.set_nodelay(true)?; + let peer_addr = socket.peer_addr()?; + + // TODO: measure cross-az traffic + let socket = MeasuredStream::new( + socket, + |cnt| { + read_metrics.inc_by(cnt as u64); + }, + |cnt| { + write_metrics.inc_by(cnt as u64); + }, + ); let auth_type = match conf.auth { None => AuthType::Trust, Some(_) => AuthType::NeonJWT, }; let mut conn_handler = SafekeeperPostgresHandler::new(conf, conn_id); - let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. 
local.block_on( From e0ee138a8bca404b6c418ca5e8ff2ccccab0d2ed Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 Mar 2023 11:11:01 +0200 Subject: [PATCH 141/426] Add a test for tokio-postgres client to the driver test suite. It is fully supported. To enable TLS, though, it requires some extra glue code, and a dependency to a TLS library. --- .../rust/tokio-postgres/.dockerignore | 1 + .../pg_clients/rust/tokio-postgres/.gitignore | 1 + .../pg_clients/rust/tokio-postgres/Cargo.lock | 1006 +++++++++++++++++ .../pg_clients/rust/tokio-postgres/Cargo.toml | 17 + .../pg_clients/rust/tokio-postgres/Dockerfile | 6 + .../rust/tokio-postgres/src/main.rs | 43 + test_runner/pg_clients/test_pg_clients.py | 1 + 7 files changed, 1075 insertions(+) create mode 100644 test_runner/pg_clients/rust/tokio-postgres/.dockerignore create mode 100644 test_runner/pg_clients/rust/tokio-postgres/.gitignore create mode 100644 test_runner/pg_clients/rust/tokio-postgres/Cargo.lock create mode 100644 test_runner/pg_clients/rust/tokio-postgres/Cargo.toml create mode 100644 test_runner/pg_clients/rust/tokio-postgres/Dockerfile create mode 100644 test_runner/pg_clients/rust/tokio-postgres/src/main.rs diff --git a/test_runner/pg_clients/rust/tokio-postgres/.dockerignore b/test_runner/pg_clients/rust/tokio-postgres/.dockerignore new file mode 100644 index 0000000000..2f7896d1d1 --- /dev/null +++ b/test_runner/pg_clients/rust/tokio-postgres/.dockerignore @@ -0,0 +1 @@ +target/ diff --git a/test_runner/pg_clients/rust/tokio-postgres/.gitignore b/test_runner/pg_clients/rust/tokio-postgres/.gitignore new file mode 100644 index 0000000000..2f7896d1d1 --- /dev/null +++ b/test_runner/pg_clients/rust/tokio-postgres/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock new file mode 100644 index 0000000000..96989ee5ee --- /dev/null +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ 
-0,0 +1,1006 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "async-trait" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "095183a3539c7c7649b2beb87c2d3f0591f3a7fed07761cc546d244e27e0238c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "block-buffer" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +dependencies = [ + "generic-array", +] + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 
+ +[[package]] +name = "core-foundation" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" + +[[package]] +name = "cpufeatures" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fastrand" +version = "1.9.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "futures" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" + +[[package]] +name = "futures-executor" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" + +[[package]] +name = "futures-macro" +version = "0.3.26" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" + +[[package]] +name = "futures-task" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" + +[[package]] +name = "futures-util" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "io-lifetimes" 
+version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" +dependencies = [ + "libc", + "windows-sys 0.45.0", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.139" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" + +[[package]] +name = "linux-raw-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" + +[[package]] +name = "lock_api" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "md-5" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +dependencies = [ + "digest", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "mio" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.45.0", +] + +[[package]] 
+name = "native-tls" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "once_cell" +version = "1.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" + +[[package]] +name = "openssl" +version = "0.10.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b102428fd03bc5edf97f62620f7298614c45cedf287c271e7ed450bbaf83f2e1" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23bbbf7854cd45b83958ebe919f0e8e516793727652e27fda10a8384cfc790b7" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-sys 0.45.0", +] + +[[package]] +name = "percent-encoding" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" + +[[package]] +name = "phf" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + +[[package]] +name = "postgres-native-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d442770e2b1e244bb5eb03b31c79b65bb2568f413b899eaba850fa945a65954" +dependencies = [ + "futures", + "native-tls", + "tokio", + "tokio-native-tls", + "tokio-postgres", +] + +[[package]] +name = "postgres-protocol" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"878c6cbf956e03af9aa8204b407b9cbf47c072164800aa918c516cd4b056c50c" +dependencies = [ + "base64", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "md-5", + "memchr", + "rand", + "sha2", + "stringprep", +] + +[[package]] +name = "postgres-types" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73d946ec7d256b04dfadc4e6a3292324e6f417124750fc5c0950f981b703a0f1" +dependencies = [ + "bytes", + "fallible-iterator", + "postgres-protocol", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "proc-macro2" +version = "1.0.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rust-neon-example" +version = "0.1.0" +dependencies = [ + "native-tls", + "postgres-native-tls", + "tokio", + "tokio-postgres", +] + +[[package]] +name = "rustix" +version = "0.36.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys 0.45.0", +] + +[[package]] +name = "schannel" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3" +dependencies = [ + "windows-sys 0.42.0", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "security-framework" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c9bb296072e961fcbd8853511dd39c2d8be2deb1e17c6860b1d30732b323b4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "sha2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "siphasher" +version = "0.3.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" + +[[package]] +name = "slab" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "stringprep" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "subtle" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys 0.42.0", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03201d01c3c27a29c8a5cee5b55a93ddae1ccf6f08f65365c2c918f8c1b76f64" +dependencies = [ + "autocfg", + "bytes", + "libc", + "memchr", + "mio", + "pin-project-lite", + "socket2", + "tokio-macros", + "windows-sys 0.45.0", +] + +[[package]] +name = "tokio-macros" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-postgres" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29a12c1b3e0704ae7dfc25562629798b29c72e6b1d0a681b6f29ab4ae5e7f7bf" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures-channel", + "futures-util", + "log", + "parking_lot", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol", + "postgres-types", + "socket2", + "tokio", + "tokio-util", +] + +[[package]] +name = "tokio-util" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", + "tracing", +] + +[[package]] 
+name = "tracing" +version = "0.1.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +dependencies = [ + "cfg-if", + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + +[[package]] +name = "unicode-bidi" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" + +[[package]] +name = "unicode-ident" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "775c11906edafc97bc378816b94585fbd9a054eabaf86fdd0ced94af449efab7" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" 
+version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" + +[[package]] +name = "windows_aarch64_msvc" 
+version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml new file mode 100644 index 0000000000..f1b519bdb2 --- /dev/null +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "rust-neon-example" +version = "0.1.0" +edition = "2021" +publish = false + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +native-tls = "0.2.11" +postgres-native-tls = "0.5.0" +tokio = { version = "1.26", features=["rt", "macros"] } +tokio-postgres = "0.7.7" + + +# This is not part of the main 'neon' workspace +[workspace] diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile 
b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile new file mode 100644 index 0000000000..c3a15b5d85 --- /dev/null +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -0,0 +1,6 @@ +FROM rust:1.67 +WORKDIR /source + +COPY . . +RUN cargo build +CMD ["/source/target/debug/rust-neon-example"] diff --git a/test_runner/pg_clients/rust/tokio-postgres/src/main.rs b/test_runner/pg_clients/rust/tokio-postgres/src/main.rs new file mode 100644 index 0000000000..6ed82276e4 --- /dev/null +++ b/test_runner/pg_clients/rust/tokio-postgres/src/main.rs @@ -0,0 +1,43 @@ +use std::env::VarError; +use tokio_postgres; + +fn get_env(key: &str) -> String { + match std::env::var(key) { + Ok(val) => val, + Err(VarError::NotPresent) => panic!("{key} env variable not set"), + Err(VarError::NotUnicode(_)) => panic!("{key} is not valid unicode"), + } +} + +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<(), tokio_postgres::Error> { + let host = get_env("NEON_HOST"); + let database = get_env("NEON_DATABASE"); + let user = get_env("NEON_USER"); + let password = get_env("NEON_PASSWORD"); + + let url = format!("postgresql://{user}:{password}@{host}/{database}"); + + // Use the native TLS implementation (Neon requires TLS) + let tls_connector = + postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + + // Connect to the database. + let (client, connection) = tokio_postgres::connect(&url, tls_connector).await?; + + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let result = client.query("SELECT 1", &[]).await?; + + let value: i32 = result[0].get(0); + assert_eq!(value, 1); + println!("{value}"); + + Ok(()) +} diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index 5a8da56680..f44fdd1724 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -13,6 +13,7 @@ from fixtures.utils import subprocess_capture [ "csharp/npgsql", "java/jdbc", + "rust/tokio-postgres", "python/asyncpg", "python/pg8000", pytest.param( From 07dcf679ded0f1bb77e5a6e1d124630d8f39b2b7 Mon Sep 17 00:00:00 2001 From: Nikita Kalyanov <44959448+nikitakalyanov@users.noreply.github.com> Date: Mon, 13 Mar 2023 14:00:01 +0200 Subject: [PATCH 142/426] set content type explicitly (#3799) I moved management API v2 to ogen and the generated code seems to be more strict about content type. Let's set it properly as it is json after all ## Describe your changes ## Issue ticket number and link ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- .github/ansible/deploy.yaml | 4 ++-- .github/ansible/scripts/init_pageserver.sh | 2 +- .github/ansible/scripts/init_safekeeper.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index a17dc9c78f..0243e91f37 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -118,7 +118,7 @@ cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version - curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers tags: - pageserver @@ -188,6 +188,6 @@ cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version - curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers tags: - safekeeper diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh index e89fc5e667..e7d6efadae 100644 --- a/.github/ansible/scripts/init_pageserver.sh +++ b/.github/ansible/scripts/init_pageserver.sh @@ -26,7 +26,7 @@ EOF if ! 
curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/${INSTANCE_ID} -o /dev/null; then # not registered, so register it now - ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id') + ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id') # init pageserver sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data diff --git a/.github/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh index 28d61b6223..fc23584712 100644 --- a/.github/ansible/scripts/init_safekeeper.sh +++ b/.github/ansible/scripts/init_safekeeper.sh @@ -25,7 +25,7 @@ EOF if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/${INSTANCE_ID} -o /dev/null; then # not registered, so register it now - ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id') + ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id') # init safekeeper sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data fi From f0573f5991d99214e45f55308bf6b873394ae8e4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 13 Mar 2023 14:07:35 +0200 Subject: [PATCH 143/426] Remove block cursor cache (#3740) ## Describe your changes Do not pin current block in BlockCursor ## Issue ticket number and link See #3712 There are places (see get_reconstruct_data) in our code when thread is 
holding the read `layers` lock and then tries to read a file and so locks a page cache slot. So we have an edge in the dependency graph: layers->page cache slot. At the same time (as Christian noticed) we can lock a page cache slot in BlockCursor and then try to obtain the shared lock on layers. So there is a backward edge in the dependency graph, page cache slot->layers, which forms a loop and may cause a deadlock. There are three possible fixes of the problem: 1. Perform compaction under the `layers` shared lock. See PR #3732. It fixes the problem but makes it impossible to append any data to the pageserver until compaction is completed. 2. Do not hold the `layers` lock while accessing layers (not sure if it is possible to do because it definitely introduces some new race conditions). 3. Do not pin current pages in BlockCursor (this PR). My experiments show that this cache in BlockCursor is not so useful: the number of hits/misses for the cursor cache on the pgbench workload (-i -s 10/-c 10 -T 100/-c 10 -S -T 100): ``` hits: 163011 misses: 1023602 ``` So the number of cache misses is roughly 6x larger. And the results for read-only pgbench are mostly the same: ``` with cache: 14581 w/out cache: 14429 ``` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? If so, did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- pageserver/src/tenant/block_io.rs | 38 ++----------------------- pageserver/src/tenant/ephemeral_file.rs | 5 +--- 2 files changed, 4 insertions(+), 39 deletions(-) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index e3cc800447..10de34e3f6 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -51,9 +51,6 @@ where /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// -/// A cursor caches the last accessed page, allowing for faster access if the -/// same block is accessed repeatedly. -/// /// You can access the last page with `*cursor`. 'read_blk' returns 'self', so /// that in many cases you can use a BlockCursor as a drop-in replacement for /// the underlying BlockReader. For example: @@ -73,8 +70,6 @@ where R: BlockReader, { reader: R, - /// last accessed page - cache: Option<(u32, R::BlockLease)>, } impl BlockCursor @@ -82,40 +77,13 @@ where R: BlockReader, { pub fn new(reader: R) -> Self { - BlockCursor { - reader, - cache: None, - } + BlockCursor { reader } } - pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> { - // Fast return if this is the same block as before - if let Some((cached_blk, _buf)) = &self.cache { - if *cached_blk == blknum { - return Ok(self); - } - } - - // Read the block from the underlying reader, and cache it - self.cache = None; - let buf = self.reader.read_blk(blknum)?; - self.cache = Some((blknum, buf)); - - Ok(self) + pub fn read_blk(&mut self, blknum: u32) -> Result { + self.reader.read_blk(blknum) } } - -impl Deref for BlockCursor -where - R: BlockReader, -{ - type Target = [u8; PAGE_SZ]; - - fn deref(&self) -> &::Target { - &self.cache.as_ref().unwrap().1 - } -} - static NEXT_ID: AtomicU64 = AtomicU64::new(1); /// An adapter for reading a (virtual) file using the page cache. 
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index c433e65ad2..4379438896 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -2,9 +2,7 @@ //! used to keep in-memory layers spilled on disk. use crate::config::PageServerConf; -use crate::page_cache; -use crate::page_cache::PAGE_SZ; -use crate::page_cache::{ReadBufResult, WriteBufResult}; +use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::BlockReader; use crate::virtual_file::VirtualFile; @@ -427,7 +425,6 @@ mod tests { let actual = cursor.read_blob(pos)?; assert_eq!(actual, expected); } - drop(cursor); // Test a large blob that spans multiple pages let mut large_data = Vec::new(); From daeaa767c405532f0c8bdb8a5765f0c13fd83aee Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 10 Mar 2023 11:15:32 +0100 Subject: [PATCH 144/426] Add `neondatabase/release` team as a default reviewers for storage releases --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4bce9cdd1e..014084c410 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,3 +31,4 @@ jobs: head: releases/${{ steps.date.outputs.date }} base: release title: Release ${{ steps.date.outputs.date }} + team_reviewers: release From 582620274abeccdfa748a214b7ec30f796e7731c Mon Sep 17 00:00:00 2001 From: sharnoff Date: Mon, 13 Mar 2023 07:16:39 -0700 Subject: [PATCH 145/426] Enable file cache handling by vm-informant (#3794) Enables the VM informant's file cache integration. 
See also: https://github.com/neondatabase/autoscaling/pull/47 --- Dockerfile.vm-compute-node | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node index dff40485de..faea311708 100644 --- a/Dockerfile.vm-compute-node +++ b/Dockerfile.vm-compute-node @@ -1,7 +1,7 @@ # Note: this file *mostly* just builds on Dockerfile.compute-node ARG SRC_IMAGE -ARG VM_INFORMANT_VERSION=v0.1.6 +ARG VM_INFORMANT_VERSION=v0.1.14 # Pull VM informant and set up inittab FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant @@ -11,7 +11,9 @@ RUN set -e \ && touch /etc/inittab RUN set -e \ - && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart'" >> /etc/inittab + && CONNSTR="dbname=neondb user=cloud_admin sslmode=disable" \ + && ARGS="--auto-restart --pgconnstr=\"$CONNSTR\"" \ + && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab # Combine, starting from non-VM compute node image. FROM $SRC_IMAGE as base From d7ab69f303e51ca7f8669fb7952d13f3234606c8 Mon Sep 17 00:00:00 2001 From: andres Date: Sun, 6 Nov 2022 19:15:18 +0100 Subject: [PATCH 146/426] add test for getting branchpoints from an inactive timeline --- pageserver/src/tenant.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 9e9c98ad62..aa77196f2c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3176,6 +3176,35 @@ mod tests { } */ + #[tokio::test] + async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { + let tenant = + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; + tline.set_state(TimelineState::Paused); + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant + .get_timeline(NEW_TIMELINE_ID, true) + .expect("Should have a local timeline"); + + make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false).await?; + + assert_eq!( + newtline.get(*TEST_KEY, Lsn(0x50))?, + TEST_IMG(&format!("foo at {}", Lsn(0x40)))); + + let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; + assert_eq!(branchpoints.len(), 1); + assert_eq!(branchpoints[0], Lsn(0x40)); + + Ok(()) + } + #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { let (tenant, ctx) = From 50476a7cc77b375e3aa793ed93155715dfd1241e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 13 Mar 2023 14:30:43 +0200 Subject: [PATCH 147/426] test: update to match current interfaces --- pageserver/src/tenant.rs | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index aa77196f2c..6a373ad520 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3178,30 +3178,39 @@ mod tests { #[tokio::test] async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> { - let tenant = - TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?.load(); + let (tenant, ctx) = + TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")? + .load() + .await; let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; - tline.set_state(TimelineState::Paused); + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; - tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false).await?; - + + tline.set_state(TimelineState::Broken); + + tenant + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) + .await?; + assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50))?, - TEST_IMG(&format!("foo at {}", Lsn(0x40)))); + newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, + TEST_IMG(&format!("foo at {}", Lsn(0x40))) + ); let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); assert_eq!(branchpoints[0], Lsn(0x40)); - + Ok(()) } From 15ed6af5f286b7e082b3ba44e26e6e66fa71d04e Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 10 Mar 2023 23:10:55 +0300 Subject: [PATCH 148/426] Add descriptions to proxy's python tests. --- test_runner/regress/test_proxy.py | 45 +++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 99a3f2fa86..51fabdd2a1 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -4,10 +4,20 @@ from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres def test_proxy_select_1(static_proxy: NeonProxy): - static_proxy.safe_psql("select 1", options="project=generic-project-name") + """ + A simplest smoke test: check proxy against a local postgres instance. 
+ """ + + out = static_proxy.safe_psql("select 1", options="project=generic-project-name") + assert out[0][0] == 1 def test_password_hack(static_proxy: NeonProxy): + """ + Check the PasswordHack auth flow: an alternative to SCRAM auth for + clients which can't provide the project/endpoint name via SNI or `options`. + """ + user = "borat" password = "password" static_proxy.safe_psql( @@ -25,7 +35,11 @@ def test_password_hack(static_proxy: NeonProxy): @pytest.mark.asyncio -async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy): +async def test_link_auth(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy): + """ + Check the Link auth flow: a lightweight auth method which delegates + all necessary checks to the console by sending client an auth URL. + """ psql = await PSQL(host=link_proxy.host, port=link_proxy.proxy_port).run("select 42") @@ -40,16 +54,27 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx assert out == "42" -# Pass extra options to the server. def test_proxy_options(static_proxy: NeonProxy): - with static_proxy.connect(options="project=irrelevant -cproxytest.option=value") as conn: - with conn.cursor() as cur: - cur.execute("SHOW proxytest.option") - value = cur.fetchall()[0][0] - assert value == "value" + """ + Check that we pass extra `options` to the PostgreSQL server: + * `project=...` shouldn't be passed at all (otherwise postgres will raise an error). + * everything else should be passed as-is. + """ + + options = "project=irrelevant -cproxytest.option=value" + out = static_proxy.safe_psql("show proxytest.option", options=options) + assert out[0][0] == "value" + + options = "-c proxytest.foo=\\ str project=irrelevant" + out = static_proxy.safe_psql("show proxytest.foo", options=options) + assert out[0][0] == " str" def test_auth_errors(static_proxy: NeonProxy): + """ + Check that we throw very specific errors in some unsuccessful auth scenarios. 
+ """ + # User does not exist with pytest.raises(psycopg2.Error) as exprinfo: static_proxy.connect(user="pinocchio", options="project=irrelevant") @@ -78,6 +103,10 @@ def test_auth_errors(static_proxy: NeonProxy): def test_forward_params_to_client(static_proxy: NeonProxy): + """ + Check that we forward all necessary PostgreSQL server params to client. + """ + # A subset of parameters (GUCs) which postgres # sends to the client during connection setup. # Unfortunately, `GUC_REPORT` can't be queried. From 2e4bf7cee4e43a868ad810c364efd9ea50889c4f Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 10 Mar 2023 23:51:06 +0300 Subject: [PATCH 149/426] [proxy] Immediately log all compute node connection errors. --- proxy/src/compute.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 2e12d9ee26..b5efc72803 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,9 +6,9 @@ use std::{io, net::SocketAddr}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; -use tracing::{error, info}; +use tracing::{error, info, warn}; -const COULD_NOT_CONNECT: &str = "Could not connect to compute node"; +const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] pub enum ConnectionError { @@ -131,7 +131,7 @@ impl ConnCfg { use tokio_postgres::config::Host; let connect_once = |host, port| { - info!("trying to connect to a compute node at {host}:{port}"); + info!("trying to connect to compute node at {host}:{port}"); TcpStream::connect((host, port)).and_then(|socket| async { let socket_addr = socket.peer_addr()?; // This prevents load balancer from severing the connection. 
@@ -151,7 +151,7 @@ impl ConnCfg { return Err(io::Error::new( io::ErrorKind::Other, format!( - "couldn't connect: bad compute config, \ + "bad compute config, \ ports and hosts entries' count does not match: {:?}", self.0 ), @@ -170,7 +170,7 @@ impl ConnCfg { Ok(socket) => return Ok(socket), Err(err) => { // We can't throw an error here, as there might be more hosts to try. - error!("failed to connect to a compute node at {host}:{port}: {err}"); + warn!("couldn't connect to compute node at {host}:{port}: {err}"); connection_error = Some(err); } } @@ -179,7 +179,7 @@ impl ConnCfg { Err(connection_error.unwrap_or_else(|| { io::Error::new( io::ErrorKind::Other, - format!("couldn't connect: bad compute config: {:?}", self.0), + format!("bad compute config: {:?}", self.0), ) })) } @@ -195,12 +195,11 @@ pub struct PostgresConnection { } impl ConnCfg { - /// Connect to a corresponding compute node. - pub async fn connect(&self) -> Result { + async fn do_connect(&self) -> Result { // TODO: establish a secure connection to the DB. let (socket_addr, mut stream) = self.connect_raw().await?; let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?; - info!("connected to user's compute node at {socket_addr}"); + info!("connected to compute node at {socket_addr}"); // This is very ugly but as of now there's no better way to // extract the connection parameters from tokio-postgres' connection. @@ -219,6 +218,16 @@ impl ConnCfg { Ok(connection) } + + /// Connect to a corresponding compute node. + pub async fn connect(&self) -> Result { + self.do_connect() + .inspect_err(|err| { + // Immediately log the error we have at our disposal. + error!("couldn't connect to compute node: {err}"); + }) + .await + } } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. 
From 319402fc746355a8e24603e6a9e9f06d2ca99c57 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 14 Mar 2023 11:25:48 +0100 Subject: [PATCH 150/426] postgres_ffi: restore POSTGRES_INSTALL_DIR support (#3811) Fix path construction to `pg_config`: `pg_install_dir_versioned` already includes `pg_version` --- libs/postgres_ffi/build.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 25ff398bbd..66221af522 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -63,10 +63,7 @@ fn main() -> anyhow::Result<()> { pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); } - let pg_config_bin = pg_install_dir_versioned - .join(pg_version) - .join("bin") - .join("pg_config"); + let pg_config_bin = pg_install_dir_versioned.join("bin").join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") From d6bb8caad46e3438eb339e9d17e3f70cf3cf9db7 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 14 Mar 2023 13:18:26 +0200 Subject: [PATCH 151/426] refactor: correct return value for not found L0's on LayerMap::replace (#3805) in prev implementation, an `ok_or_else(...)?` is used to cause a "precondition error" on LayerMap::replace, however we only see this particular error if an L0 for which replace fails is not in the layermap because it is not in `l0_delta_layers`. changes or fixes this to be Replacement::NotFound instead, making it more clear that an error would only be raised for actual preconditions, like trying to replace a layer with a completely unrelated layer. 
--- pageserver/src/tenant/layer_map.rs | 44 ++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 8d7d9c6f8f..4c659be9aa 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -154,11 +154,7 @@ where expected: &Arc, new: Arc, ) -> anyhow::Result>> { - fail::fail_point!("layermap-replace-notfound", |_| Ok( - // this is not what happens if an L0 layer was not found a anyhow error but perhaps - // that should be changed. this is good enough to show a replacement failure. - Replacement::NotFound - )); + fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound)); self.layer_map.replace_historic_noflush(expected, new) } @@ -340,12 +336,15 @@ where let l0_index = if expected_l0 { // find the index in case replace worked, we need to replace that as well - Some( - self.l0_delta_layers - .iter() - .position(|slot| Self::compare_arced_layers(slot, expected)) - .ok_or_else(|| anyhow::anyhow!("existing l0 delta layer was not found"))?, - ) + let pos = self + .l0_delta_layers + .iter() + .position(|slot| Self::compare_arced_layers(slot, expected)); + + if pos.is_none() { + return Ok(Replacement::NotFound); + } + pos } else { None }; @@ -804,6 +803,26 @@ mod tests { ) } + #[test] + fn replacing_missing_l0_is_notfound() { + // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should + // however only happen for precondition failures. 
+ + let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69"; + let layer = LayerFileName::from_str(layer).unwrap(); + let layer = LayerDescriptor::from(layer); + + // same skeletan construction; see scenario below + let not_found: Arc = Arc::new(layer.clone()); + let new_version: Arc = Arc::new(layer); + + let mut map = LayerMap::default(); + + let res = map.batch_update().replace_historic(¬_found, new_version); + + assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}"); + } + fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) { let name = LayerFileName::from_str(layer_name).unwrap(); let skeleton = LayerDescriptor::from(name); @@ -813,7 +832,8 @@ mod tests { let mut map = LayerMap::default(); - // two disjoint Arcs in different lifecycle phases. + // two disjoint Arcs in different lifecycle phases. even if it seems they must be the + // same layer, we use LayerMap::compare_arced_layers as the identity of layers. assert!(!LayerMap::compare_arced_layers(&remote, &downloaded)); let expected_in_counts = (1, usize::from(expected_l0)); From 68ae020b3725eaac7cac80cdb512718047883341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Tue, 14 Mar 2023 15:23:46 +0200 Subject: [PATCH 152/426] Use RollingUpdate strategy also for legacy proxy (#3814) ## Describe your changes We have previously changed the neon-proxy to use RollingUpdate. This should be enabled in legacy proxy too in order to avoid breaking connections for the clients and allow for example backups to run even during deployment. 
(https://github.com/neondatabase/neon/pull/3683) ## Issue ticket number and link https://github.com/neondatabase/neon/issues/3333 --- ...od-us-west-2-eta.neon-proxy-scram-legacy.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index e67a3e4461..d23ea41bd7 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon From 3d869cbcde47ebcadf1d5250509e5dda1a491d77 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 14 Mar 2023 14:25:44 +0100 Subject: [PATCH 153/426] Replace flake8 and isort with ruff (#3810) - Introduce ruff (https://beta.ruff.rs/) to replace flake8 and isort - Update mypy and black --- .github/workflows/build_and_test.yml | 7 +- docs/sourcetree.md | 5 +- poetry.lock | 264 ++++++++---------- pre-commit.py | 23 +- pyproject.toml | 27 +- scripts/export_import_between_pageservers.py | 5 +- scripts/reformat | 3 +- scripts/sk_cleanup_tenants/script.py | 1 - setup.cfg | 8 - test_runner/fixtures/benchmark_fixture.py | 1 + test_runner/fixtures/compare_fixtures.py | 9 +- test_runner/fixtures/neon_fixtures.py | 20 +- test_runner/fixtures/types.py | 4 +- test_runner/fixtures/utils.py | 3 +- test_runner/performance/test_branching.py | 1 + test_runner/performance/test_bulk_update.py | 1 - 
.../performance/test_compare_pg_stats.py | 1 + test_runner/performance/test_gist_build.py | 1 - test_runner/performance/test_latency.py | 1 + test_runner/performance/test_layer_map.py | 1 - .../performance/test_parallel_copy_to.py | 1 - .../performance/test_wal_backpressure.py | 1 + .../performance/test_write_amplification.py | 1 - test_runner/regress/test_crafted_wal_end.py | 1 + test_runner/regress/test_fullbackup.py | 7 +- test_runner/regress/test_gc_aggressive.py | 2 - test_runner/regress/test_layer_eviction.py | 1 - test_runner/regress/test_ondemand_download.py | 2 +- test_runner/regress/test_read_validation.py | 2 - test_runner/regress/test_readonly_node.py | 2 +- test_runner/regress/test_remote_storage.py | 2 - test_runner/regress/test_tenant_detach.py | 1 + test_runner/regress/test_tenant_relocation.py | 1 - test_runner/regress/test_timeline_delete.py | 1 - test_runner/regress/test_timeline_size.py | 4 - test_runner/regress/test_truncate.py | 1 - test_runner/regress/test_wal_acceptor.py | 1 - .../regress/test_wal_acceptor_async.py | 2 - test_runner/regress/test_wal_restore.py | 7 +- 39 files changed, 177 insertions(+), 249 deletions(-) delete mode 100644 setup.cfg diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d479201305..e056cf0fcf 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -74,15 +74,12 @@ jobs: - name: Install Python deps run: ./scripts/pysync - - name: Run isort to ensure code format - run: poetry run isort --diff --check . + - name: Run ruff to ensure code format + run: poetry run ruff . - name: Run black to ensure code format run: poetry run black --diff --check . - - name: Run flake8 to ensure code format - run: poetry run flake8 . - - name: Run mypy to check types run: poetry run mypy . 
diff --git a/docs/sourcetree.md b/docs/sourcetree.md index db57338a71..95bed83ae5 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -129,13 +129,12 @@ Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `black`, `isort` and type hints via `mypy`. +We force code formatting via `black`, `ruff`, and type hints via `mypy`. Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run isort . # Imports are reformatted poetry run black . # All code is reformatted -poetry run flake8 . # Python linter +poetry run ruff . # Python linter poetry run mypy . # Ensure there are no typing errors ``` diff --git a/poetry.lock b/poetry.lock index bc2c56d74c..011d5d7817 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand. [[package]] name = "aiohttp" @@ -253,43 +253,46 @@ files = [ [[package]] name = "black" -version = "22.6.0" +version = "23.1.0" description = "The uncompromising code formatter." 
category = "dev" optional = false -python-versions = ">=3.6.2" +python-versions = ">=3.7" files = [ - {file = "black-22.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f586c26118bc6e714ec58c09df0157fe2d9ee195c764f630eb0d8e7ccce72e69"}, - {file = "black-22.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b270a168d69edb8b7ed32c193ef10fd27844e5c60852039599f9184460ce0807"}, - {file = "black-22.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6797f58943fceb1c461fb572edbe828d811e719c24e03375fd25170ada53825e"}, - {file = "black-22.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c85928b9d5f83b23cee7d0efcb310172412fbf7cb9d9ce963bd67fd141781def"}, - {file = "black-22.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:f6fe02afde060bbeef044af7996f335fbe90b039ccf3f5eb8f16df8b20f77666"}, - {file = "black-22.6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cfaf3895a9634e882bf9d2363fed5af8888802d670f58b279b0bece00e9a872d"}, - {file = "black-22.6.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94783f636bca89f11eb5d50437e8e17fbc6a929a628d82304c80fa9cd945f256"}, - {file = "black-22.6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2ea29072e954a4d55a2ff58971b83365eba5d3d357352a07a7a4df0d95f51c78"}, - {file = "black-22.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e439798f819d49ba1c0bd9664427a05aab79bfba777a6db94fd4e56fae0cb849"}, - {file = "black-22.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187d96c5e713f441a5829e77120c269b6514418f4513a390b0499b0987f2ff1c"}, - {file = "black-22.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:074458dc2f6e0d3dab7928d4417bb6957bb834434516f21514138437accdbe90"}, - {file = "black-22.6.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a218d7e5856f91d20f04e931b6f16d15356db1c846ee55f01bac297a705ca24f"}, - {file = "black-22.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:568ac3c465b1c8b34b61cd7a4e349e93f91abf0f9371eda1cf87194663ab684e"}, - {file = "black-22.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6c1734ab264b8f7929cef8ae5f900b85d579e6cbfde09d7387da8f04771b51c6"}, - {file = "black-22.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9a3ac16efe9ec7d7381ddebcc022119794872abce99475345c5a61aa18c45ad"}, - {file = "black-22.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:b9fd45787ba8aa3f5e0a0a98920c1012c884622c6c920dbe98dbd05bc7c70fbf"}, - {file = "black-22.6.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ba9be198ecca5031cd78745780d65a3f75a34b2ff9be5837045dce55db83d1c"}, - {file = "black-22.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a3db5b6409b96d9bd543323b23ef32a1a2b06416d525d27e0f67e74f1446c8f2"}, - {file = "black-22.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:560558527e52ce8afba936fcce93a7411ab40c7d5fe8c2463e279e843c0328ee"}, - {file = "black-22.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b154e6bbde1e79ea3260c4b40c0b7b3109ffcdf7bc4ebf8859169a6af72cd70b"}, - {file = "black-22.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:4af5bc0e1f96be5ae9bd7aaec219c901a94d6caa2484c21983d043371c733fc4"}, - {file = "black-22.6.0-py3-none-any.whl", hash = "sha256:ac609cf8ef5e7115ddd07d85d988d074ed00e10fbc3445aee393e70164a2219c"}, - {file = "black-22.6.0.tar.gz", hash = "sha256:6c6d39e28aed379aec40da1c65434c77d75e65bb59a1e1c283de545fb4e7c6c9"}, + {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, + {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, + {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, + {file = 
"black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, + {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, + {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, + {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, + {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, + {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, + {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, + {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, + {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, + {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, + {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, + {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, + {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, + {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash 
= "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, + {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, + {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, + {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, + {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, + {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, + {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, + {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, + {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, ] [package.dependencies] click = ">=8.0.0" mypy-extensions = ">=0.4.3" +packaging = ">=22.0" pathspec = ">=0.9.0" platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] @@ -884,6 +887,8 @@ files = [ {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"}, {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"}, {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"}, + {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"}, + {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"}, {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"}, {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"}, {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"}, @@ -963,23 +968,6 @@ files = [ [package.extras] testing = ["pre-commit"] -[[package]] -name = "flake8" -version = "5.0.4" -description = "the modular source code checker: pep8 pyflakes and co" -category = "dev" -optional = false -python-versions = ">=3.6.1" -files = [ - {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, - {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, -] - -[package.dependencies] -mccabe = ">=0.7.0,<0.8.0" -pycodestyle = ">=2.9.0,<2.10.0" -pyflakes = ">=2.5.0,<2.6.0" - [[package]] name = "flask" version = "2.1.3" @@ -1075,24 +1063,6 @@ files = [ {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] -[[package]] -name = "isort" -version = "5.10.1" -description = "A Python utility / library to sort Python imports." 
-category = "dev" -optional = false -python-versions = ">=3.6.1,<4.0" -files = [ - {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"}, - {file = "isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"}, -] - -[package.extras] -colors = ["colorama (>=0.4.3,<0.5.0)"] -pipfile-deprecated-finder = ["pipreqs", "requirementslib"] -plugins = ["setuptools"] -requirements-deprecated-finder = ["pip-api", "pipreqs"] - [[package]] name = "itsdangerous" version = "2.1.2" @@ -1238,6 +1208,7 @@ category = "main" optional = false python-versions = "*" files = [ + {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, ] @@ -1294,18 +1265,6 @@ files = [ {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, ] -[[package]] -name = "mccabe" -version = "0.7.0" -description = "McCabe checker, plugin for flake8" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, - {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, -] - [[package]] name = "moto" version = "4.1.2" @@ -1453,46 +1412,42 @@ files = [ [[package]] name = "mypy" -version = "0.991" +version = "1.1.1" description = "Optional static typing for Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, - {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, - {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"}, - {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"}, - {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"}, - {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash = "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"}, - {file = "mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"}, - {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"}, - {file = "mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"}, - {file = "mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"}, - {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"}, - {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"}, - {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"}, - {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"}, - {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"}, - {file = 
"mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"}, - {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"}, - {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"}, - {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"}, - {file = "mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"}, - {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"}, - {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"}, - {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"}, - {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"}, - {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"}, - {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"}, - {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"}, - {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"}, - {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"}, - {file = "mypy-0.991.tar.gz", 
hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"}, + {file = "mypy-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39c7119335be05630611ee798cc982623b9e8f0cff04a0b48dfc26100e0b97af"}, + {file = "mypy-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:61bf08362e93b6b12fad3eab68c4ea903a077b87c90ac06c11e3d7a09b56b9c1"}, + {file = "mypy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbb19c9f662e41e474e0cff502b7064a7edc6764f5262b6cd91d698163196799"}, + {file = "mypy-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:315ac73cc1cce4771c27d426b7ea558fb4e2836f89cb0296cbe056894e3a1f78"}, + {file = "mypy-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:5cb14ff9919b7df3538590fc4d4c49a0f84392237cbf5f7a816b4161c061829e"}, + {file = "mypy-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:26cdd6a22b9b40b2fd71881a8a4f34b4d7914c679f154f43385ca878a8297389"}, + {file = "mypy-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b5f81b40d94c785f288948c16e1f2da37203c6006546c5d947aab6f90aefef2"}, + {file = "mypy-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21b437be1c02712a605591e1ed1d858aba681757a1e55fe678a15c2244cd68a5"}, + {file = "mypy-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d809f88734f44a0d44959d795b1e6f64b2bbe0ea4d9cc4776aa588bb4229fc1c"}, + {file = "mypy-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:a380c041db500e1410bb5b16b3c1c35e61e773a5c3517926b81dfdab7582be54"}, + {file = "mypy-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7c7b708fe9a871a96626d61912e3f4ddd365bf7f39128362bc50cbd74a634d5"}, + {file = "mypy-1.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c10fa12df1232c936830839e2e935d090fc9ee315744ac33b8a32216b93707"}, + {file = "mypy-1.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0a28a76785bf57655a8ea5eb0540a15b0e781c807b5aa798bd463779988fa1d5"}, + {file = 
"mypy-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:ef6a01e563ec6a4940784c574d33f6ac1943864634517984471642908b30b6f7"}, + {file = "mypy-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d64c28e03ce40d5303450f547e07418c64c241669ab20610f273c9e6290b4b0b"}, + {file = "mypy-1.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64cc3afb3e9e71a79d06e3ed24bb508a6d66f782aff7e56f628bf35ba2e0ba51"}, + {file = "mypy-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce61663faf7a8e5ec6f456857bfbcec2901fbdb3ad958b778403f63b9e606a1b"}, + {file = "mypy-1.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2b0c373d071593deefbcdd87ec8db91ea13bd8f1328d44947e88beae21e8d5e9"}, + {file = "mypy-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:2888ce4fe5aae5a673386fa232473014056967f3904f5abfcf6367b5af1f612a"}, + {file = "mypy-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:19ba15f9627a5723e522d007fe708007bae52b93faab00f95d72f03e1afa9598"}, + {file = "mypy-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:59bbd71e5c58eed2e992ce6523180e03c221dcd92b52f0e792f291d67b15a71c"}, + {file = "mypy-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9401e33814cec6aec8c03a9548e9385e0e228fc1b8b0a37b9ea21038e64cdd8a"}, + {file = "mypy-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b398d8b1f4fba0e3c6463e02f8ad3346f71956b92287af22c9b12c3ec965a9f"}, + {file = "mypy-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:69b35d1dcb5707382810765ed34da9db47e7f95b3528334a3c999b0c90fe523f"}, + {file = "mypy-1.1.1-py3-none-any.whl", hash = "sha256:4e4e8b362cdf99ba00c2b218036002bdcdf1e0de085cdb296a49df03fb31dfc4"}, + {file = "mypy-1.1.1.tar.gz", hash = "sha256:ae9ceae0f5b9059f33dbc62dea087e942c0ccab4b7a003719cb70f9b8abfa32f"}, ] [package.dependencies] -mypy-extensions = ">=0.4.3" +mypy-extensions = ">=1.0.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} typing-extensions = ">=3.10" @@ -1519,14 +1474,14 @@ 
typing-extensions = ">=4.1.0" [[package]] name = "mypy-extensions" -version = "0.4.3" -description = "Experimental type system extensions for programs checked with the mypy typechecker." +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." category = "dev" optional = false -python-versions = "*" +python-versions = ">=3.5" files = [ - {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, - {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] [[package]] @@ -1591,19 +1546,16 @@ requests = ["requests"] [[package]] name = "packaging" -version = "21.3" +version = "23.0" description = "Core utilities for Python packages" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, - {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, + {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, + {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] -[package.dependencies] -pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" - [[package]] name = "pathspec" version = "0.9.0" @@ -1712,6 +1664,7 @@ python-versions = ">=3.6" files = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = 
"psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -1745,6 +1698,7 @@ files = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = 
"sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -1756,6 +1710,7 @@ files = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -1788,33 +1743,10 @@ category = "main" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = 
"sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] -[[package]] -name = "pycodestyle" -version = "2.9.1" -description = "Python style guide checker" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, - {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, -] - [[package]] name = "pycparser" version = "2.21" @@ -1827,18 +1759,6 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] -[[package]] -name = "pyflakes" -version = "2.5.0" -description = "passive checker of Python programs" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, - {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, -] - [[package]] name = "pyjwt" 
version = "2.4.0" @@ -2008,8 +1928,8 @@ files = [ [package.dependencies] pytest = [ - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] [[package]] @@ -2121,6 +2041,13 @@ files = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = 
"PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, @@ -2205,6 +2132,33 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "ruff" +version = "0.0.255" +description = "An extremely fast Python linter, written in Rust." +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.0.255-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b2d71fb6a7e50501a2473864acffc85dee6b750c25db198f7e71fe1dbbff1aad"}, + {file = "ruff-0.0.255-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:6c97d746861a6010f941179e84bba9feb8a871815667471d9ed6beb98d45c252"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a7fa60085079b91a298b963361be9b1b1c724582af6c84be954cbabdbd9309a"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c089f7141496334ab5a127b54ce55e41f0d6714e68a4453a1e09d2204cdea8c3"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0423908caa7d437a416b853214565b9c33bbd1106c4f88147982216dddcbbd96"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:981493e92547cacbb8e0874904ec049fe744507ee890dc8736caf89a8864f9a7"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59d5193d2aedb35db180824462b374dbcfc306b2e76076245088afa6e5837df2"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd5e00733c9d160c8a34a22e62b390da9d1e9f326676402421cb8c1236beefc3"}, + {file = "ruff-0.0.255-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:694418cf41838bd19c6229e4e1b2d04505b1e6b86fe3ab81165484fc96d36f01"}, + {file = "ruff-0.0.255-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5d0408985c9777369daebb5d3340a99e9f7294bdd7120642239261508185cf89"}, + {file = "ruff-0.0.255-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abd6376ef9d12f370d95a8c7c98682fbb9bfedfba59f40e84a816fef8ddcb8de"}, + {file = "ruff-0.0.255-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f9b1a5df0bc09193cbef58a6f78e4a9a0b058a4f9733c0442866d078006d1bb9"}, + {file = "ruff-0.0.255-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6a25c5f4ff087445b2e1bbcb9963f2ae7c868d65e4a8d5f84c36c12f71571179"}, + {file = "ruff-0.0.255-py3-none-win32.whl", hash = "sha256:1ff87a8310354f9f1a099625e54a27fdd6756d9cd2a40b45922f2e943daf982d"}, + {file = "ruff-0.0.255-py3-none-win_amd64.whl", hash = "sha256:f3d8416be618f023f93ec4fd6ee3048585ef85dba9563b2a7e38fc7e5131d5b1"}, + {file = "ruff-0.0.255-py3-none-win_arm64.whl", hash = "sha256:8ba124819624145d7b6b53add40c367c44318893215ffc1bfe3d72e0225a1c9c"}, + {file = "ruff-0.0.255.tar.gz", hash = "sha256:f9eb1d3b2eecbeedae419fa494c4e2a5e4484baf93a1ce0f81eddb005e1919c5"}, +] + [[package]] name = "s3transfer" version = "0.6.0" @@ -2643,4 +2597,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd" +content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91" diff --git a/pre-commit.py b/pre-commit.py index 560df6cd0c..dc0b9ed588 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -43,17 +43,13 @@ def black(fix_inplace: bool) -> str: return cmd -def isort(fix_inplace: bool) -> str: - cmd = "poetry run isort" - if not fix_inplace: - cmd += " --diff --check" +def ruff(fix_inplace: bool) -> str: + cmd = "poetry run ruff" + if fix_inplace: + cmd += " --fix" return cmd -def flake8() -> str: - return "poetry run flake8" - 
- def mypy() -> str: return "poetry run mypy" @@ -112,13 +108,6 @@ if __name__ == "__main__": changed_files=files, no_color=args.no_color, ) - check( - name="isort", - suffix=".py", - cmd=isort(fix_inplace=args.fix_inplace), - changed_files=files, - no_color=args.no_color, - ) check( name="black", suffix=".py", @@ -127,9 +116,9 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="flake8", + name="ruff", suffix=".py", - cmd=flake8(), + cmd=ruff(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) diff --git a/pyproject.toml b/pyproject.toml index 415f7f1ae7..f21c12b2e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,11 +35,10 @@ types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" aiohttp = "3.7.4" -[tool.poetry.dev-dependencies] -flake8 = "^5.0.4" -mypy = "==0.991" -black = "^22.6.0" -isort = "^5.10.1" +[tool.poetry.group.dev.dependencies] +black = "^23.1.0" +mypy = "==1.1.1" +ruff = "^0.0.255" [build-system] requires = ["poetry-core>=1.0.0"] @@ -53,14 +52,6 @@ extend-exclude = ''' )/ ''' -[tool.isort] -profile = "black" -line_length = 100 -skip_gitignore = true -skip = [ - "vendor", -] - [tool.mypy] exclude = "^vendor/" check_untyped_defs = true @@ -80,3 +71,13 @@ module = [ "pg8000.*", ] ignore_missing_imports = true + +[tool.ruff] +extend-exclude = ["vendor/"] +ignore = ["E501"] +select = [ + "E", # pycodestyle + "F", # Pyflakes + "I", # isort + "W", # pycodestyle +] diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index db2b5e81ab..4292c981a9 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -308,8 +308,8 @@ def lsn_to_hex(num: int) -> str: def lsn_from_hex(lsn_hex: str) -> int: """Convert lsn from hex notation to int.""" - l, r = lsn_hex.split("/") - return (int(l, 16) << 32) + int(r, 16) + left, right = lsn_hex.split("/") + return (int(left, 16) << 32) + int(right, 16) def 
remote_consistent_lsn( @@ -398,7 +398,6 @@ def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) for relname, filepath in result: if filepath is not None: - if database == "template0copy": # Add all template0copy paths to template0 prefix = f"base/{oid}/" diff --git a/scripts/reformat b/scripts/reformat index 5346c78ead..8688044f66 100755 --- a/scripts/reformat +++ b/scripts/reformat @@ -6,6 +6,5 @@ set -euox pipefail echo 'Reformatting Rust code' cargo fmt echo 'Reformatting Python code' -poetry run isort test_runner scripts -poetry run flake8 test_runner scripts +poetry run ruff --fix test_runner scripts poetry run black test_runner scripts diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index 4d010d85ea..fa22433614 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -68,7 +68,6 @@ def call_delete_tenant_api(tenant_id): def cleanup_tenant(tenant_id): - tenant_dir = Path(f"/storage/safekeeper/data/{tenant_id}") if not tenant_dir.exists(): diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index a067ee731d..0000000000 --- a/setup.cfg +++ /dev/null @@ -1,8 +0,0 @@ -[flake8] -# Move config to pyproject.toml as soon as flake8 supports it -# https://github.com/PyCQA/flake8/issues/234 -extend-ignore = - E203, # Whitespace before ':' -- conflicts with black - E266, # Too many leading '#' for block comment -- we use it for formatting sometimes - E501 # Line too long -- black sorts it out -extend-exclude = vendor/ diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index a39aaf8241..67a99aa452 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -17,6 +17,7 @@ import pytest from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.terminal import TerminalReporter + from 
fixtures.neon_fixtures import NeonPageserver from fixtures.types import TenantId, TimelineId diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 0ba926c8d2..b328cea5c6 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -6,8 +6,15 @@ from typing import Dict, Iterator, List import pytest from _pytest.fixtures import FixtureRequest + from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.neon_fixtures import NeonEnv, PgBin, PgProtocol, RemotePostgres, VanillaPostgres +from fixtures.neon_fixtures import ( + NeonEnv, + PgBin, + PgProtocol, + RemotePostgres, + VanillaPostgres, +) from fixtures.pg_stats import PgStatTable diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 70a6f1809e..a25709a305 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -35,6 +35,13 @@ import requests from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest + +# Type-related stuff +from psycopg2.extensions import connection as PgConnection +from psycopg2.extensions import cursor as PgCursor +from psycopg2.extensions import make_dsn, parse_dsn +from typing_extensions import Literal + from fixtures.log_helper import log from fixtures.metrics import Metrics, parse_metrics from fixtures.types import Lsn, TenantId, TimelineId @@ -46,12 +53,6 @@ from fixtures.utils import ( subprocess_capture, ) -# Type-related stuff -from psycopg2.extensions import connection as PgConnection -from psycopg2.extensions import cursor as PgCursor -from psycopg2.extensions import make_dsn, parse_dsn -from typing_extensions import Literal - """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. 
@@ -1243,7 +1244,6 @@ class PageserverHttpClient(requests.Session): include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, ) -> List[Dict[str, Any]]: - params = {} if include_non_incremental_logical_size: params["include-non-incremental-logical-size"] = "true" @@ -1375,7 +1375,6 @@ class PageserverHttpClient(requests.Session): timeline_id: TimelineId, max_concurrent_downloads: int, ) -> dict[str, Any]: - body = { "max_concurrent_downloads": max_concurrent_downloads, } @@ -1668,7 +1667,7 @@ class AbstractNeonCli(abc.ABC): env_vars["POSTGRES_DISTRIB_DIR"] = str(self.env.pg_distrib_dir) if self.env.rust_log_override is not None: env_vars["RUST_LOG"] = self.env.rust_log_override - for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): + for extra_env_key, extra_env_value in (extra_env_vars or {}).items(): env_vars[extra_env_key] = extra_env_value # Pass coverage settings @@ -2852,7 +2851,6 @@ class PostgresFactory: lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: - pg = Postgres( self.env, tenant_id=tenant_id or self.env.initial_tenant, @@ -2876,7 +2874,6 @@ class PostgresFactory: lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> Postgres: - pg = Postgres( self.env, tenant_id=tenant_id or self.env.initial_tenant, @@ -3323,7 +3320,6 @@ def check_restored_datadir_content( log.info(f"filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}") for f in mismatch: - f1 = os.path.join(pg.pgdata_dir, f) f2 = os.path.join(restored_dir_path, f) stdout_filename = "{}.filediff".format(f2) diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index 2bb962d44a..7d179cc7fb 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -17,8 +17,8 @@ class Lsn: self.lsn_int = x else: """Convert lsn from hex notation to int.""" - l, r = x.split("/") - self.lsn_int = (int(l, 16) << 32) + int(r, 
16) + left, right = x.split("/") + self.lsn_int = (int(left, 16) << 32) + int(right, 16) assert 0 <= self.lsn_int <= 0xFFFFFFFF_FFFFFFFF def __str__(self) -> str: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index df83fc6377..ce03658e8f 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -8,9 +8,10 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Tuple, TypeVar import allure # type: ignore -from fixtures.log_helper import log from psycopg2.extensions import cursor +from fixtures.log_helper import log + Fn = TypeVar("Fn", bound=Callable[..., Any]) diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py index 0fe7306f87..4eaec40096 100644 --- a/test_runner/performance/test_branching.py +++ b/test_runner/performance/test_branching.py @@ -5,6 +5,7 @@ from typing import List from fixtures.benchmark_fixture import PgBenchRunResult from fixtures.compare_fixtures import NeonCompare from fixtures.neon_fixtures import fork_at_current_lsn + from performance.test_perf_pgbench import utc_now_timestamp # ----------------------------------------------------------------------- diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py index f8e29cda69..7aa6f09a40 100644 --- a/test_runner/performance/test_bulk_update.py +++ b/test_runner/performance/test_bulk_update.py @@ -13,7 +13,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn @pytest.mark.timeout(10000) @pytest.mark.parametrize("fillfactor", [10, 50, 100]) def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor): - env = neon_env_builder.init_start() n_records = 1000000 diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py index d39ea55fbb..d5dd1b4bd0 100644 --- a/test_runner/performance/test_compare_pg_stats.py +++ 
b/test_runner/performance/test_compare_pg_stats.py @@ -6,6 +6,7 @@ from typing import List import pytest from fixtures.compare_fixtures import PgCompare from fixtures.pg_stats import PgStatTable + from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index 311030b99d..45900d0c7f 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -13,7 +13,6 @@ def test_gist_buffering_build(neon_with_baseline: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - # Create test table. cur.execute("create table gist_point_tbl(id int4, p point)") cur.execute( diff --git a/test_runner/performance/test_latency.py b/test_runner/performance/test_latency.py index 9aa618650d..257e0421af 100644 --- a/test_runner/performance/test_latency.py +++ b/test_runner/performance/test_latency.py @@ -3,6 +3,7 @@ import threading import pytest from fixtures.compare_fixtures import PgCompare from fixtures.neon_fixtures import Postgres + from performance.test_perf_pgbench import get_scales_matrix from performance.test_wal_backpressure import record_read_latency diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index ac49ea9051..fb29c05273 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -7,7 +7,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Benchmark searching the layer map, when there are a lot of small layer files. 
# def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): - env = neon_env_builder.init_start() n_iters = 10 n_records = 100000 diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index b4a25e0edc..746c1b73dd 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -36,7 +36,6 @@ async def parallel_load_different_tables(pg: PgProtocol, n_parallel: int): # Load 5 different tables in parallel with COPY TO def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_parallel=5): - env = neon_with_baseline conn = env.pg.connect() cur = conn.cursor() diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index f9a18c84fd..3939ca30b6 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -10,6 +10,7 @@ from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin from fixtures.types import Lsn + from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 30c217e392..3e290b3996 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -22,7 +22,6 @@ def test_write_amplification(neon_with_baseline: PgCompare): with conn.cursor() as cur: with env.record_pageserver_writes("pageserver_writes"): with env.record_duration("run"): - # NOTE: Because each iteration updates every table already created, # the runtime and write amplification is O(n^2), where n is the # number of iterations. 
diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index e94c9a2bd0..9899d424d1 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -2,6 +2,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft + # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index fc515e5878..b3d58edf6b 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -2,7 +2,12 @@ import os from pathlib import Path from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + PortDistributor, + VanillaPostgres, +) from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 77438e1b64..702d94c691 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -68,7 +68,6 @@ async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId): # (repro for https://github.com/neondatabase/neon/issues/1047) # def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): - # Disable pitr, because here we want to test branch creation after GC neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() @@ -101,7 +100,6 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: 
RemoteStorageKind): - # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index e7c9713f98..80e7ae8d7e 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -146,7 +146,6 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.LOCAL_FS, test_name="test_gc_of_remote_layers", diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 12088c3353..fd13651427 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -250,7 +250,7 @@ def test_ondemand_download_timetravel( # Run queries at different points in time num_layers_downloaded = [0] resident_size = [get_resident_physical_size()] - for (checkpoint_number, lsn) in lsns: + for checkpoint_number, lsn in lsns: pg_old = env.postgres.create_start( branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn ) diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 1e49c3b69f..47135dc56c 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -22,7 +22,6 @@ def test_read_validation(neon_simple_env: NeonEnv): with closing(pg.connect()) as con: with con.cursor() as c: - for e in extensions: c.execute("create extension if not exists {};".format(e)) @@ -150,7 +149,6 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): with closing(pg.connect()) as con: with con.cursor() as c: - for e in extensions: c.execute("create extension if not exists {};".format(e)) diff --git a/test_runner/regress/test_readonly_node.py 
b/test_runner/regress/test_readonly_node.py index 62c3ead0a7..7487757071 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -146,7 +146,7 @@ def test_timetravel(neon_simple_env: NeonEnv): env.pageserver.stop() env.pageserver.start() - for (i, lsn) in lsns: + for i, lsn in lsns: pg_old = env.postgres.create_start( branch_name="test_timetravel", node_name=f"test_old_lsn_{i}", lsn=lsn ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index dd0b576c5e..1f6f0c67cc 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -212,7 +212,6 @@ def test_remote_storage_upload_queue_retries( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, test_name="test_remote_storage_upload_queue_retries", @@ -374,7 +373,6 @@ def test_remote_timeline_client_calls_started_metric( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, test_name="test_remote_timeline_client_metrics", diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index ac1f7b2891..27ec38e1be 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -177,6 +177,7 @@ async def reattach_while_busy( # running, and when we retry the queries, they should start working # after the attach has finished. + # FIXME: # # This is pretty unstable at the moment. 
I've seen it fail with a warning like this: diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 1b58937e2a..247fed846e 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -375,7 +375,6 @@ def test_tenant_relocation( neon_env_builder.broker, neon_env_builder.pg_distrib_dir, ): - # Migrate either by attaching from s3 or import/export basebackup if method == "major": cmd = [ diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index d8f9ef2f89..2226cab8ff 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -40,7 +40,6 @@ def test_timeline_delete(neon_simple_env: NeonEnv): with pytest.raises( PageserverApiException, match="Cannot delete timeline which has child timelines" ): - timeline_path = ( env.repo_dir / "tenants" diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index ea4b65c9a8..c4e8e7aa07 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -90,7 +90,6 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): cur.execute("CREATE DATABASE foodb") with closing(pgmain.connect(dbname="foodb")) as conn: with conn.cursor() as cur2: - cur2.execute("CREATE TABLE foo (t text)") cur2.execute( """ @@ -308,7 +307,6 @@ def test_timeline_initial_logical_size_calculation_cancellation( def test_timeline_physical_size_init( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): - if remote_storage_kind is not None: neon_env_builder.enable_remote_storage( remote_storage_kind, "test_timeline_physical_size_init" @@ -385,7 +383,6 @@ def test_timeline_physical_size_post_checkpoint( def test_timeline_physical_size_post_compaction( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): - 
if remote_storage_kind is not None: neon_env_builder.enable_remote_storage( remote_storage_kind, "test_timeline_physical_size_init" @@ -440,7 +437,6 @@ def test_timeline_physical_size_post_compaction( def test_timeline_physical_size_post_gc( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): - if remote_storage_kind is not None: neon_env_builder.enable_remote_storage( remote_storage_kind, "test_timeline_physical_size_init" diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py index a358f94192..cfe8a7f067 100644 --- a/test_runner/regress/test_truncate.py +++ b/test_runner/regress/test_truncate.py @@ -7,7 +7,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Test truncation of FSM and VM forks of a relation # def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): - env = neon_env_builder.init_start() n_records = 10000 n_iter = 10 diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 489afb7b93..407085a01a 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -709,7 +709,6 @@ def test_sync_safekeepers( pg_bin: PgBin, port_distributor: PortDistributor, ): - # We don't really need the full environment for this test, just the # safekeepers would be enough. 
neon_env_builder.num_safekeepers = 3 diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 70ae6bae18..f10a40690e 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -521,7 +521,6 @@ async def run_race_conditions(env: NeonEnv, pg: Postgres): # do inserts while concurrently getting up/down subsets of acceptors def test_race_conditions(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -588,7 +587,6 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres): # do inserts while restarting postgres and messing with safekeeper addresses def test_wal_lagging(neon_env_builder: NeonEnvBuilder): - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index e1b1e03515..63d0b46f63 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -1,6 +1,11 @@ from pathlib import Path -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + PortDistributor, + VanillaPostgres, +) from fixtures.types import TenantId From 15b692ccc945de0179cefe0815aebf8554649533 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 14 Mar 2023 15:27:52 +0200 Subject: [PATCH 154/426] test: more strict finding of WARN, ERROR lines (#3798) this prevents flakyness when `WARN|ERROR` appears in some other part of the line, for example in a random filename. 
--- test_runner/fixtures/neon_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a25709a305..c5e260a962 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2154,7 +2154,7 @@ class NeonPageserver(PgProtocol): def assert_no_errors(self): logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") - error_or_warn = re.compile("ERROR|WARN") + error_or_warn = re.compile(r"\s(ERROR|WARN)") errors = [] while True: line = logfile.readline() From c23c8946a3a867229735ae7732417fa2b0cb98f9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 14 Mar 2023 15:29:02 +0200 Subject: [PATCH 155/426] chore: clippies introduced with rust 1.68 (#3781) - handle automatically fixable future clippies - tune run-clippy.sh to remove macos specifics which we no longer have Co-authored-by: Alexander Bayandin --- libs/pq_proto/src/lib.rs | 8 +++----- libs/utils/src/fs_ext.rs | 2 +- pageserver/src/tenant.rs | 7 ++----- pageserver/src/tenant/timeline.rs | 4 +--- proxy/src/scram/messages.rs | 2 +- run_clippy.sh | 18 +++++------------- 6 files changed, 13 insertions(+), 28 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 46d531239a..656c0ff312 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -265,11 +265,9 @@ impl FeMessage { b'c' => Ok(Some(FeMessage::CopyDone)), b'f' => Ok(Some(FeMessage::CopyFail)), b'p' => Ok(Some(FeMessage::PasswordMessage(msg))), - tag => { - return Err(ProtocolError::Protocol(format!( - "unknown message tag: {tag},'{msg:?}'" - ))) - } + tag => Err(ProtocolError::Protocol(format!( + "unknown message tag: {tag},'{msg:?}'" + ))), } } } diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index b8d00df409..d2cb7be816 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -11,7 +11,7 @@ where P: AsRef, { fn 
is_empty_dir(&self) -> io::Result { - Ok(fs::read_dir(self)?.into_iter().next().is_none()) + Ok(fs::read_dir(self)?.next().is_none()) } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6a373ad520..1fb312fe07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1243,11 +1243,8 @@ impl Tenant { "Cannot run GC iteration on inactive tenant" ); - let gc_result = self - .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) - .await; - - gc_result + self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) + .await } /// Perform one compaction iteration. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c304791ee2..a8847ec773 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3147,9 +3147,7 @@ impl Timeline { } fail_point!("delta-layer-writer-fail-before-finish", |_| { - return Err( - anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(), - ); + Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into()) }); writer.as_mut().unwrap().put_value(key, lsn, value)?; diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 05855e74df..b59baec508 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -14,7 +14,7 @@ pub const SCRAM_RAW_NONCE_LEN: usize = 18; fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { for mut chars in parts.map(|s| s.chars()) { let attr = chars.next()?; - if !('a'..='z').contains(&attr) && !('A'..='Z').contains(&attr) { + if !attr.is_ascii_alphabetic() { return None; } let eq = chars.next()?; diff --git a/run_clippy.sh b/run_clippy.sh index be07a0110a..ae9482ee96 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -8,21 +8,13 @@ # warnings and errors right in the editor. # In vscode, this setting is Rust-analyzer>Check On Save:Command - -# Not every feature is supported in macOS builds. 
Avoid running regular linting -# script that checks every feature. -# # manual-range-contains wants # !(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len) # instead of # len < 4 || len > MAX_STARTUP_PACKET_LENGTH # , let's disagree. -if [[ "$OSTYPE" == "darwin"* ]]; then - # no extra features to test currently, add more here when needed - cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -A clippy::manual-range-contains -D warnings -else - # * `-A unknown_lints` – do not warn about unknown lint suppressions - # that people with newer toolchains might use - # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) - cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -A clippy::manual-range-contains -D warnings -fi + +# * `-A unknown_lints` – do not warn about unknown lint suppressions +# that people with newer toolchains might use +# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) +cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -A clippy::manual-range-contains -D warnings From 5396273541ebcb4b22b5577bfbae29c3a809353f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 14 Mar 2023 16:15:07 +0200 Subject: [PATCH 156/426] Avoid holes between generated image layers (#3771) ## Describe your changes When we perform partitioning of the whole key space, we take in account actual ranges of relation present in the database. So if we have relation with relid=1 and size 100 and relation with relid=2 with size 200 then result of KeySpace::partition may contain partitions <100000000..100000099> and <200000000..200000199>. Generated image layers will contain the same boundaries. But when GC is checking image coverage to find out of old layer is fully covered by newer image layer and so can be deleted, it takes in account only full key range. I.e. 
if there is delta layer <100000000..300000000> then it never be garbage collected because image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.This is how it looks in practice: 000000067F000032AC00000A300000000000-000000067F000032AC00000A330000000000__000000000F761828 000000067F000032AC00000A31000000001F-000000067F000032AC00000A620000000005__0000000001696070-000000000442A551 000000067F000032AC00000A3300FFFFFFFF-000000067F000032AC00000A650100000000__000000000F761828 So there are two image layers covering delta layer but ... there is a hole: A330000000000...A3300FFFFFFFF and as a result delta layer is not collected. ## Issue ticket number and link This PR is deeply related with #3673 because it is addressing the same problem: old layers are not utilized by GC. The test test_gc_old_layers.py in #3673 can be used to see effect of this patch. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --------- Co-authored-by: Joonas Koivunen --- pageserver/src/tenant/timeline.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a8847ec773..fff288c683 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2715,10 +2715,22 @@ impl Timeline { ) -> Result, PageReconstructError> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); + + // We need to avoid holes between generated image layers. 
+ // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one + // image layer with hole between them. In this case such layer can not be utilized by GC. + // + // How such hole between partitions can appear? + // if we have relation with relid=1 and size 100 and relation with relid=2 with size 200 then result of + // KeySpace::partition may contain partitions <100000000..100000099> and <200000000..200000199>. + // If there is delta layer <100000000..300000000> then it never be garbage collected because + // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. + let mut start = Key::MIN; + for partition in partitioning.parts.iter() { + let img_range = start..partition.ranges.last().unwrap().end; + start = img_range.end; if force || self.time_for_new_image_layer(partition, lsn)? { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -2732,7 +2744,6 @@ impl Timeline { "failpoint image-layer-writer-fail-before-finish" ))) }); - for range in &partition.ranges { let mut key = range.start; while key < range.end { From 4a92799f24a6b470b3ba1bddebf228589fca5ba2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 14 Mar 2023 13:10:00 +0200 Subject: [PATCH 157/426] Fix check for trailing garbage in basebackup import. There was a warning for trailing garbage after end-of-tar archive, but it didn't always work. The reason is that we created a StreamReader over the original copyin-stream, but performed the check for garbage on the copyin-stream. There could be some garbage bytes buffered in the StreamReader, which were not caught by the warning. I considered turning the the warning into a fatal error, aborting the import, but I wasn't sure if we handle aborting the import properly. Do we clean up the timeline directory on error? 
If we don't, we should make that more robust, but that's a different story. Also, normally a valid tar archive ends with two 512-byte blocks of zeros. The tokio_tar crate stops at the first all-zeros block. Read and check the second all-zeros block, and error out if it's not there, or contains something unexpected. --- pageserver/src/page_service.rs | 77 +++++++++++++++++++++--------- pageserver/src/tenant.rs | 7 +-- test_runner/regress/test_import.py | 19 +++++++- 3 files changed, 74 insertions(+), 29 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index dc9bf955f7..aad6099952 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -12,7 +12,7 @@ use anyhow::Context; use bytes::Buf; use bytes::Bytes; -use futures::{Stream, StreamExt}; +use futures::Stream; use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -31,6 +31,7 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use tokio_util::io::StreamReader; use tracing::*; use utils::id::ConnectionId; use utils::{ @@ -115,6 +116,49 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream anyhow::Result<()> { + use tokio::io::AsyncReadExt; + let mut buf = [0u8; 512]; + + // Read the all-zeros block, and verify it + let mut total_bytes = 0; + while total_bytes < 512 { + let nbytes = reader.read(&mut buf[total_bytes..]).await?; + total_bytes += nbytes; + if nbytes == 0 { + break; + } + } + if total_bytes < 512 { + anyhow::bail!("incomplete or invalid tar EOF marker"); + } + if !buf.iter().all(|&x| x == 0) { + anyhow::bail!("invalid tar EOF marker"); + } + + // Drain any data after the EOF marker + let mut trailing_bytes = 0; + loop { + let nbytes = reader.read(&mut buf).await?; + trailing_bytes += nbytes; + if nbytes == 0 { + break; + } + } + if trailing_bytes > 0 { + warn!("ignored {trailing_bytes} unexpected bytes 
after the tar archive"); + } + Ok(()) +} + /////////////////////////////////////////////////////////////////////////////// /// @@ -422,19 +466,14 @@ impl PageServerHandler { pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let copyin_reader = StreamReader::new(copyin_stream(pgb)); + tokio::pin!(copyin_reader); timeline - .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx) + .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx) .await?; - // Drain the rest of the Copy data - let mut bytes_after_tar = 0; - while let Some(bytes) = copyin_stream.next().await { - bytes_after_tar += bytes?.len(); - } - if bytes_after_tar > 0 { - warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); - } + // Read the end of the tar archive. + read_tar_eof(copyin_reader).await?; // TODO check checksum // Meanwhile you can verify client-side by taking fullbackup @@ -473,19 +512,13 @@ impl PageServerHandler { info!("importing wal"); pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let mut copyin_stream = Box::pin(copyin_stream(pgb)); - let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); - import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?; + let copyin_reader = StreamReader::new(copyin_stream(pgb)); + tokio::pin!(copyin_reader); + import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); - // Drain the rest of the Copy data - let mut bytes_after_tar = 0; - while let Some(bytes) = copyin_stream.next().await { - bytes_after_tar += bytes?.len(); - } - if bytes_after_tar > 0 { - warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); - } + // Read the end of the tar archive. + read_tar_eof(copyin_reader).await?; // TODO Does it make sense to overshoot? 
if timeline.get_last_record_lsn() < end_lsn { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1fb312fe07..5f1e23b873 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,9 +12,7 @@ //! use anyhow::{bail, Context}; -use bytes::Bytes; use futures::FutureExt; -use futures::Stream; use pageserver_api::models::TimelineState; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; @@ -239,14 +237,13 @@ impl UninitializedTimeline<'_> { /// Prepares timeline data by loading it from the basebackup archive. pub async fn import_basebackup_from_tar( self, - copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), + copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; - let mut reader = tokio_util::io::StreamReader::new(copyin_stream); - import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx) + import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) .await .context("Failed to import basebackup")?; diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 0388e24e98..1dc10fbf4f 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -61,6 +61,12 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build cwd=unpacked_base, ) + # Make copy of base.tar and append some garbage to it. 
+ base_plus_garbage_tar = os.path.join(basebackup_dir, "base-plus-garbage.tar") + shutil.copyfile(base_tar, base_plus_garbage_tar) + with open(base_plus_garbage_tar, "a") as f: + f.write("trailing garbage") + # Get start_lsn and end_lsn with open(os.path.join(basebackup_dir, "backup_manifest")) as f: manifest = json.load(f) @@ -74,7 +80,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Set up pageserver for import neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() - env.pageserver.http_client().tenant_create(tenant) + client = env.pageserver.http_client() + client.tenant_create(tenant) env.pageserver.allowed_errors.extend( [ @@ -85,6 +92,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", ".*InternalServerError.*Cannot delete timeline which has child timelines.*", + ".*ignored .* unexpected bytes after the tar archive.*", ] ) @@ -130,11 +138,18 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build with pytest.raises(Exception): import_tar(corrupt_base_tar, wal_tar) + # A tar with trailing garbage is currently accepted. It prints a warnings + # to the pageserver log, however. Check that. + import_tar(base_plus_garbage_tar, wal_tar) + assert env.pageserver.log_contains( + ".*WARN.*ignored .* unexpected bytes after the tar archive.*" + ) + client.timeline_delete(tenant, timeline) + # Importing correct backup works import_tar(base_tar, wal_tar) # Wait for data to land in s3 - client = env.pageserver.http_client() wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) From 2672fd09d808deaaf1ce6da11d9c22e02570282f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 14 Mar 2023 17:36:10 +0200 Subject: [PATCH 158/426] Make test independent of the order of config lines. 
--- test_runner/regress/test_tenant_relocation.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 247fed846e..aaf33c0d59 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -434,10 +434,13 @@ def test_tenant_relocation( ) # rewrite neon cli config to use new pageserver for basebackup to start new compute - cli_config_lines = (env.repo_dir / "config").read_text().splitlines() - cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" - cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" - (env.repo_dir / "config").write_text("\n".join(cli_config_lines)) + lines = (env.repo_dir / "config").read_text().splitlines() + for i, line in enumerate(lines): + if line.startswith("listen_http_addr"): + lines[i] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" + if line.startswith("listen_pg_addr"): + lines[i] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" + (env.repo_dir / "config").write_text("\n".join(lines)) old_local_path_main = switch_pg_to_new_pageserver( env, @@ -496,7 +499,10 @@ def test_tenant_relocation( # bring old pageserver back for clean shutdown via neon cli # new pageserver will be shut down by the context manager - cli_config_lines = (env.repo_dir / "config").read_text().splitlines() - cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" - cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'" - (env.repo_dir / "config").write_text("\n".join(cli_config_lines)) + lines = (env.repo_dir / "config").read_text().splitlines() + for i, line in enumerate(lines): + if line.startswith("listen_http_addr"): + lines[i] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" + if 
line.startswith("listen_pg_addr"): + lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'" + (env.repo_dir / "config").write_text("\n".join(lines)) From a7ab53c80cea7ab7922ebd2a5a4d71963ffcb3ba Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Mar 2023 11:44:55 +0400 Subject: [PATCH 159/426] Forward framed read buf contents to compute before proxy pass. Otherwise they get lost. Normally buffer is empty before proxy pass, but this is not the case with pipeline mode of our npm driver; fixes connection hangup introduced by b80fe41af3e for it. fixes https://github.com/neondatabase/neon/issues/3822 --- libs/pq_proto/src/framed.rs | 6 +++--- proxy/src/proxy.rs | 27 +++++++++++++++++++++------ proxy/src/stream.rs | 5 +++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 972730cbab..3cdca45009 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -63,9 +63,9 @@ impl Framed { &self.stream } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { - self.stream + /// Deconstruct into the underlying stream and read buffer. + pub fn into_inner(self) -> (S, BytesMut) { + (self.stream, self.read_buf) } /// Return new Framed with stream type transformed by async f, for TLS diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index abeff6a33b..efe0e8795b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -16,7 +16,7 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; @@ -209,9 +209,18 @@ async fn handshake( if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later. - stream = PqStream::new( - stream.into_inner().upgrade(tls.to_server_config()).await?, - ); + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empy. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?); } } _ => bail!(ERR_PROTO_VIOLATION), @@ -443,11 +452,17 @@ impl Client<'_, S> { value: mut node_info, } = auth_result; - let node = connect_to_compute(&mut node_info, params, &extra, &creds) + let mut node = connect_to_compute(&mut node_info, params, &extra, &creds) .or_else(|e| stream.throw_error(e)) .await?; prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?; - proxy_pass(stream.into_inner(), node.stream, &node_info.aux).await + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. 
+ let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + proxy_pass(stream, node.stream, &node_info.aux).await } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 9dfc435e39..7cb292ed58 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,5 +1,6 @@ use crate::error::UserFacingError; use anyhow::bail; +use bytes::BytesMut; use pin_project_lite::pin_project; use pq_proto::framed::{ConnectionError, Framed}; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; @@ -27,8 +28,8 @@ impl PqStream { } } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { + /// Extract the underlying stream and read buffer. + pub fn into_inner(self) -> (S, BytesMut) { self.framed.into_inner() } From 10a5d36af80984d2314fa0d372a4b56574f69cac Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 15 Mar 2023 13:52:29 +0200 Subject: [PATCH 160/426] Separate mgmt and libpq authentication configs in pageserver. (#3773) This makes it possible to enable authentication only for the mgmt HTTP API or the compute API. The HTTP API doesn't need to be directly accessible from compute nodes, and it can be secured through network policies. This also allows rolling out authentication in a piecemeal fashion. 
--- control_plane/safekeepers.conf | 3 +- control_plane/simple.conf | 3 +- control_plane/src/bin/neon_local.rs | 7 ++-- control_plane/src/compute.rs | 2 +- control_plane/src/local_env.rs | 9 +++-- control_plane/src/pageserver.rs | 22 ++++++++---- docs/authentication.md | 8 +++-- pageserver/src/bin/pageserver.rs | 40 ++++++++++++++------- pageserver/src/config.rs | 43 ++++++++++++++++------- pageserver/src/http/routes.rs | 1 + test_runner/fixtures/neon_fixtures.py | 6 ++-- test_runner/regress/test_compatibility.py | 7 ++++ 12 files changed, 107 insertions(+), 44 deletions(-) diff --git a/control_plane/safekeepers.conf b/control_plane/safekeepers.conf index df7dd2adca..576cc4a3a9 100644 --- a/control_plane/safekeepers.conf +++ b/control_plane/safekeepers.conf @@ -2,7 +2,8 @@ [pageserver] listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' -auth_type = 'Trust' +pg_auth_type = 'Trust' +http_auth_type = 'Trust' [[safekeepers]] id = 1 diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 6014e8dffd..243e13f3d3 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -3,7 +3,8 @@ [pageserver] listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' -auth_type = 'Trust' +pg_auth_type = 'Trust' +http_auth_type = 'Trust' [[safekeepers]] id = 1 diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 49b1d31dbc..a9b66f479a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -53,14 +53,15 @@ listen_addr = '{DEFAULT_BROKER_ADDR}' id = {DEFAULT_PAGESERVER_ID} listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}' listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}' -auth_type = '{pageserver_auth_type}' +pg_auth_type = '{trust_auth}' +http_auth_type = '{trust_auth}' [[safekeepers]] id = {DEFAULT_SAFEKEEPER_ID} pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} "#, - pageserver_auth_type = AuthType::Trust, + 
trust_auth = AuthType::Trust, ) } @@ -627,7 +628,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let node = cplane.nodes.get(&(tenant_id, node_name.to_string())); - let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) { + let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index b7029aabc5..094d2add8d 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -97,7 +97,7 @@ impl ComputeControlPlane { }); node.create_pgdata()?; - node.setup_pg_conf(self.env.pageserver.auth_type)?; + node.setup_pg_conf(self.env.pageserver.pg_auth_type)?; self.nodes .insert((tenant_id, node.name.clone()), Arc::clone(&node)); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 09180d96c4..630f8bb664 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -110,12 +110,14 @@ impl NeonBroker { pub struct PageServerConf { // node id pub id: NodeId, + // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, - // used to determine which auth type is used - pub auth_type: AuthType, + // auth type used for the PG and HTTP ports + pub pg_auth_type: AuthType, + pub http_auth_type: AuthType, // jwt auth token used for communication with pageserver pub auth_token: String, @@ -127,7 +129,8 @@ impl Default for PageServerConf { id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), - auth_type: AuthType::Trust, + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, auth_token: String::new(), } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 4b7180c250..07ead45d5b 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -82,7 +82,7 @@ 
impl PageServerNode { let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr) .expect("Unable to parse listen_pg_addr"); let port = port.unwrap_or(5432); - let password = if env.pageserver.auth_type == AuthType::NeonJWT { + let password = if env.pageserver.pg_auth_type == AuthType::NeonJWT { Some(env.pageserver.auth_token.clone()) } else { None @@ -106,25 +106,32 @@ impl PageServerNode { self.env.pg_distrib_dir_raw().display() ); - let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); + let http_auth_type_param = + format!("http_auth_type='{}'", self.env.pageserver.http_auth_type); let listen_http_addr_param = format!( "listen_http_addr='{}'", self.env.pageserver.listen_http_addr ); + + let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type); let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); + let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); let mut overrides = vec![ id, pg_distrib_dir_param, - authg_type_param, + http_auth_type_param, + pg_auth_type_param, listen_http_addr_param, listen_pg_addr_param, broker_endpoint_param, ]; - if self.env.pageserver.auth_type != AuthType::Trust { + if self.env.pageserver.http_auth_type != AuthType::Trust + || self.env.pageserver.pg_auth_type != AuthType::Trust + { overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned()); } overrides @@ -247,7 +254,10 @@ impl PageServerNode { } fn pageserver_env_variables(&self) -> anyhow::Result> { - Ok(if self.env.pageserver.auth_type != AuthType::Trust { + // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper + // needs a token, and how to generate that token, seems independent to whether + // the pageserver requires a token in incoming requests. 
+ Ok(if self.env.pageserver.http_auth_type != AuthType::Trust { // Generate a token to connect from the pageserver to a safekeeper let token = self .env @@ -283,7 +293,7 @@ impl PageServerNode { fn http_request(&self, method: Method, url: U) -> RequestBuilder { let mut builder = self.http_client.request(method, url); - if self.env.pageserver.auth_type == AuthType::NeonJWT { + if self.env.pageserver.http_auth_type == AuthType::NeonJWT { builder = builder.bearer_auth(&self.env.pageserver.auth_token) } builder diff --git a/docs/authentication.md b/docs/authentication.md index 1637519211..e6b5fa5707 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -137,10 +137,12 @@ Each compute should present a token valid for the timeline's tenant. Pageserver also has HTTP API: some parts are per-tenant, some parts are server-wide, these are different scopes. -The `auth_type` configuration variable in Pageserver's config may have -either of three values: +Authentication can be enabled separately for the HTTP mgmt API, and +for the libpq connections from compute. The `http_auth_type` and +`pg_auth_type` configuration variables in Pageserver's config may +have one of these values: -* `Trust` removes all authentication. The outdated `MD5` value does likewise +* `Trust` removes all authentication. * `NeonJWT` enables JWT validation. Tokens are validated using the public key which lies in a PEM file specified in the `auth_validation_public_key_path` config. 
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 564a3de82c..14e86ddcb6 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -270,15 +270,31 @@ fn start_pageserver( WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; // Initialize authentication for incoming connections - let auth = match &conf.auth_type { - AuthType::Trust => None, - AuthType::NeonJWT => { - // unwrap is ok because check is performed when creating config, so path is set and file exists - let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); - Some(JwtAuth::from_key_path(key_path)?.into()) - } - }; - info!("Using auth: {:#?}", conf.auth_type); + let http_auth; + let pg_auth; + if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { + // unwrap is ok because check is performed when creating config, so path is set and file exists + let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); + info!( + "Loading public key for verifying JWT tokens from {:#?}", + key_path + ); + let auth: Arc = Arc::new(JwtAuth::from_key_path(key_path)?); + + http_auth = match &conf.http_auth_type { + AuthType::Trust => None, + AuthType::NeonJWT => Some(auth.clone()), + }; + pg_auth = match &conf.pg_auth_type { + AuthType::Trust => None, + AuthType::NeonJWT => Some(auth), + }; + } else { + http_auth = None; + pg_auth = None; + } + info!("Using auth for http API: {:#?}", conf.http_auth_type); + info!("Using auth for pg connections: {:#?}", conf.pg_auth_type); match var("NEON_AUTH_TOKEN") { Ok(v) => { @@ -308,7 +324,7 @@ fn start_pageserver( { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, launch_ts, auth.clone(), remote_storage)? + let router = http::make_router(conf, launch_ts, http_auth, remote_storage)? 
.build() .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); @@ -382,9 +398,9 @@ fn start_pageserver( async move { page_service::libpq_listener_main( conf, - auth, + pg_auth, pageserver_listener, - conf.auth_type, + conf.pg_auth_type, libpq_ctx, ) .await diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index fde889d01a..d17f0bc143 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -138,9 +138,15 @@ pub struct PageServerConf { pub pg_distrib_dir: PathBuf, - pub auth_type: AuthType, - + // Authentication + /// authentication method for the HTTP mgmt API + pub http_auth_type: AuthType, + /// authentication method for libpq connections from compute + pub pg_auth_type: AuthType, + /// Path to a file containing public key for verifying JWT tokens. + /// Used for both mgmt and compute auth, if enabled. pub auth_validation_public_key_path: Option, + pub remote_storage_config: Option, pub default_tenant_conf: TenantConf, @@ -208,7 +214,8 @@ struct PageServerConfigBuilder { pg_distrib_dir: BuilderValue, - auth_type: BuilderValue, + http_auth_type: BuilderValue, + pg_auth_type: BuilderValue, // auth_validation_public_key_path: BuilderValue>, @@ -251,7 +258,8 @@ impl Default for PageServerConfigBuilder { pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") .join("pg_install")), - auth_type: Set(AuthType::Trust), + http_auth_type: Set(AuthType::Trust), + pg_auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), id: NotSet, @@ -323,8 +331,12 @@ impl PageServerConfigBuilder { self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) } - pub fn auth_type(&mut self, auth_type: AuthType) { - self.auth_type = BuilderValue::Set(auth_type) + pub fn http_auth_type(&mut self, auth_type: AuthType) { + self.http_auth_type = BuilderValue::Set(auth_type) + } + + pub fn pg_auth_type(&mut self, auth_type: AuthType) { + 
self.pg_auth_type = BuilderValue::Set(auth_type) } pub fn auth_validation_public_key_path( @@ -419,7 +431,10 @@ impl PageServerConfigBuilder { pg_distrib_dir: self .pg_distrib_dir .ok_or(anyhow!("missing pg_distrib_dir"))?, - auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?, + http_auth_type: self + .http_auth_type + .ok_or(anyhow!("missing http_auth_type"))?, + pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?, auth_validation_public_key_path: self .auth_validation_public_key_path .ok_or(anyhow!("missing auth_validation_public_key_path"))?, @@ -612,7 +627,8 @@ impl PageServerConf { "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( PathBuf::from(parse_toml_string(key, item)?), )), - "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), + "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), + "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) 
} @@ -647,7 +663,7 @@ impl PageServerConf { let mut conf = builder.build().context("invalid config")?; - if conf.auth_type == AuthType::NeonJWT { + if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path .get_or_insert_with(|| workdir.join("auth_public_key.pem")); @@ -766,7 +782,8 @@ impl PageServerConf { superuser: "cloud_admin".to_string(), workdir: repo_dir, pg_distrib_dir, - auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + pg_auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, default_tenant_conf: TenantConf::default(), @@ -951,7 +968,8 @@ log_format = 'json' max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, workdir, pg_distrib_dir, - auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + pg_auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, default_tenant_conf: TenantConf::default(), @@ -1008,7 +1026,8 @@ log_format = 'json' max_file_descriptors: 333, workdir, pg_distrib_dir, - auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + pg_auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, default_tenant_conf: TenantConf::default(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 111bc480c4..9faa994f16 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -82,6 +82,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { get_state(request).conf } +/// Check that the requester is authorized to operate on given tenant fn check_permission(request: &Request, tenant_id: Option) -> Result<(), ApiError> { check_permission_with(request, |claims| { crate::auth::check_permission(claims, tenant_id) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c5e260a962..8a64be51f1 
100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -924,7 +924,8 @@ class NeonEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust" + http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" + pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" toml += textwrap.dedent( f""" @@ -932,7 +933,8 @@ class NeonEnv: id=1 listen_pg_addr = 'localhost:{pageserver_port.pg}' listen_http_addr = 'localhost:{pageserver_port.http}' - auth_type = '{pageserver_auth_type}' + pg_auth_type = '{pg_auth_type}' + http_auth_type = '{http_auth_type}' """ ) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 731e78a3e3..66625dd6f8 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -246,6 +246,13 @@ def prepare_snapshot( if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version + # Older pageserver versions had just one `auth_type` setting. Now there + # are separate settings for pg and http ports. We don't use authentication + # in compatibility tests so just remove authentication related settings. 
+ pageserver_config.pop("auth_type", None) + pageserver_config.pop("pg_auth_type", None) + pageserver_config.pop("http_auth_type", None) + if pg_distrib_dir: pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) From cd17802b1f20b2066546e143f48dd348ce2ae86c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 15 Mar 2023 16:18:38 +0300 Subject: [PATCH 161/426] Add pg_clients test for serverless driver (#3827) Fixes #3819 --- test_runner/pg_clients/README.md | 10 ++++ test_runner/pg_clients/test_pg_clients.py | 1 + .../serverless-driver/.dockerignore | 1 + .../typescript/serverless-driver/.gitignore | 1 + .../typescript/serverless-driver/Dockerfile | 7 +++ .../typescript/serverless-driver/index.js | 20 ++++++++ .../serverless-driver/package-lock.json | 51 +++++++++++++++++++ .../typescript/serverless-driver/package.json | 7 +++ 8 files changed, 98 insertions(+) create mode 100644 test_runner/pg_clients/README.md create mode 100644 test_runner/pg_clients/typescript/serverless-driver/.dockerignore create mode 100644 test_runner/pg_clients/typescript/serverless-driver/.gitignore create mode 100644 test_runner/pg_clients/typescript/serverless-driver/Dockerfile create mode 100755 test_runner/pg_clients/typescript/serverless-driver/index.js create mode 100644 test_runner/pg_clients/typescript/serverless-driver/package-lock.json create mode 100644 test_runner/pg_clients/typescript/serverless-driver/package.json diff --git a/test_runner/pg_clients/README.md b/test_runner/pg_clients/README.md new file mode 100644 index 0000000000..dc316a17ef --- /dev/null +++ b/test_runner/pg_clients/README.md @@ -0,0 +1,10 @@ +# pg_clients + +To run a single test locally: + +```bash +export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb + +# will filter only tests with "serverless" in the name +./scripts/pytest -m remote_cluster -k serverless +``` diff --git a/test_runner/pg_clients/test_pg_clients.py 
b/test_runner/pg_clients/test_pg_clients.py index f44fdd1724..18e3f379da 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -22,6 +22,7 @@ from fixtures.utils import subprocess_capture ), "swift/PostgresNIOExample", "typescript/postgresql-client", + "typescript/serverless-driver", ], ) def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: str): diff --git a/test_runner/pg_clients/typescript/serverless-driver/.dockerignore b/test_runner/pg_clients/typescript/serverless-driver/.dockerignore new file mode 100644 index 0000000000..c2658d7d1b --- /dev/null +++ b/test_runner/pg_clients/typescript/serverless-driver/.dockerignore @@ -0,0 +1 @@ +node_modules/ diff --git a/test_runner/pg_clients/typescript/serverless-driver/.gitignore b/test_runner/pg_clients/typescript/serverless-driver/.gitignore new file mode 100644 index 0000000000..c2658d7d1b --- /dev/null +++ b/test_runner/pg_clients/typescript/serverless-driver/.gitignore @@ -0,0 +1 @@ +node_modules/ diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile new file mode 100644 index 0000000000..a5ad832a5c --- /dev/null +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -0,0 +1,7 @@ +FROM node:18 +WORKDIR /source + +COPY . . +RUN npm clean-install + +CMD ["/source/index.js"] diff --git a/test_runner/pg_clients/typescript/serverless-driver/index.js b/test_runner/pg_clients/typescript/serverless-driver/index.js new file mode 100755 index 0000000000..91db4037e1 --- /dev/null +++ b/test_runner/pg_clients/typescript/serverless-driver/index.js @@ -0,0 +1,20 @@ +#! 
/usr/bin/env node + +import { Client } from '@neondatabase/serverless' + +(async () => { + const client = new Client({ + host: process.env.NEON_HOST, + database: process.env.NEON_DATABASE, + user: process.env.NEON_USER, + password: process.env.NEON_PASSWORD, + }); + client.connect(); + const result = await client.query({ + text: 'select 1', + rowMode: 'array', + }); + const rows = result.rows; + await client.end(); + console.log(rows[0][0]); +})() diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json new file mode 100644 index 0000000000..c4d03bffcb --- /dev/null +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -0,0 +1,51 @@ +{ + "name": "serverless-driver", + "lockfileVersion": 2, + "requires": true, + "packages": { + "": { + "dependencies": { + "@neondatabase/serverless": "^0.2.8", + "ws": "^8.13.0" + } + }, + "node_modules/@neondatabase/serverless": { + "version": "0.2.8", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz", + "integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA==" + }, + "node_modules/ws": { + "version": "8.13.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", + "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + } + }, + "dependencies": { + "@neondatabase/serverless": { + "version": "0.2.8", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.2.8.tgz", + "integrity": "sha512-+yWjIOJsFnrtt2xvtLVEzWM2lfvemawk/DBg4mD2cZOF/IC6Jn4wEctZyk60TscZMSxfozNkPoxmZvBmNuQ0vA==" + }, + "ws": 
{ + "version": "8.13.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", + "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", + "requires": {} + } + } +} diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json new file mode 100644 index 0000000000..2b01a22d3e --- /dev/null +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -0,0 +1,7 @@ +{ + "type": "module", + "dependencies": { + "@neondatabase/serverless": "^0.2.8", + "ws": "^8.13.0" + } +} From f1d960d2c2e88bd6e92d249b86bc0f6b87d9c3cf Mon Sep 17 00:00:00 2001 From: Rahul Patil Date: Wed, 15 Mar 2023 16:58:28 +0100 Subject: [PATCH 162/426] Add new pageserver to eu-central-1 (#3829) ## Describe your changes ## Issue ticket number and link ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- .github/ansible/prod.eu-central-1.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 2236dcbc06..b3cd5de01c 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -27,6 +27,8 @@ storage: ansible_host: i-0cd8d316ecbb715be pageserver-1.eu-central-1.aws.neon.tech: ansible_host: i-090044ed3d383fef0 + pageserver-2.eu-central-1.aws.neon.tech: + ansible_host: i-033584edf3f4b6742 safekeepers: hosts: From 768c8d9972b7064b6e50b7d421506b76b6074221 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 16 Mar 2023 11:03:21 +0200 Subject: [PATCH 163/426] test: allow gc to get unlucky (#3826) this failure case was probably introduced by b220ba6, because earlier the gc would always have run fast enough for restart every 1s. however, test got added later, so we have just been lucky. fixes #3824 by allowing this error to happen. --- .../regress/test_pageserver_restarts_under_workload.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index 28159778fe..eab8b112f0 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -17,6 +17,12 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB n_restarts = 10 scale = 10 + # the background task may complete the init task delay after finding an + # active tenant, but shutdown starts right before Tenant::gc_iteration + env.pageserver.allowed_errors.append( + r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant" + ) + def run_pgbench(pg: Postgres): connstr = pg.connstr() log.info(f"Start a pgbench workload on pg {connstr}") From b067378d0d4f2e5ba39fd4ddf8bcedb6ef75ce98 Mon Sep 17 00:00:00 2001 From: 
Arthur Petukhovsky Date: Thu, 16 Mar 2023 16:24:01 +0200 Subject: [PATCH 164/426] Measure cross-AZ traffic in safekeepers (#3806) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create `safekeeper_pg_io_bytes_total` metric to track the total number of bytes written/read in postgres connections to safekeepers. This metric has the following labels: - `client_az` – availability zone of the connection initiator, or `"unknown"` - `sk_az` – availability zone of the safekeeper, or `"unknown"` - `app_name` – `application_name` of the postgres client - `dir` – data direction, either `"read"` or `"write"` - `same_az` – `"true"`, `"false"` or `"unknown"`. Can be derived from `client_az` and `sk_az`, exists purely for convenience. This is implemented by passing availability zone in the connection string, like this: `-c tenant_id=AAA timeline_id=BBB availability_zone=AZ-1`. Update ansible deployment scripts to add availability_zone argument to safekeeper and pageserver in systemd service files. 
--- .github/ansible/deploy.yaml | 18 +++ .github/ansible/systemd/pageserver.service | 2 +- .github/ansible/systemd/safekeeper.service | 2 +- control_plane/src/safekeeper.rs | 6 + pageserver/src/config.rs | 17 +++ pageserver/src/tenant/timeline.rs | 1 + .../walreceiver/connection_manager.rs | 18 ++- safekeeper/src/bin/safekeeper.rs | 4 + safekeeper/src/handler.rs | 13 +- safekeeper/src/lib.rs | 2 + safekeeper/src/metrics.rs | 131 +++++++++++++++++- safekeeper/src/wal_service.rs | 19 +-- test_runner/regress/test_tenants.py | 47 +++++++ 13 files changed, 263 insertions(+), 17 deletions(-) diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 0243e91f37..d4c1dec8ea 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -91,6 +91,15 @@ tags: - pageserver + # used in `pageserver.service` template + - name: learn current availability_zone + shell: + cmd: "curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone" + register: ec2_availability_zone + + - set_fact: + ec2_availability_zone={{ ec2_availability_zone.stdout }} + - name: upload systemd service definition ansible.builtin.template: src: systemd/pageserver.service @@ -153,6 +162,15 @@ tags: - safekeeper + # used in `safekeeper.service` template + - name: learn current availability_zone + shell: + cmd: "curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone" + register: ec2_availability_zone + + - set_fact: + ec2_availability_zone={{ ec2_availability_zone.stdout }} + # in the future safekeepers should discover pageservers byself # but currently use first pageserver that was discovered - name: set first pageserver var for safekeepers diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service index 4570a666fa..177fc9f9d2 100644 --- a/.github/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple 
User=pageserver Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} SENTRY_ENVIRONMENT={{ sentry_environment }} -ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data +ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -c "availability_zone='{{ ec2_availability_zone }}'" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index d7d8d26b1a..a8ef4bc0f5 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} SENTRY_ENVIRONMENT={{ sentry_environment }} -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' --availability-zone={{ 
ec2_availability_zone }} ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 4c0812a5e3..228968be80 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -115,6 +115,10 @@ impl SafekeeperNode { let datadir = self.datadir_path(); let id_string = id.to_string(); + // TODO: add availability_zone to the config. + // Right now we just specify any value here and use it to check metrics in tests. + let availability_zone = format!("sk-{}", id_string); + let mut args = vec![ "-D", datadir.to_str().with_context(|| { @@ -126,6 +130,8 @@ impl SafekeeperNode { &listen_pg, "--listen-http", &listen_http, + "--availability-zone", + &availability_zone, ]; if !self.conf.sync { args.push("--no-sync"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index d17f0bc143..16f35355af 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -118,6 +118,9 @@ pub struct PageServerConf { /// Example (default): 127.0.0.1:9898 pub listen_http_addr: String, + /// Current availability zone. Used for traffic metrics. + pub availability_zone: Option, + // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call. pub wait_lsn_timeout: Duration, // How long to wait for WAL redo to complete. 
@@ -202,6 +205,8 @@ struct PageServerConfigBuilder { listen_http_addr: BuilderValue, + availability_zone: BuilderValue>, + wait_lsn_timeout: BuilderValue, wal_redo_timeout: BuilderValue, @@ -247,6 +252,7 @@ impl Default for PageServerConfigBuilder { Self { listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), + availability_zone: Set(None), wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) .expect("cannot parse default wait lsn timeout")), wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) @@ -303,6 +309,10 @@ impl PageServerConfigBuilder { self.listen_http_addr = BuilderValue::Set(listen_http_addr) } + pub fn availability_zone(&mut self, availability_zone: Option) { + self.availability_zone = BuilderValue::Set(availability_zone) + } + pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) } @@ -414,6 +424,9 @@ impl PageServerConfigBuilder { listen_http_addr: self .listen_http_addr .ok_or(anyhow!("missing listen_http_addr"))?, + availability_zone: self + .availability_zone + .ok_or(anyhow!("missing availability_zone"))?, wait_lsn_timeout: self .wait_lsn_timeout .ok_or(anyhow!("missing wait_lsn_timeout"))?, @@ -614,6 +627,7 @@ impl PageServerConf { match key { "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), + "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)), "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), @@ -779,6 +793,7 @@ impl PageServerConf { max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, listen_pg_addr: 
defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + availability_zone: None, superuser: "cloud_admin".to_string(), workdir: repo_dir, pg_distrib_dir, @@ -961,6 +976,7 @@ log_format = 'json' id: NodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + availability_zone: None, wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, superuser: defaults::DEFAULT_SUPERUSER.to_string(), @@ -1019,6 +1035,7 @@ log_format = 'json' id: NodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), + availability_zone: None, wait_lsn_timeout: Duration::from_secs(111), wal_redo_timeout: Duration::from_secs(111), superuser: "zzzz".to_string(), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fff288c683..0c42ed3079 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1335,6 +1335,7 @@ impl Timeline { lagging_wal_timeout, max_lsn_wal_lag, crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + self.conf.availability_zone.clone(), background_ctx, ); } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 64a79b6d1b..0c770136db 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -45,6 +45,7 @@ pub fn spawn_connection_manager_task( lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, + availability_zone: Option, ctx: RequestContext, ) { let mut broker_client = get_broker_client().clone(); @@ -67,6 +68,7 @@ pub fn spawn_connection_manager_task( lagging_wal_timeout, max_lsn_wal_lag, auth_token, 
+ availability_zone, ); loop { select! { @@ -334,6 +336,7 @@ struct WalreceiverState { /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id. wal_stream_candidates: HashMap, auth_token: Option>, + availability_zone: Option, } /// Current connection data. @@ -381,6 +384,7 @@ impl WalreceiverState { lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, + availability_zone: Option, ) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_id, @@ -396,6 +400,7 @@ impl WalreceiverState { wal_stream_candidates: HashMap::new(), wal_connection_retries: HashMap::new(), auth_token, + availability_zone, } } @@ -740,6 +745,7 @@ impl WalreceiverState { None => None, Some(x) => Some(x), }, + self.availability_zone.as_deref(), ) { Ok(connstr) => Some((*sk_id, info, connstr)), Err(e) => { @@ -824,17 +830,24 @@ fn wal_stream_connection_config( }: TenantTimelineId, listen_pg_addr_str: &str, auth_token: Option<&str>, + availability_zone: Option<&str>, ) -> anyhow::Result { let (host, port) = parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; let port = port.unwrap_or(5432); - Ok(PgConnectionConfig::new_host_port(host, port) + let mut connstr = PgConnectionConfig::new_host_port(host, port) .extend_options([ "-c".to_owned(), format!("timeline_id={}", timeline_id), format!("tenant_id={}", tenant_id), ]) - .set_password(auth_token.map(|s| s.to_owned()))) + .set_password(auth_token.map(|s| s.to_owned())); + + if let Some(availability_zone) = availability_zone { + connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]); + } + + Ok(connstr) } #[cfg(test)] @@ -1273,6 +1286,7 @@ mod tests { wal_stream_candidates: HashMap::new(), wal_connection_retries: HashMap::new(), auth_token: None, + availability_zone: None, } } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 
d2cb9f79b9..848b955af8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -71,6 +71,9 @@ struct Args { /// Listen http endpoint for management and metrics in the form host:port. #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)] listen_http: String, + /// Availability zone of the safekeeper. + #[arg(long)] + availability_zone: Option, /// Do not wait for changes to be written safely to disk. Unsafe. #[arg(short, long)] no_sync: bool, @@ -166,6 +169,7 @@ fn main() -> anyhow::Result<()> { my_id: id, listen_pg_addr: args.listen_pg, listen_http_addr: args.listen_http, + availability_zone: args.availability_zone, no_sync: args.no_sync, broker_endpoint: args.broker_endpoint, broker_keepalive_interval: args.broker_keepalive_interval, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 3b8434b2de..a589fe1869 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -9,6 +9,7 @@ use tracing::{info, info_span, Instrument}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; +use crate::metrics::TrafficMetrics; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::QueryError; @@ -33,6 +34,7 @@ pub struct SafekeeperPostgresHandler { /// Unique connection id is logged in spans for observability. pub conn_id: ConnectionId, claims: Option, + io_metrics: Option, } /// Parsed Postgres command. 
@@ -94,6 +96,11 @@ impl postgres_backend::Handler format!("Failed to parse {value} as timeline id") })?); } + Some(("availability_zone", client_az)) => { + if let Some(metrics) = self.io_metrics.as_ref() { + metrics.set_client_az(client_az) + } + } _ => continue, } } @@ -101,6 +108,9 @@ impl postgres_backend::Handler if let Some(app_name) = params.get("application_name") { self.appname = Some(app_name.to_owned()); + if let Some(metrics) = self.io_metrics.as_ref() { + metrics.set_app_name(app_name) + } } Ok(()) @@ -187,7 +197,7 @@ impl postgres_backend::Handler } impl SafekeeperPostgresHandler { - pub fn new(conf: SafeKeeperConf, conn_id: u32) -> Self { + pub fn new(conf: SafeKeeperConf, conn_id: u32, io_metrics: Option) -> Self { SafekeeperPostgresHandler { conf, appname: None, @@ -196,6 +206,7 @@ impl SafekeeperPostgresHandler { ttid: TenantTimelineId::empty(), conn_id, claims: None, + io_metrics, } } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index f4e753cdbf..2c28c5218d 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -52,6 +52,7 @@ pub struct SafeKeeperConf { pub my_id: NodeId, pub listen_pg_addr: String, pub listen_http_addr: String, + pub availability_zone: Option, pub no_sync: bool, pub broker_endpoint: Uri, pub broker_keepalive_interval: Duration, @@ -82,6 +83,7 @@ impl SafeKeeperConf { no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + availability_zone: None, remote_storage: None, my_id: NodeId(0), broker_endpoint: storage_broker::DEFAULT_ENDPOINT diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 16aca24927..c3077b6dc5 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,15 +1,19 @@ //! Global safekeeper mertics and per-timeline safekeeper metrics. 
-use std::time::{Instant, SystemTime}; +use std::{ + sync::{Arc, RwLock}, + time::{Instant, SystemTime}, +}; use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; use anyhow::Result; use metrics::{ - core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, + core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, register_int_counter_vec, Gauge, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; + use postgres_ffi::XLogSegNo; use utils::{id::TenantTimelineId, lsn::Lsn}; @@ -63,13 +67,132 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { }); pub static PG_IO_BYTES: Lazy = Lazy::new(|| { register_int_counter_vec!( - "safekeeper_pg_io_bytes", + "safekeeper_pg_io_bytes_total", "Bytes read from or written to any PostgreSQL connection", - &["direction"] + &["client_az", "sk_az", "app_name", "dir", "same_az"] ) .expect("Failed to register safekeeper_pg_io_bytes gauge") }); +pub const LABEL_UNKNOWN: &str = "unknown"; + +/// Labels for traffic metrics. +#[derive(Clone)] +struct ConnectionLabels { + /// Availability zone of the connection origin. + client_az: String, + /// Availability zone of the current safekeeper. + sk_az: String, + /// Client application name. 
+ app_name: String, +} + +impl ConnectionLabels { + fn new() -> Self { + Self { + client_az: LABEL_UNKNOWN.to_string(), + sk_az: LABEL_UNKNOWN.to_string(), + app_name: LABEL_UNKNOWN.to_string(), + } + } + + fn build_metrics( + &self, + ) -> ( + GenericCounter, + GenericCounter, + ) { + let same_az = match (self.client_az.as_str(), self.sk_az.as_str()) { + (LABEL_UNKNOWN, _) | (_, LABEL_UNKNOWN) => LABEL_UNKNOWN, + (client_az, sk_az) => { + if client_az == sk_az { + "true" + } else { + "false" + } + } + }; + + let read = PG_IO_BYTES.with_label_values(&[ + &self.client_az, + &self.sk_az, + &self.app_name, + "read", + same_az, + ]); + let write = PG_IO_BYTES.with_label_values(&[ + &self.client_az, + &self.sk_az, + &self.app_name, + "write", + same_az, + ]); + (read, write) + } +} + +struct TrafficMetricsState { + /// Labels for traffic metrics. + labels: ConnectionLabels, + /// Total bytes read from this connection. + read: GenericCounter, + /// Total bytes written to this connection. + write: GenericCounter, +} + +/// Metrics for measuring traffic (r/w bytes) in a single PostgreSQL connection. 
+#[derive(Clone)] +pub struct TrafficMetrics { + state: Arc>, +} + +impl Default for TrafficMetrics { + fn default() -> Self { + Self::new() + } +} + +impl TrafficMetrics { + pub fn new() -> Self { + let labels = ConnectionLabels::new(); + let (read, write) = labels.build_metrics(); + let state = TrafficMetricsState { + labels, + read, + write, + }; + Self { + state: Arc::new(RwLock::new(state)), + } + } + + pub fn set_client_az(&self, value: &str) { + let mut state = self.state.write().unwrap(); + state.labels.client_az = value.to_string(); + (state.read, state.write) = state.labels.build_metrics(); + } + + pub fn set_sk_az(&self, value: &str) { + let mut state = self.state.write().unwrap(); + state.labels.sk_az = value.to_string(); + (state.read, state.write) = state.labels.build_metrics(); + } + + pub fn set_app_name(&self, value: &str) { + let mut state = self.state.write().unwrap(); + state.labels.app_name = value.to_string(); + (state.read, state.write) = state.labels.build_metrics(); + } + + pub fn observe_read(&self, cnt: usize) { + self.state.read().unwrap().read.inc_by(cnt as u64) + } + + pub fn observe_write(&self, cnt: usize) { + self.state.read().unwrap().write.inc_by(cnt as u64) + } +} + /// Metrics for WalStorage in a single timeline. #[derive(Clone, Default)] pub struct WalStorageMetrics { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 5f58c4f7fc..22f50c3428 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -9,8 +9,9 @@ use tokio::net::TcpStream; use tracing::*; use utils::measured_stream::MeasuredStream; +use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::TrafficMetrics; use crate::SafeKeeperConf; -use crate::{handler::SafekeeperPostgresHandler, metrics::PG_IO_BYTES}; use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. 
@@ -68,20 +69,21 @@ fn handle_socket( .build()?; let local = tokio::task::LocalSet::new(); - let read_metrics = PG_IO_BYTES.with_label_values(&["read"]); - let write_metrics = PG_IO_BYTES.with_label_values(&["write"]); - socket.set_nodelay(true)?; let peer_addr = socket.peer_addr()?; - // TODO: measure cross-az traffic + let traffic_metrics = TrafficMetrics::new(); + if let Some(current_az) = conf.availability_zone.as_deref() { + traffic_metrics.set_sk_az(current_az); + } + let socket = MeasuredStream::new( socket, |cnt| { - read_metrics.inc_by(cnt as u64); + traffic_metrics.observe_read(cnt); }, |cnt| { - write_metrics.inc_by(cnt as u64); + traffic_metrics.observe_write(cnt); }, ); @@ -89,7 +91,8 @@ fn handle_socket( None => AuthType::Trust, Some(_) => AuthType::NeonJWT, }; - let mut conn_handler = SafekeeperPostgresHandler::new(conf, conn_id); + let mut conn_handler = + SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index bf87cb3ad4..8021bf9914 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -3,6 +3,7 @@ import shutil import time from contextlib import closing from datetime import datetime +from itertools import chain from pathlib import Path from typing import List @@ -87,6 +88,7 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + neon_env_builder.pageserver_config_override = "availability_zone='test_ps_az'" env = neon_env_builder.init_start() tenant_1, _ = env.neon_cli.create_tenant() @@ -122,6 +124,17 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): ps_metrics = all_metrics[0] sk_metrics = all_metrics[1:] + # Find all metrics among all safekeepers, accepts the same arguments as query_all() + def query_all_safekeepers(name, filter): + return list( + chain.from_iterable( + map( + lambda sk: sk.query_all(name, filter), + sk_metrics, + ) + ) + ) + ttids = [ {"tenant_id": str(tenant_1), "timeline_id": str(timeline_1)}, {"tenant_id": str(tenant_2), "timeline_id": str(timeline_2)}, @@ -162,6 +175,40 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" ) + for io_direction in ["read", "write"]: + # Querying all metrics for number of bytes read/written by pageserver in another AZ + io_metrics = query_all_safekeepers( + "safekeeper_pg_io_bytes_total", + { + "app_name": "pageserver", + "client_az": "test_ps_az", + "dir": io_direction, + "same_az": "false", + }, + ) + total_bytes = sum(int(metric.value) for metric in io_metrics) + log.info(f"Pageserver {io_direction} bytes from another AZ: {total_bytes}") + # We expect some bytes to be read/written, to make sure metrics are working + assert 
total_bytes > 0 + + # Test (a subset of) safekeeper global metrics + for sk_m in sk_metrics: + # Test that every safekeeper has read some bytes + assert any( + map( + lambda x: x.value > 0, + sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "read"}), + ) + ), f"{sk_m.name} has not read bytes" + + # Test that every safekeeper has written some bytes + assert any( + map( + lambda x: x.value > 0, + sk_m.query_all("safekeeper_pg_io_bytes_total", {"dir": "write"}), + ) + ), f"{sk_m.name} has not written bytes" + # Test (a subset of) pageserver global metrics for metric in PAGESERVER_GLOBAL_METRICS: ps_samples = ps_metrics.query_all(metric, {}) From b917270c676142e1017ddae89e981b60cf072ed7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 16 Mar 2023 15:47:37 +0100 Subject: [PATCH 165/426] remove unused TenantConfig::update function --- pageserver/src/tenant/config.rs | 42 --------------------------------- 1 file changed, 42 deletions(-) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 1a52b26ae7..48cb6be121 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -222,48 +222,6 @@ impl TenantConfOpt { eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), } } - - pub fn update(&mut self, other: &TenantConfOpt) { - if let Some(checkpoint_distance) = other.checkpoint_distance { - self.checkpoint_distance = Some(checkpoint_distance); - } - if let Some(checkpoint_timeout) = other.checkpoint_timeout { - self.checkpoint_timeout = Some(checkpoint_timeout); - } - if let Some(compaction_target_size) = other.compaction_target_size { - self.compaction_target_size = Some(compaction_target_size); - } - if let Some(compaction_period) = other.compaction_period { - self.compaction_period = Some(compaction_period); - } - if let Some(compaction_threshold) = other.compaction_threshold { - self.compaction_threshold = Some(compaction_threshold); - } - if let Some(gc_horizon) = 
other.gc_horizon { - self.gc_horizon = Some(gc_horizon); - } - if let Some(gc_period) = other.gc_period { - self.gc_period = Some(gc_period); - } - if let Some(image_creation_threshold) = other.image_creation_threshold { - self.image_creation_threshold = Some(image_creation_threshold); - } - if let Some(pitr_interval) = other.pitr_interval { - self.pitr_interval = Some(pitr_interval); - } - if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout { - self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout); - } - if let Some(lagging_wal_timeout) = other.lagging_wal_timeout { - self.lagging_wal_timeout = Some(lagging_wal_timeout); - } - if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { - self.max_lsn_wal_lag = Some(max_lsn_wal_lag); - } - if let Some(trace_read_requests) = other.trace_read_requests { - self.trace_read_requests = Some(trace_read_requests); - } - } } impl Default for TenantConf { From f6e2e0042d66b87b09b8e03a1ab9026eaf545b12 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Thu, 16 Mar 2023 17:09:45 -0700 Subject: [PATCH 166/426] Fix + re-enable VM cgroup creation + running (#3820) Re-enable cgroup shenanigans in VMs, with some special care taken to make sure that our version of cgroup-tools supports cgroup v2 (debian bullseye does not, and probably won't because it requires a breaking change in libcgroup). This involves manually building libcgroup / cgroup-tools from source, then copying the output into the final build stage. We originally considered pulling the package from debian's testing repo (which is up-to-date), but decided against it. Refer to the PR for more details. 
Prior work, for reference: * 2153d2e0 - Run compute_ctl in a cgroup in VMs * 1360361f - Fix missing VM cgconfig.conf * 8dae8799 - Disable VM cgroup shenanigans --- Dockerfile.vm-compute-node | 61 ++++++++++++++++++++++++++++++++------ vm-cgconfig.conf | 12 ++++++++ 2 files changed, 64 insertions(+), 9 deletions(-) create mode 100644 vm-cgconfig.conf diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node index faea311708..957166ecd1 100644 --- a/Dockerfile.vm-compute-node +++ b/Dockerfile.vm-compute-node @@ -2,26 +2,69 @@ ARG SRC_IMAGE ARG VM_INFORMANT_VERSION=v0.1.14 +# on libcgroup update, make sure to check bootstrap.sh for changes +ARG LIBCGROUP_VERSION=v2.0.3 -# Pull VM informant and set up inittab +# Pull VM informant, to copy from later FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant +# Build cgroup-tools +# +# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically +# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant +# requires cgroup v2, so we'll build cgroup-tools ourselves. +FROM debian:bullseye-slim as libcgroup-builder +ARG LIBCGROUP_VERSION + +RUN set -exu \ + && apt update \ + && apt install --no-install-recommends -y \ + git \ + ca-certificates \ + automake \ + cmake \ + make \ + gcc \ + byacc \ + flex \ + libtool \ + libpam0g-dev \ + && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ + && INSTALL_DIR="/libcgroup-install" \ + && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ + && cd libcgroup \ + # extracted from bootstrap.sh, with modified flags: + && (test -d m4 || mkdir m4) \ + && autoreconf -fi \ + && rm -rf autom4te.cache \ + && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ + # actually build the thing... + && make install + +# Combine, starting from non-VM compute node image. 
+FROM $SRC_IMAGE as base + +# Temporarily set user back to root so we can run adduser, set inittab +USER root +RUN adduser vm-informant --disabled-password --no-create-home + RUN set -e \ && rm -f /etc/inittab \ && touch /etc/inittab RUN set -e \ + && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ && CONNSTR="dbname=neondb user=cloud_admin sslmode=disable" \ - && ARGS="--auto-restart --pgconnstr=\"$CONNSTR\"" \ + && ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \ && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab -# Combine, starting from non-VM compute node image. -FROM $SRC_IMAGE as base - -# Temporarily set user back to root so we can run adduser -USER root -RUN adduser vm-informant --disabled-password --no-create-home USER postgres -COPY --from=informant /etc/inittab /etc/inittab +ADD vm-cgconfig.conf /etc/cgconfig.conf COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant + +COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ +COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ +COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ + +ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"] diff --git a/vm-cgconfig.conf b/vm-cgconfig.conf new file mode 100644 index 0000000000..a2e201708e --- /dev/null +++ b/vm-cgconfig.conf @@ -0,0 +1,12 @@ +# Configuration for cgroups in VM compute nodes +group neon-postgres { + perm { + admin { + uid = vm-informant; + } + task { + gid = users; + } + } + memory {} +} From 93f3f4ab5fb589296ea44929f5c113f680777ee6 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Sun, 19 Mar 2023 10:44:42 +0200 Subject: [PATCH 167/426] Return NotFound in mgmt API requests when tenant is not present in the pageserver (#3818) ## Describe your changes Add Error enum for tenant state response to allow better error handling in mgmt api ## Issue ticket number and link 
#2238 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- pageserver/src/http/routes.rs | 116 ++++++++---------- pageserver/src/page_service.rs | 5 +- pageserver/src/tenant/mgr.rs | 64 ++++++---- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_tenant_detach.py | 12 +- test_runner/regress/test_timeline_delete.py | 4 +- .../test_walredo_not_left_behind_on_detach.py | 4 +- 7 files changed, 104 insertions(+), 105 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9faa994f16..2f03e251fd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -21,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; -use crate::tenant::mgr::TenantMapInsertError; +use crate::tenant::mgr::{TenantMapInsertError, TenantStateError}; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{PageReconstructError, Timeline}; @@ -89,32 +89,45 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res }) } -fn apierror_from_prerror(err: PageReconstructError) -> ApiError { - match err { - PageReconstructError::Other(err) => ApiError::InternalServerError(err), - PageReconstructError::NeedsDownload(_, _) => { - // This shouldn't happen, because we use a RequestContext that requests to - // download any missing layer files on-demand. 
- ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) - } - PageReconstructError::Cancelled => { - ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) - } - PageReconstructError::WalRedo(err) => { - ApiError::InternalServerError(anyhow::Error::new(err)) +impl From for ApiError { + fn from(pre: PageReconstructError) -> ApiError { + match pre { + PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), + PageReconstructError::NeedsDownload(_, _) => { + // This shouldn't happen, because we use a RequestContext that requests to + // download any missing layer files on-demand. + ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) + } + PageReconstructError::Cancelled => { + ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) + } + PageReconstructError::WalRedo(pre) => { + ApiError::InternalServerError(anyhow::Error::new(pre)) + } } } } -fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError { - match e { - TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { - ApiError::InternalServerError(anyhow::Error::new(e)) +impl From for ApiError { + fn from(tmie: TenantMapInsertError) -> ApiError { + match tmie { + TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { + ApiError::InternalServerError(anyhow::Error::new(tmie)) + } + TenantMapInsertError::TenantAlreadyExists(id, state) => { + ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } + TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), } - TenantMapInsertError::TenantAlreadyExists(id, state) => { - ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } +} + +impl From for ApiError { + fn from(tse: TenantStateError) -> ApiError { + match tse { + TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + _ => 
ApiError::InternalServerError(anyhow::Error::new(tse)), } - TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), } } @@ -218,9 +231,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::NotFound)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -268,7 +277,7 @@ async fn timeline_list_handler(request: Request) -> Result, response_data.push(timeline_info); } - Ok(response_data) + Ok::, ApiError>(response_data) } .instrument(info_span!("timeline_list", tenant = %tenant_id)) .await?; @@ -287,9 +296,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result format!("{lsn}"), @@ -353,8 +357,7 @@ async fn tenant_attach_handler(request: Request) -> Result, if let Some(remote_storage) = &state.remote_storage { mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) - .await - .map_err(apierror_from_tenant_map_insert_error)?; + .await?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" @@ -373,11 +376,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let conf = state.conf; mgr::detach_tenant(conf, tenant_id) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) - .await - // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. - // Replace this with better handling once the error type permits it. 
- .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -407,8 +403,7 @@ async fn tenant_load_handler(request: Request) -> Result, A let state = get_state(&request); mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) .instrument(info_span!("load", tenant = %tenant_id)) - .await - .map_err(apierror_from_tenant_map_insert_error)?; + .await?; json_response(StatusCode::ACCEPTED, ()) } @@ -421,10 +416,7 @@ async fn tenant_ignore_handler(request: Request) -> Result, let conf = state.conf; mgr::ignore_tenant(conf, tenant_id) .instrument(info_span!("ignore_tenant", tenant = %tenant_id)) - .await - // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors. - // Replace this with better handling once the error type permits it. - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -498,9 +490,7 @@ async fn tenant_size_handler(request: Request) -> Result, A let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::InternalServerError)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; // this can be long operation let inputs = tenant @@ -763,8 +753,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result Result, ApiError> { - let tenant = mgr::get_tenant(tenant_id, true) - .await - .map_err(ApiError::NotFound)?; + let tenant = mgr::get_tenant(tenant_id, true).await?; tenant .get_timeline(timeline_id, true) .map_err(ApiError::NotFound) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index aad6099952..b63ee31d5e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1107,7 +1107,10 @@ async fn get_active_tenant_with_timeout( tenant_id: TenantId, _ctx: &RequestContext, /* require get a context to support cancellation in the 
future */ ) -> Result, GetActiveTenantError> { - let tenant = mgr::get_tenant(tenant_id, false).await?; + let tenant = match mgr::get_tenant(tenant_id, false).await { + Ok(tenant) => tenant, + Err(e) => return Err(GetActiveTenantError::Other(e.into())), + }; let wait_time = Duration::from_secs(30); match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { Ok(Ok(())) => Ok(tenant), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a44cb02b4d..a4212ea8a6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -289,7 +289,7 @@ pub async fn set_new_tenant_config( conf: &'static PageServerConf, new_tenant_conf: TenantConfOpt, tenant_id: TenantId, -) -> anyhow::Result<()> { +) -> Result<(), TenantStateError> { info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; @@ -306,16 +306,20 @@ pub async fn set_new_tenant_config( /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. -pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { +pub async fn get_tenant( + tenant_id: TenantId, + active_only: bool, +) -> Result, TenantStateError> { let m = TENANTS.read().await; let tenant = m .get(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; + .ok_or(TenantStateError::NotFound(tenant_id))?; if active_only && !tenant.is_active() { - anyhow::bail!( + tracing::warn!( "Tenant {tenant_id} is not active. 
Current state: {:?}", tenant.current_state() - ) + ); + Err(TenantStateError::NotActive(tenant_id)) } else { Ok(Arc::clone(tenant)) } @@ -325,21 +329,28 @@ pub async fn delete_timeline( tenant_id: TenantId, timeline_id: TimelineId, ctx: &RequestContext, -) -> anyhow::Result<()> { - match get_tenant(tenant_id, true).await { - Ok(tenant) => { - tenant.delete_timeline(timeline_id, ctx).await?; - } - Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), - } - +) -> Result<(), TenantStateError> { + let tenant = get_tenant(tenant_id, true).await?; + tenant.delete_timeline(timeline_id, ctx).await?; Ok(()) } +#[derive(Debug, thiserror::Error)] +pub enum TenantStateError { + #[error("Tenant {0} not found")] + NotFound(TenantId), + #[error("Tenant {0} is stopping")] + IsStopping(TenantId), + #[error("Tenant {0} is not active")] + NotActive(TenantId), + #[error(transparent)] + Other(#[from] anyhow::Error), +} + pub async fn detach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, -) -> anyhow::Result<()> { +) -> Result<(), TenantStateError> { remove_tenant_from_memory(tenant_id, async { let local_tenant_directory = conf.tenant_path(&tenant_id); fs::remove_dir_all(&local_tenant_directory) @@ -379,7 +390,7 @@ pub async fn load_tenant( pub async fn ignore_tenant( conf: &'static PageServerConf, tenant_id: TenantId, -) -> anyhow::Result<()> { +) -> Result<(), TenantStateError> { remove_tenant_from_memory(tenant_id, async { let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id); fs::File::create(&ignore_mark_file) @@ -489,7 +500,7 @@ where async fn remove_tenant_from_memory( tenant_id: TenantId, tenant_cleanup: F, -) -> anyhow::Result +) -> Result where F: std::future::Future>, { @@ -505,11 +516,9 @@ where | TenantState::Loading | TenantState::Broken | TenantState::Active => tenant.set_stopping(), - TenantState::Stopping => { - anyhow::bail!("Tenant {tenant_id} is stopping already") - } + TenantState::Stopping => return 
Err(TenantStateError::IsStopping(tenant_id)), }, - None => anyhow::bail!("Tenant not found for id {tenant_id}"), + None => return Err(TenantStateError::NotFound(tenant_id)), } } @@ -532,10 +541,15 @@ where Err(e) => { let tenants_accessor = TENANTS.read().await; match tenants_accessor.get(&tenant_id) { - Some(tenant) => tenant.set_broken(&e.to_string()), - None => warn!("Tenant {tenant_id} got removed from memory"), + Some(tenant) => { + tenant.set_broken(&e.to_string()); + } + None => { + warn!("Tenant {tenant_id} got removed from memory"); + return Err(TenantStateError::NotFound(tenant_id)); + } } - Err(e) + Err(TenantStateError::Other(e)) } } } @@ -555,7 +569,7 @@ pub async fn immediate_gc( let tenant = guard .get(&tenant_id) .map(Arc::clone) - .with_context(|| format!("Tenant {tenant_id} not found")) + .with_context(|| format!("tenant {tenant_id}")) .map_err(ApiError::NotFound)?; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); @@ -605,7 +619,7 @@ pub async fn immediate_compact( let tenant = guard .get(&tenant_id) .map(Arc::clone) - .with_context(|| format!("Tenant {tenant_id} not found")) + .with_context(|| format!("tenant {tenant_id}")) .map_err(ApiError::NotFound)?; let timeline = tenant diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8a64be51f1..8b228ad804 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2086,8 +2086,8 @@ class NeonPageserver(PgProtocol): # https://github.com/neondatabase/neon/issues/2442 ".*could not remove ephemeral file.*No such file or directory.*", # FIXME: These need investigation - ".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*", - ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*", + ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*", + ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*", 
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", ".*Removing intermediate uninit mark file.*", diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 27ec38e1be..e061ab92a4 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -225,7 +225,7 @@ def test_tenant_reattach_while_busy( # Attempts to connect from compute to pageserver while the tenant is # temporarily detached produces these errors in the pageserver log. - env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*") + env.pageserver.allowed_errors.append(".*Tenant .* not found.*") env.pageserver.allowed_errors.append( ".*Tenant .* will not become active\\. Current state: Stopping.*" ) @@ -257,18 +257,18 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found") + env.pageserver.allowed_errors.append(".*NotFound: Tenant .*") # first check for non existing tenant tenant_id = TenantId.generate() with pytest.raises( expected_exception=PageserverApiException, - match=f"Tenant not found for id {tenant_id}", + match=f"NotFound: tenant {tenant_id}", ): pageserver_http.tenant_detach(tenant_id) # the error will be printed to the log too - env.pageserver.allowed_errors.append(".*Tenant not found for id.*") + env.pageserver.allowed_errors.append(".*NotFound: tenant *") # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() @@ -294,7 +294,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # the error will be printed to the log too env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") - # Timelines get stopped during detach, ignore the gc 
calls that error, whitnessing that + # Timelines get stopped during detach, ignore the gc calls that error, witnessing that env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") # Detach while running manual GC. @@ -320,7 +320,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found" + expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}" ): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 2226cab8ff..b9c4f5b83f 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -10,7 +10,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*Timeline .* was not found.*") env.pageserver.allowed_errors.append(".*timeline not found.*") env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*") - env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*") + env.pageserver.allowed_errors.append(".*NotFound: tenant .*") ps_http = env.pageserver.http_client() @@ -24,7 +24,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): invalid_tenant_id = TenantId.generate() with pytest.raises( PageserverApiException, - match=f"Tenant {invalid_tenant_id} not found in the local state", + match=f"NotFound: tenant {invalid_tenant_id}", ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 24045e2eb7..395d54b8c3 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ 
b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -23,7 +23,7 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # We intentionally test for a non-existent tenant. - env.pageserver.allowed_errors.append(".*Tenant not found.*") + env.pageserver.allowed_errors.append(".*NotFound: tenant.*") pageserver_http = env.pageserver.http_client() pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text()) @@ -34,7 +34,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() with pytest.raises( expected_exception=PageserverApiException, - match=f"Tenant not found for id {tenant_id}", + match=f"NotFound: tenant {tenant_id}", ): pageserver_http.tenant_detach(tenant_id) From 3c15874c48863cd1f0f4dc8c4bb6d0899874df36 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 17 Mar 2023 18:50:42 +0100 Subject: [PATCH 168/426] allow specifying eviction_policy in TenantCreateRequest This was an oversight from 175a577ad42476a49978d277a7428e8a078dd6ae. Nothing uses this AFAIK, but, let's fix it anyways. 
Noticed while working on https://github.com/neondatabase/neon/issues/3728 --- control_plane/src/pageserver.rs | 5 +++++ libs/pageserver_api/src/models.rs | 5 +++++ pageserver/src/http/routes.rs | 8 ++++++++ 3 files changed, 18 insertions(+) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 07ead45d5b..db8cb61395 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -362,6 +362,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'trace_read_requests' as bool")?, + eviction_policy: settings + .get("eviction_policy") + .map(|x| serde_json::from_str(x)) + .transpose() + .context("Failed to parse 'eviction_policy' json")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3ac7e31ec2..7a43100ba5 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -115,6 +115,11 @@ pub struct TenantCreateRequest { pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub trace_read_requests: Option, + // We defer the parsing of the eviction_policy field to the request handler. + // Otherwise we'd have to move the types for eviction policy into this package. + // We might do that once the eviction feature has stabilizied. + // For now, this field is not even documented in the openapi_spec.yml. + pub eviction_policy: Option, } #[serde_as] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2f03e251fd..39f2776952 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -738,6 +738,14 @@ async fn tenant_create_handler(mut request: Request) -> Result Date: Mon, 20 Mar 2023 11:57:27 +0200 Subject: [PATCH 169/426] feat: store initial timeline in env fixture (#3839) minor change, but will allow more use in future for the default tenants. 
Co-authored-by: Alexander Bayandin --- test_runner/fixtures/neon_fixtures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8b228ad804..cd4f0678b5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -643,6 +643,7 @@ class NeonEnvBuilder: f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" ) initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant) + env.initial_timeline = initial_timeline log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully") return env @@ -904,6 +905,7 @@ class NeonEnv: # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. self.initial_tenant = config.initial_tenant + self.initial_timeline: Optional[TimelineId] = None # Create a config file corresponding to the options toml = textwrap.dedent( From 1ddb9249aaf25436cf49e466392a90537268ed3b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Mar 2023 15:49:16 +0200 Subject: [PATCH 170/426] Reduce the # of histogram buckets in metrics. (#3850) Shrinks the total number of metrics collected for each timeline by about 50%. See https://github.com/neondatabase/neon/issues/2848. This doesn't fully solve the problem, we still collect a lot of metrics even with this, but this gives us a lot of headroom. 
--- pageserver/src/metrics.rs | 60 ++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4826a0b7ae..6a8aecfd25 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -9,22 +9,18 @@ use once_cell::sync::Lazy; use pageserver_api::models::state; use utils::id::{TenantId, TimelineId}; -/// Prometheus histogram buckets (in seconds) that capture the majority of -/// latencies in the microsecond range but also extend far enough up to distinguish -/// "bad" from "really bad". -fn get_buckets_for_critical_operations() -> Vec { - let buckets_per_digit = 5; - let min_exponent = -6; - let max_exponent = 2; - - let mut buckets = vec![]; - // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp - // because it's more numerically stable and doesn't result in numbers like 9.999999 - for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) { - buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64)) - } - buckets -} +/// Prometheus histogram buckets (in seconds) for operations in the critical +/// path. In other words, operations that directly affect that latency of user +/// queries. +/// +/// The buckets capture the majority of latencies in the microsecond and +/// millisecond range but also extend far enough up to distinguish "bad" from +/// "really bad". +const CRITICAL_OP_BUCKETS: &[f64] = &[ + 0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us + 0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms + 1.0, 10.0, 100.0, // 1 s, 10 s, 100 s +]; // Metrics collected on operations on the storage repository. 
const STORAGE_TIME_OPERATIONS: &[&str] = &[ @@ -55,12 +51,15 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +// Buckets for background operations like compaction, GC, size calculation +const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; + pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_storage_operations_seconds_global", "Time spent on storage operations", &["operation"], - get_buckets_for_critical_operations(), + STORAGE_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -71,7 +70,7 @@ static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value", &["tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), + CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -90,7 +89,7 @@ static WAIT_LSN_TIME: Lazy = Lazy::new(|| { "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", &["tenant_id", "timeline_id"], - get_buckets_for_critical_operations(), + CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -196,14 +195,13 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { }); // Metrics collected on disk IO operations +// +// Roughly logarithmic scale. 
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ - 0.000001, // 1 usec - 0.00001, // 10 usec - 0.0001, // 100 usec - 0.001, // 1 msec - 0.01, // 10 msec - 0.1, // 100 msec - 1.0, // 1 sec + 0.000030, // 30 usec + 0.001000, // 1000 usec + 0.030, // 30 ms + 1.000, // 1000 ms ]; const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ @@ -238,20 +236,12 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[ "get_db_size", ]; -const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[ - 0.00001, // 1/100000 s - 0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s - 0.001, 0.0025, 0.005, 0.0075, // 1/1000 s - 0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s - 0.1, // 1/10 s -]; - pub static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds", "Time spent on smgr query handling", &["smgr_query_type", "tenant_id", "timeline_id"], - SMGR_QUERY_TIME_BUCKETS.into() + CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); From 1da963b2f91309a16aa3b98d6b0ce376d47eb3f8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Mar 2023 15:50:23 +0200 Subject: [PATCH 171/426] Remove some unused code in control plane. 
--- control_plane/src/pageserver.rs | 7 ------- control_plane/src/safekeeper.rs | 7 ------- 2 files changed, 14 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index db8cb61395..2a8b889d84 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -280,13 +280,6 @@ impl PageServerNode { background_process::stop_process(immediate, "pageserver", &self.pid_file()) } - pub fn page_server_psql(&self, sql: &str) -> Vec { - let mut client = self.pg_connection_config.connect_no_tls().unwrap(); - - println!("Pageserver query: '{sql}'"); - client.simple_query(sql).unwrap() - } - pub fn page_server_psql_client(&self) -> result::Result { self.pg_connection_config.connect_no_tls() } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 228968be80..84d6320573 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,7 +1,6 @@ use std::io::Write; use std::path::PathBuf; use std::process::Child; -use std::sync::Arc; use std::{io, result}; use anyhow::Context; @@ -11,7 +10,6 @@ use reqwest::{IntoUrl, Method}; use thiserror::Error; use utils::{http::error::HttpErrorBody, id::NodeId}; -use crate::pageserver::PageServerNode; use crate::{ background_process, local_env::{LocalEnv, SafekeeperConf}, @@ -65,14 +63,10 @@ pub struct SafekeeperNode { pub env: LocalEnv, pub http_client: Client, pub http_base_url: String, - - pub pageserver: Arc, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { - let pageserver = Arc::new(PageServerNode::from_env(env)); - SafekeeperNode { id: conf.id, conf: conf.clone(), @@ -80,7 +74,6 @@ impl SafekeeperNode { env: env.clone(), http_client: Client::new(), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), - pageserver, } } From 77107607f3d69c21d4a17b20a533385ff653b639 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Mar 2023 15:50:27 +0200 Subject: 
[PATCH 172/426] Allow JWT key generation to fail if authentication is not enabled. This allows you to run without the 'openssl' binary as long as you don't enable authentication. This becomes more important with the next commit, which switches the JWT algorithm to EdDSA. LibreSSL does not support EdDSA, and LibreSSL comes with macOS, so the next commit makes it much more likely for the key generation to fail for macOS users. To allow running without a keypair, don't generate the authentication token in the 'neon_local init' step. Instead, generate a new token on every request that needs one, using the private key. --- control_plane/src/local_env.rs | 106 +++++++++++++--------- control_plane/src/pageserver.rs | 43 +++++---- test_runner/regress/test_compatibility.py | 2 - 3 files changed, 86 insertions(+), 65 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 630f8bb664..d4de72b6bf 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -18,7 +18,7 @@ use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use utils::{ - auth::{encode_from_key_file, Claims, Scope}, + auth::{encode_from_key_file, Claims}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, }; @@ -118,9 +118,6 @@ pub struct PageServerConf { // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, - - // jwt auth token used for communication with pageserver - pub auth_token: String, } impl Default for PageServerConf { @@ -131,7 +128,6 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, - auth_token: String::new(), } } } @@ -404,48 +400,33 @@ impl LocalEnv { fs::create_dir(base_path)?; - // generate keys for jwt - // openssl genrsa -out private_key.pem 2048 - let private_key_path; + // Generate keypair for JWT. 
+ // + // The keypair is only needed if authentication is enabled in any of the + // components. For convenience, we generate the keypair even if authentication + // is not enabled, so that you can easily enable it after the initialization + // step. However, if the key generation fails, we treat it as non-fatal if + // authentication was not enabled. if self.private_key_path == PathBuf::new() { - private_key_path = base_path.join("auth_private_key.pem"); - let keygen_output = Command::new("openssl") - .arg("genrsa") - .args(["-out", private_key_path.to_str().unwrap()]) - .arg("2048") - .stdout(Stdio::null()) - .output() - .context("failed to generate auth private key")?; - if !keygen_output.status.success() { - bail!( - "openssl failed: '{}'", - String::from_utf8_lossy(&keygen_output.stderr) - ); - } - self.private_key_path = PathBuf::from("auth_private_key.pem"); - - let public_key_path = base_path.join("auth_public_key.pem"); - // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem - let keygen_output = Command::new("openssl") - .arg("rsa") - .args(["-in", private_key_path.to_str().unwrap()]) - .arg("-pubout") - .args(["-outform", "PEM"]) - .args(["-out", public_key_path.to_str().unwrap()]) - .stdout(Stdio::null()) - .output() - .context("failed to generate auth private key")?; - if !keygen_output.status.success() { - bail!( - "openssl failed: '{}'", - String::from_utf8_lossy(&keygen_output.stderr) - ); + match generate_auth_keys( + base_path.join("auth_private_key.pem").as_path(), + base_path.join("auth_public_key.pem").as_path(), + ) { + Ok(()) => { + self.private_key_path = PathBuf::from("auth_private_key.pem"); + } + Err(e) => { + if !self.auth_keys_needed() { + eprintln!("Could not generate keypair for JWT authentication: {e}"); + eprintln!("Continuing anyway because authentication was not enabled"); + self.private_key_path = PathBuf::from("auth_private_key.pem"); + } else { + return Err(e); + } + } } } - self.pageserver.auth_token = - 
self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; - fs::create_dir_all(self.pg_data_dirs_path())?; for safekeeper in &self.safekeepers { @@ -454,6 +435,12 @@ impl LocalEnv { self.persist_config(base_path) } + + fn auth_keys_needed(&self) -> bool { + self.pageserver.pg_auth_type == AuthType::NeonJWT + || self.pageserver.http_auth_type == AuthType::NeonJWT + || self.safekeepers.iter().any(|sk| sk.auth_enabled) + } } fn base_path() -> PathBuf { @@ -463,6 +450,39 @@ fn base_path() -> PathBuf { } } +/// Generate a public/private key pair for JWT authentication +fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow::Result<()> { + let keygen_output = Command::new("openssl") + .arg("genrsa") + .args(["-out", private_key_path.to_str().unwrap()]) + .arg("2048") + .stdout(Stdio::null()) + .output() + .context("failed to generate auth private key")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem + let keygen_output = Command::new("openssl") + .arg("rsa") + .args(["-in", private_key_path.to_str().unwrap()]) + .arg("-pubout") + .args(["-outform", "PEM"]) + .args(["-out", public_key_path.to_str().unwrap()]) + .output() + .context("failed to extract public key from private key")?; + if !keygen_output.status.success() { + bail!( + "openssl failed: '{}'", + String::from_utf8_lossy(&keygen_output.stderr) + ); + } + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2a8b889d84..3c66400a05 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -82,15 +82,8 @@ impl PageServerNode { let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr) .expect("Unable to parse listen_pg_addr"); let port = port.unwrap_or(5432); - let password = if 
env.pageserver.pg_auth_type == AuthType::NeonJWT { - Some(env.pageserver.auth_token.clone()) - } else { - None - }; - Self { - pg_connection_config: PgConnectionConfig::new_host_port(host, port) - .set_password(password), + pg_connection_config: PgConnectionConfig::new_host_port(host, port), env: env.clone(), http_client: Client::new(), http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr), @@ -280,20 +273,30 @@ impl PageServerNode { background_process::stop_process(immediate, "pageserver", &self.pid_file()) } - pub fn page_server_psql_client(&self) -> result::Result { - self.pg_connection_config.connect_no_tls() + pub fn page_server_psql_client(&self) -> anyhow::Result { + let mut config = self.pg_connection_config.clone(); + if self.env.pageserver.pg_auth_type == AuthType::NeonJWT { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; + config = config.set_password(Some(token)); + } + Ok(config.connect_no_tls()?) } - fn http_request(&self, method: Method, url: U) -> RequestBuilder { + fn http_request(&self, method: Method, url: U) -> anyhow::Result { let mut builder = self.http_client.request(method, url); if self.env.pageserver.http_auth_type == AuthType::NeonJWT { - builder = builder.bearer_auth(&self.env.pageserver.auth_token) + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; + builder = builder.bearer_auth(token) } - builder + Ok(builder) } pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/status", self.http_base_url)) + self.http_request(Method::GET, format!("{}/status", self.http_base_url))? .send()? .error_from_body()?; Ok(()) @@ -301,7 +304,7 @@ impl PageServerNode { pub fn tenant_list(&self) -> Result> { Ok(self - .http_request(Method::GET, format!("{}/tenant", self.http_base_url)) + .http_request(Method::GET, format!("{}/tenant", self.http_base_url))? .send()? .error_from_body()? .json()?) 
@@ -364,7 +367,7 @@ impl PageServerNode { if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") } - self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) + self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))? .json(&request) .send()? .error_from_body()? @@ -381,7 +384,7 @@ impl PageServerNode { } pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> { - self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url)) + self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))? .json(&TenantConfigRequest { tenant_id, checkpoint_distance: settings @@ -444,7 +447,7 @@ impl PageServerNode { .http_request( Method::GET, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - ) + )? .send()? .error_from_body()? .json()?; @@ -463,7 +466,7 @@ impl PageServerNode { self.http_request( Method::POST, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - ) + )? 
.json(&TimelineCreateRequest { new_timeline_id, ancestor_start_lsn, @@ -500,7 +503,7 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let mut client = self.pg_connection_config.connect_no_tls().unwrap(); + let mut client = self.page_server_psql_client()?; // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 66625dd6f8..e9dadb5348 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -365,11 +365,9 @@ def check_neon_works( tenant_id = snapshot_config["default_tenant_id"] timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1] - auth_token = snapshot_config["pageserver"]["auth_token"] pageserver_http = PageserverHttpClient( port=pageserver_port, is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled - auth_token=auth_token, ) shutil.rmtree(repo_dir / "local_fs_remote_storage") From fea4b5f5512ac80057fc6aa2b82d027fdd3fb85d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Mar 2023 15:50:30 +0200 Subject: [PATCH 173/426] Switch to EdDSA algorithm for the storage JWT authentication tokens. The control plane currently only supports EdDSA. We need to either teach the storage to use EdDSA, or the control plane to use RSA. EdDSA is more modern, so let's use that. We could support both, but it would require a little more code and tests, and we don't really need the flexibility since we control both sides. 
--- README.md | 5 +- control_plane/src/local_env.rs | 14 ++-- docs/authentication.md | 15 ++-- libs/utils/src/auth.rs | 99 +++++++-------------------- safekeeper/src/bin/safekeeper.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 2 +- 6 files changed, 49 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index 819693f1f3..43f3e3a02b 100644 --- a/README.md +++ b/README.md @@ -46,11 +46,14 @@ postgresql-libs cmake postgresql protobuf curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh ``` -#### Installing dependencies on OSX (12.3.1) +#### Installing dependencies on macOS (12.3.1) 1. Install XCode and dependencies ``` xcode-select --install brew install protobuf openssl flex bison + +# add openssl to PATH, required for ed25519 keys generation in neon_local +echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index d4de72b6bf..8cc6329ce6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -452,10 +452,13 @@ fn base_path() -> PathBuf { /// Generate a public/private key pair for JWT authentication fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow::Result<()> { + // Generate the key pair + // + // openssl genpkey -algorithm ed25519 -out auth_private_key.pem let keygen_output = Command::new("openssl") - .arg("genrsa") + .arg("genpkey") + .args(["-algorithm", "ed25519"]) .args(["-out", private_key_path.to_str().unwrap()]) - .arg("2048") .stdout(Stdio::null()) .output() .context("failed to generate auth private key")?; @@ -465,12 +468,13 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow String::from_utf8_lossy(&keygen_output.stderr) ); } - // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem + // Extract the public key from the private key file + // + // openssl pkey -in 
auth_private_key.pem -pubout -out auth_public_key.pem let keygen_output = Command::new("openssl") - .arg("rsa") + .arg("pkey") .args(["-in", private_key_path.to_str().unwrap()]) .arg("-pubout") - .args(["-outform", "PEM"]) .args(["-out", public_key_path.to_str().unwrap()]) .output() .context("failed to extract public key from private key")?; diff --git a/docs/authentication.md b/docs/authentication.md index e6b5fa5707..dc402d1bca 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -29,15 +29,22 @@ These components should not have access to the private key and may only get toke The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`. There is currently no way to rotate the key without bringing down all components. +### Best practices + +See [RFC 8725: JSON Web Token Best Current Practices](https://www.rfc-editor.org/rfc/rfc8725) + + ### Token format -The JWT tokens in Neon use RSA as the algorithm. Example: +The JWT tokens in Neon use "EdDSA" as the algorithm (defined in [RFC8037](https://www.rfc-editor.org/rfc/rfc8037)). + +Example: Header: ``` { - "alg": "RS512", # RS256, RS384, or RS512 + "alg": "EdDSA", "typ": "JWT" } ``` @@ -68,8 +75,8 @@ Currently also used for connection from any pageserver to any safekeeper. CLI generates a key pair during call to `neon_local init` with the following commands: ```bash -openssl genrsa -out auth_private_key.pem 2048 -openssl rsa -in auth_private_key.pem -pubout -outform PEM -out auth_public_key.pem +openssl genpkey -algorithm ed25519 -out auth_private_key.pem +openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem ``` Configuration files for all components point to `public_key.pem` for JWT validation. 
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 027950cb39..0fb45e01c6 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,7 +1,4 @@ // For details about authentication see docs/authentication.md -// -// TODO: use ed25519 keys -// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162 use serde; use std::fs; @@ -9,26 +6,15 @@ use std::path::Path; use anyhow::Result; use jsonwebtoken::{ - decode, encode, Algorithm, Algorithm::*, DecodingKey, EncodingKey, Header, TokenData, - Validation, + decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, }; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use crate::id::TenantId; -/// Algorithms accepted during validation. -/// -/// Accept all RSA-based algorithms. We pass this list to jsonwebtoken::decode, -/// which checks that the algorithm in the token is one of these. -/// -/// XXX: It also fails the validation if there are any algorithms in this list that belong -/// to different family than the token's algorithm. In other words, we can *not* list any -/// non-RSA algorithms here, or the validation always fails with InvalidAlgorithm error. -const ACCEPTED_ALGORITHMS: &[Algorithm] = &[RS256, RS384, RS512]; - -/// Algorithm to use when generating a new token in [`encode_from_key_file`] -const ENCODE_ALGORITHM: Algorithm = Algorithm::RS256; +/// Algorithm to use. We require EdDSA. +const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "lowercase")] @@ -69,7 +55,7 @@ pub struct JwtAuth { impl JwtAuth { pub fn new(decoding_key: DecodingKey) -> Self { let mut validation = Validation::default(); - validation.algorithms = ACCEPTED_ALGORITHMS.into(); + validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM]; // The default 'required_spec_claims' is 'exp'. But we don't want to require // expiration. 
validation.required_spec_claims = [].into(); @@ -81,7 +67,7 @@ impl JwtAuth { pub fn from_key_path(key_path: &Path) -> Result { let public_key = fs::read(key_path)?; - Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?)) + Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?)) } pub fn decode(&self, token: &str) -> Result> { @@ -99,8 +85,8 @@ impl std::fmt::Debug for JwtAuth { // this function is used only for testing purposes in CLI e g generate tokens during init pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result { - let key = EncodingKey::from_rsa_pem(key_data)?; - Ok(encode(&Header::new(ENCODE_ALGORITHM), claims, &key)?) + let key = EncodingKey::from_ed_pem(key_data)?; + Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?) } #[cfg(test)] @@ -108,49 +94,19 @@ mod tests { use super::*; use std::str::FromStr; - // generated with: + // Generated with: // - // openssl genpkey -algorithm rsa -out storage-auth-priv.pem - // openssl pkey -in storage-auth-priv.pem -pubout -out storage-auth-pub.pem - const TEST_PUB_KEY_RSA: &[u8] = br#" + // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem + // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem + const TEST_PUB_KEY_ED25519: &[u8] = br#" -----BEGIN PUBLIC KEY----- -MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy6OZ+/kQXcueVJA/KTzO -v4ljxylc/Kcb0sXWuXg1GB8k3nDA1gK66LFYToH0aTnqrnqG32Vu6wrhwuvqsZA7 -jQvP0ZePAbWhpEqho7EpNunDPcxZ/XDy5TQlB1P58F9I3lkJXDC+DsHYLuuzwhAv -vo2MtWRdYlVHblCVLyZtANHhUMp2HUhgjHnJh5UrLIKOl4doCBxkM3rK0wjKsNCt -M92PCR6S9rvYzldfeAYFNppBkEQrXt2CgUqZ4KaS4LXtjTRUJxljijA4HWffhxsr -euRu3ufq8kVqie7fum0rdZZSkONmce0V0LesQ4aE2jB+2Sn48h6jb4dLXGWdq8TV -wQIDAQAB +MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w= -----END PUBLIC KEY----- "#; - const TEST_PRIV_KEY_RSA: &[u8] = br#" + + const TEST_PRIV_KEY_ED25519: &[u8] = br#" -----BEGIN PRIVATE KEY----- -MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDLo5n7+RBdy55U 
-kD8pPM6/iWPHKVz8pxvSxda5eDUYHyTecMDWArrosVhOgfRpOequeobfZW7rCuHC -6+qxkDuNC8/Rl48BtaGkSqGjsSk26cM9zFn9cPLlNCUHU/nwX0jeWQlcML4Owdgu -67PCEC++jYy1ZF1iVUduUJUvJm0A0eFQynYdSGCMecmHlSssgo6Xh2gIHGQzesrT -CMqw0K0z3Y8JHpL2u9jOV194BgU2mkGQRCte3YKBSpngppLgte2NNFQnGWOKMDgd -Z9+HGyt65G7e5+ryRWqJ7t+6bSt1llKQ42Zx7RXQt6xDhoTaMH7ZKfjyHqNvh0tc -ZZ2rxNXBAgMBAAECggEAVz3u4Wlx3o02dsoZlSQs+xf0PEX3RXKeU+1YMbtTG9Nz -6yxpIQaoZrpbt76rJE2gwkFR+PEu1NmjoOuLb6j4KlQuI4AHz1auOoGSwFtM6e66 -K4aZ4x95oEJ3vqz2fkmEIWYJwYpMUmwvnuJx76kZm0xvROMLsu4QHS2+zCVtO5Tr -hvS05IMVuZ2TdQBZw0+JaFdwXbgDjQnQGY5n9MoTWSx1a4s/FF4Eby65BbDutcpn -Vt3jQAOmO1X2kbPeWSGuPJRzyUs7Kg8qfeglBIR3ppGP3vPYAdWX+ho00bmsVkSp -Q8vjul6C3WiM+kjwDxotHSDgbl/xldAl7OqPh0bfAQKBgQDnycXuq14Vg8nZvyn9 -rTnvucO8RBz5P6G+FZ+44cAS2x79+85onARmMnm+9MKYLSMo8fOvsK034NDI68XM -04QQ/vlfouvFklMTGJIurgEImTZbGCmlMYCvFyIxaEWixon8OpeI4rFe4Hmbiijh -PxhxWg221AwvBS2sco8J/ylEkQKBgQDg6Rh2QYb/j0Wou1rJPbuy3NhHofd5Rq35 -4YV3f2lfVYcPrgRhwe3T9SVII7Dx8LfwzsX5TAlf48ESlI3Dzv40uOCDM+xdtBRI -r96SfSm+jup6gsXU3AsdNkrRK3HoOG9Z/TkrUp213QAIlVnvIx65l4ckFMlpnPJ0 -lo1LDXZWMQKBgFArzjZ7N5OhfdO+9zszC3MLgdRAivT7OWqR+CjujIz5FYMr8Xzl -WfAvTUTrS9Nu6VZkObFvHrrRG+YjBsuN7YQjbQXTSFGSBwH34bgbn2fl9pMTjHQC -50uoaL9GHa/rlBaV/YvvPQJgCi/uXa1rMX0jdNLkDULGO8IF7cu7Yf7BAoGBAIUU -J29BkpmAst0GDs/ogTlyR18LTR0rXyHt+UUd1MGeH859TwZw80JpWWf4BmkB4DTS -hH3gKePdJY7S65ci0XNsuRupC4DeXuorde0DtkGU2tUmr9wlX0Ynq9lcdYfMbMa4 -eK1TsxG69JwfkxlWlIWITWRiEFM3lJa7xlrUWmLhAoGAFpKWF/hn4zYg3seU9gai -EYHKSbhxA4mRb+F0/9IlCBPMCqFrL5yftUsYIh2XFKn8+QhO97Nmk8wJSK6TzQ5t -ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp -8ismApXVGHpOCstzikV9W7k= +MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH -----END PRIVATE KEY----- "#; @@ -161,8 +117,7 @@ ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp scope: Scope::Tenant, }; - // Here are tokens containing the following payload, signed using TEST_PRIV_KEY_RSA - // using RS512, RS384 and RS256 algorithms: + // A test token containing the following payload, 
signed using TEST_PRIV_KEY_ED25519: // // ``` // { @@ -174,21 +129,13 @@ ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp // } // ``` // - // These were encoded with the online debugger at https://jwt.io - // - let encoded_rs512 = "eyJhbGciOiJSUzUxMiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.QmqfteDQmDGoxQ5EFkasbt35Lx0W0Nh63muQnYZvFq93DSh4ZbOG9Mc4yaiXZoiS5HgeKtFKv3mbWkDqjz3En06aY17hWwguBtAsGASX48lYeCPADYGlGAuaWnOnVRwe3iiOC7tvPFvwX_45S84X73sNUXyUiXv6nLdcDqVXudtNrGST_DnZDnjuUJX11w7sebtKqQQ8l9-iGHiXOl5yevpMCoB1OcTWcT6DfDtffoNuMHDC3fyhmEGG5oKAt1qBybqAIiyC9-UBAowRZXhdfxrzUl-I9jzKWvk85c5ulhVRwbPeP6TTTlPKwFzBNHg1i2U-1GONew5osQ3aoptwsA"; + let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; - let encoded_rs384 = "eyJhbGciOiJSUzM4NCIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.qqk4nkxKzOJP38c_g57_w_SfdQVmCsDT_bsLmdFj_N6LIB22gr6U6_P_5mvk3pIAsp0VCTDwPrCU908TxqjibEkwvQoJwbogHamSGHpD7eJBxGblSnA-Nr3MlEMxpFtec8QokSm6C5mH7DoBYjB2xzeOlxAmpR2GAzInKiMkU4kZ_OcqqrmVcMXY_6VnbxZWMekuw56zE1-PP_qNF1HvYOH-P08ONP8qdo5UPtBG7QBEFlCqZXJZCFihQaI4Vzil9rDuZGCm3I7xQJ8-yh1PX3BTbGo8EzqLdRyBeTpr08UTuRbp_MJDWevHpP3afvJetAItqZXIoZQrbJjcByHqKw"; + // Check it can be validated with the public key + let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?); + let claims_from_token = auth.decode(encoded_eddsa)?.claims; + assert_eq!(claims_from_token, expected_claims); - let encoded_rs256 = 
"eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.dF2N9KXG8ftFKHYbd5jQtXMQqv0Ej8FISGp1b_dmqOCotXj5S1y2AWjwyB_EXHM77JXfbEoJPAPrFFBNfd8cWtkCSTvpxWoHaecGzegDFGv5ZSc5AECFV1Daahc3PI3jii9wEiGkFOiwiBNfZ5INomOAsV--XXxlqIwKbTcgSYI7lrOTfecXAbAHiMKQlQYiIBSGnytRCgafhRkyGzPAL8ismthFJ9RHfeejyskht-9GbVHURw02bUyijuHEulpf9eEY3ZiB28de6jnCdU7ftIYaUMaYWt0nZQGkzxKPSfSLZNy14DTOYLDS04DVstWQPqnCUW_ojg0wJETOOfo9Zw"; - - // Check that RS512, RS384 and RS256 tokens can all be validated - let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?); - - for encoded in [encoded_rs512, encoded_rs384, encoded_rs256] { - let claims_from_token = auth.decode(encoded)?.claims; - assert_eq!(claims_from_token, expected_claims); - } Ok(()) } @@ -199,10 +146,10 @@ ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp scope: Scope::Tenant, }; - let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_RSA)?; + let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?; // decode it back - let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?); + let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?); let decoded = auth.decode(&encoded)?; assert_eq!(decoded.claims, claims); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 848b955af8..8966e8c49b 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -111,7 +111,7 @@ struct Args { /// WAL backup horizon. #[arg(long)] disable_wal_backup: bool, - /// Path to an RSA .pem public key which is used to check JWT tokens. + /// Path to a .pem public key which is used to check JWT tokens. #[arg(long)] auth_validation_public_key_path: Option, /// Format for logging, either 'plain' or 'json'. 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index cd4f0678b5..6429b1e940 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -431,7 +431,7 @@ class AuthKeys: priv: str def generate_token(self, *, scope: str, **token_data: str) -> str: - token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="RS256") + token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") # cast(Any, self.priv) # jwt.encode can return 'bytes' or 'str', depending on Python version or type From 881356c417c56344553763f0e5fffcc1760ef93e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 20 Mar 2023 16:11:36 +0100 Subject: [PATCH 174/426] add metrics to detect eviction-induced thrashing (#3837) This patch adds two metrics that will enable us to detect *thrashing* of layers, i.e., repetitions of `eviction, on-demand-download, eviction, ... ` for a given layer. The first metric counts all layer evictions per timeline. It requires no further explanation. The second metric counts the layer evictions where the layer was resident for less than a given threshold. We can alert on increments to the second metric. The first metric will serve as a baseline, and further, it's generally interesting, outside of thrashing. The second metric's threshold is configurable in PageServerConf and defaults to 24h. The threshold value is reproduced as a label in the metric because the counter's value is semantically tied to that threshold. Since changes to the config and hence the label value are infrequent, this will have low storage overhead in the metrics storage. The data source to determine the time that the layer was resident is the file's `mtime`. Using `mtime` is more of a crutch. It would be better if Pageserver did its own persistent bookkeeping of residence change events instead of relying on the filesystem. 
We had some discussion about this: https://github.com/neondatabase/neon/pull/3809#issuecomment-1470448900 My position is that `mtime` is good enough for now. It can theoretically jump forward if someone copies files without resetting `mtime`. But that shouldn't happen in practice. Note that moving files back and forth doesn't change `mtime`, nor does `chown` or `chmod`. Lastly, `rsync -a`, which is typically used for filesystem-level backup / restore, correctly syncs `mtime`. I've added a label that identifies the data source to keep options open for a future, better data source than `mtime`. Since this value will stay the same for the time being, it's not a problem for metrics storage. refs https://github.com/neondatabase/neon/issues/3728 --- pageserver/src/config.rs | 34 ++++++++++ pageserver/src/metrics.rs | 108 +++++++++++++++++++++++++++++- pageserver/src/tenant/timeline.rs | 33 ++++++++- test_runner/fixtures/metrics.py | 2 + 4 files changed, 173 insertions(+), 4 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 16f35355af..39282ce320 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -61,6 +61,7 @@ pub mod defaults { pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; /// /// Default built-in configuration file. 
@@ -89,6 +90,8 @@ pub mod defaults { #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' +#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -170,6 +173,9 @@ pub struct PageServerConf { pub metric_collection_endpoint: Option, pub synthetic_size_calculation_interval: Duration, + // See the corresponding metric's help string. + pub evictions_low_residence_duration_metric_threshold: Duration, + pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, @@ -240,6 +246,8 @@ struct PageServerConfigBuilder { metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, + evictions_low_residence_duration_metric_threshold: BuilderValue, + test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, @@ -293,6 +301,11 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")), + test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), @@ -408,6 +421,10 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } + pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) { + self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value); + } + pub fn ondemand_download_behavior_treat_error_as_warn( 
&mut self, ondemand_download_behavior_treat_error_as_warn: bool, @@ -481,6 +498,11 @@ impl PageServerConfigBuilder { synthetic_size_calculation_interval: self .synthetic_size_calculation_interval .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, + evictions_low_residence_duration_metric_threshold: self + .evictions_low_residence_duration_metric_threshold + .ok_or(anyhow!( + "missing evictions_low_residence_duration_metric_threshold" + ))?, test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, @@ -670,6 +692,7 @@ impl PageServerConf { "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), + "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?), "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } @@ -810,6 +833,10 @@ impl PageServerConf { cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .unwrap(), test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, } @@ -951,6 +978,9 @@ metric_collection_interval = '222 s' cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' + +evictions_low_residence_duration_metric_threshold = '444 s' + log_format = 'json' "#; @@ -1005,6 +1035,9 @@ log_format = 'json' 
synthetic_size_calculation_interval: humantime::parse_duration( defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD + )?, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, @@ -1056,6 +1089,7 @@ log_format = 'json' cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), + evictions_low_residence_duration_metric_threshold: Duration::from_secs(444), test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6a8aecfd25..b5563ad186 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -194,6 +194,93 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static EVICTIONS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_evictions", + "Number of layers evicted from the pageserver", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_evictions_with_low_residence_duration", + "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \ + Residence duration is determined using the `residence_duration_data_source`.", + &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] + ) + .expect("failed to define a metric") +}); + +/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. 
+#[derive(Debug)]
+pub struct EvictionsWithLowResidenceDuration {
+    data_source: &'static str,
+    threshold: Duration,
+    counter: Option<IntCounter>,
+}
+
+pub struct EvictionsWithLowResidenceDurationBuilder {
+    data_source: &'static str,
+    threshold: Duration,
+}
+
+impl EvictionsWithLowResidenceDurationBuilder {
+    pub fn new(data_source: &'static str, threshold: Duration) -> Self {
+        Self {
+            data_source,
+            threshold,
+        }
+    }
+
+    fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
+        let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+            .get_metric_with_label_values(&[
+                tenant_id,
+                timeline_id,
+                self.data_source,
+                &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
+            ])
+            .unwrap();
+        EvictionsWithLowResidenceDuration {
+            data_source: self.data_source,
+            threshold: self.threshold,
+            counter: Some(counter),
+        }
+    }
+}
+
+impl EvictionsWithLowResidenceDuration {
+    fn threshold_label_value(threshold: Duration) -> String {
+        format!("{}", threshold.as_secs())
+    }
+
+    pub fn observe(&self, observed_value: Duration) {
+        if observed_value < self.threshold { // count only residences shorter than the threshold
+            self.counter
+                .as_ref()
+                .expect("nobody calls this function after `remove_from_vec`")
+                .inc();
+        }
+    }
+
+    // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
+    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
+        let Some(_counter) = self.counter.take() else {
+            return;
+        };
+        EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+            .remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                self.data_source,
+                &Self::threshold_label_value(self.threshold),
+            ])
+            .expect("we own the metric, no-one else should remove it");
+    }
+}
+
 // Metrics collected on disk IO operations
 //
 // Roughly logarithmic scale.
@@ -510,10 +597,16 @@ pub struct TimelineMetrics { pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, pub persistent_bytes_written: IntCounter, + pub evictions: IntCounter, + pub evictions_with_low_residence_duration: EvictionsWithLowResidenceDuration, } impl TimelineMetrics { - pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + pub fn new( + tenant_id: &TenantId, + timeline_id: &TimelineId, + evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, + ) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); let reconstruct_time_histo = RECONSTRUCT_TIME @@ -550,6 +643,11 @@ impl TimelineMetrics { let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let evictions = EVICTIONS + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let evictions_with_low_residence_duration = + evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); TimelineMetrics { tenant_id, @@ -569,6 +667,8 @@ impl TimelineMetrics { current_logical_size_gauge, num_persistent_files_created, persistent_bytes_written, + evictions, + evictions_with_low_residence_duration, } } } @@ -585,7 +685,9 @@ impl Drop for TimelineMetrics { let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); - + let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); + self.evictions_with_low_residence_duration + .remove(tenant_id, timeline_id); for op in STORAGE_TIME_OPERATIONS { let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); @@ -620,7 +722,7 @@ use std::collections::HashMap; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, 
Poll}; -use std::time::Instant; +use std::time::{Duration, Instant}; pub struct RemoteTimelineClientMetrics { tenant_id: String, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0c42ed3079..f5dbe63b0b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1051,6 +1051,22 @@ impl Timeline { .file_size() .expect("Local layer should have a file size"); + let local_layer_mtime = local_layer + .local_path() + .expect("local layer should have a local path") + .metadata() + .context("get local layer file stat")? + .modified() + .context("get mtime of layer file")?; + let local_layer_residence_duration = + match SystemTime::now().duration_since(local_layer_mtime) { + Err(e) => { + warn!("layer mtime is in the future: {}", e); + None + } + Ok(delta) => Some(delta), + }; + let layer_metadata = LayerFileMetadata::new(layer_file_size); let new_remote_layer = Arc::new(match local_layer.filename() { @@ -1093,6 +1109,14 @@ impl Timeline { .resident_physical_size_gauge .sub(layer_file_size); + self.metrics.evictions.inc(); + + if let Some(delta) = local_layer_residence_duration { + self.metrics + .evictions_with_low_residence_duration + .observe(delta); + } + true } Replacement::NotFound => { @@ -1208,7 +1232,14 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new(&tenant_id, &timeline_id), + metrics: TimelineMetrics::new( + &tenant_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + conf.evictions_low_residence_duration_metric_threshold, + ), + ), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 450c02735a..2984f2c7d3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -78,5 +78,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", "pageserver_tenant_states_count", + "pageserver_evictions_total", + "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) From 699f20081171c716ea35e1549294888f15fe6caf Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 Mar 2023 12:49:56 +0400 Subject: [PATCH 175/426] Send error context chain to the client when Copy stream errors. --- libs/postgres_backend/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 4d88b958f0..60932a5950 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -767,7 +767,7 @@ impl PostgresBackend { let err_to_send_and_errcode = match &end { ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), - Other(_) => Some((end.to_string(), SQLSTATE_INTERNAL_ERROR)), + Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)), // Note: CopyFail in duplex copy is somewhat unexpected (at least to // PG walsender; evidently and per my docs reading client should // finish it with CopyDone). It is not a problem to recover from it From 5a786fab4f2877bc94f47ff2e369efd081baf846 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Mar 2023 20:51:32 +0200 Subject: [PATCH 176/426] Remove duplicated global variables in neon extension. Walproposer used to live in the backend, while pagestore_smgr was an extension. But now that both are part of the neon extension, walproposer can access the same 'neon_tenant' and 'neon_timeline' variables as the pageserver_smgr code. 
--- pgxn/neon/libpagestore.c | 8 ++------ pgxn/neon/neon.h | 5 +++++ pgxn/neon/walproposer.c | 28 ++++++++++++---------------- pgxn/neon/walproposer.h | 4 ---- 4 files changed, 19 insertions(+), 26 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3fe6d38251..a3f34247bb 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -536,10 +536,6 @@ pg_init_libpagestore(void) /* substitute password in pageserver_connstring */ page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); - /* Is there more correct way to pass CustomGUC to postgres code? */ - neon_timeline_walproposer = neon_timeline; - neon_tenant_walproposer = neon_tenant; - /* retrieve the token for Safekeeper, if present */ if (safekeeper_token_env != NULL) { if (safekeeper_token_env[0] != '$') { @@ -548,8 +544,8 @@ pg_init_libpagestore(void) errmsg("expected safekeeper auth token environment variable's name starting with $ but found: %s", safekeeper_token_env))); } - neon_safekeeper_token_walproposer = getenv(&safekeeper_token_env[1]); - if (!neon_safekeeper_token_walproposer) { + neon_safekeeper_token = getenv(&safekeeper_token_env[1]); + if (!neon_safekeeper_token) { ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("cannot get safekeeper auth token, environment variable %s is not set", diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 6b9ba372fb..da441b783d 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -12,6 +12,11 @@ #ifndef NEON_H #define NEON_H +/* GUCs */ +extern char *neon_safekeeper_token; +extern char *neon_timeline; +extern char *neon_tenant; + extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index bf8bb02493..aef2465e54 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -78,10 +78,6 @@ int wal_acceptor_reconnect_timeout; int wal_acceptor_connection_timeout; bool am_wal_proposer; 
-char *neon_timeline_walproposer = NULL; -char *neon_tenant_walproposer = NULL; -char *neon_safekeeper_token_walproposer = NULL; - #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" static int n_safekeepers = 0; @@ -514,15 +510,15 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) Safekeeper *sk = &safekeeper[n_safekeepers]; int written = 0; - if (neon_safekeeper_token_walproposer != NULL) { + if (neon_safekeeper_token != NULL) { written = snprintf((char *) &sk->conninfo, MAXCONNINFO, "host=%s port=%s password=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", - sk->host, sk->port, neon_safekeeper_token_walproposer, neon_timeline_walproposer, - neon_tenant_walproposer); + sk->host, sk->port, neon_safekeeper_token, neon_timeline, + neon_tenant); } else { written = snprintf((char *) &sk->conninfo, MAXCONNINFO, "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", - sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer); + sk->host, sk->port, neon_timeline, neon_tenant); } if (written > MAXCONNINFO || written < 0) @@ -550,16 +546,16 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) greetRequest.pgVersion = PG_VERSION_NUM; pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); greetRequest.systemId = systemId; - if (!neon_timeline_walproposer) + if (!neon_timeline) elog(FATAL, "neon.timeline_id is not provided"); - if (*neon_timeline_walproposer != '\0' && - !HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16)) - elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer); - if (!neon_tenant_walproposer) + if (*neon_timeline != '\0' && + !HexDecodeString(greetRequest.timeline_id, neon_timeline, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline); + if (!neon_tenant) elog(FATAL, "neon.tenant_id is not provided"); - if (*neon_tenant_walproposer != '\0' && - !HexDecodeString(greetRequest.tenant_id, 
neon_tenant_walproposer, 16)) - elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer); + if (*neon_tenant != '\0' && + !HexDecodeString(greetRequest.tenant_id, neon_tenant, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant); #if PG_VERSION_NUM >= 150000 /* FIXME don't use hardcoded timeline id */ diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 1abaab2cc6..357d6378f8 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -39,10 +39,6 @@ typedef struct WalProposerConn WalProposerConn; struct WalMessage; typedef struct WalMessage WalMessage; -extern char *neon_timeline_walproposer; -extern char *neon_tenant_walproposer; -extern char *neon_safekeeper_token_walproposer; - /* Possible return values from ReadPGAsync */ typedef enum { From 299db9d0288d28bf6b8205140ce443c5c54e215d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Mar 2023 20:51:36 +0200 Subject: [PATCH 177/426] Simplify and clean up the $NEON_AUTH_TOKEN stuff in compute - Remove the neon.safekeeper_token_env GUC. It was used to set the name of an environment variable, which was then used in pageserver and safekeeper connection strings to in place of the password. Instead, always look up the environment variable called NEON_AUTH_TOKEN. That's what neon.safekeeper_token_env was always set to in practice, and I don't see the need for the extra level of indirection or configurability. - Instead of substituting $NEON_AUTH_TOKEN in the connection strings, pass $NEON_AUTH_TOKEN "out-of-band" as the password, when we connect to the pageserver or safekeepers. That's simpler. - Also use the password from $NEON_AUTH_TOKEN in compute_ctl, when it connects to the pageserver to get the "base backup". 
--- compute_tools/src/compute.rs | 18 +++- control_plane/src/compute.rs | 28 ++---- docs/authentication.md | 26 +++--- pgxn/neon/libpagestore.c | 173 ++++++++--------------------------- pgxn/neon/libpqwalproposer.c | 31 ++++++- pgxn/neon/neon.h | 2 +- pgxn/neon/pagestore_smgr.c | 8 -- pgxn/neon/walproposer.c | 16 +--- pgxn/neon/walproposer.h | 2 +- 9 files changed, 112 insertions(+), 192 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8ceef44d61..09272262de 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -18,6 +18,7 @@ use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::RwLock; @@ -126,7 +127,22 @@ impl ComputeNode { fn get_basebackup(&self, lsn: &str) -> Result<()> { let start_time = Utc::now(); - let mut client = Client::connect(&self.pageserver_connstr, NoTls)?; + let mut config = postgres::Config::from_str(&self.pageserver_connstr)?; + + // Like in the neon extension, if the $NEON_AUTH_TOKEN env variable is + // set, use it as the password when connecting to pageserver. + // + // Note: this overrides any password set in the connection string. 
+ match std::env::var("NEON_AUTH_TOKEN") { + Ok(val) => { + info!("Got pageserver auth token from NEON_AUTH_TOKEN env variable"); + config.password(val); + } + Err(std::env::VarError::NotPresent) => info!("NEON_AUTH_TOKEN env variable not set"), + Err(e) => info!("could not parse NEON_AUTH_TOKEN env variable: {}", e), + }; + + let mut client = config.connect(NoTls)?; let basebackup_cmd = match lsn { "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 094d2add8d..730cacf40b 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -11,7 +11,6 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; -use postgres_backend::AuthType; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, @@ -97,7 +96,7 @@ impl ComputeControlPlane { }); node.create_pgdata()?; - node.setup_pg_conf(self.env.pageserver.pg_auth_type)?; + node.setup_pg_conf()?; self.nodes .insert((tenant_id, node.name.clone()), Arc::clone(&node)); @@ -278,7 +277,7 @@ impl PostgresNode { // Write postgresql.conf with default configuration // and PG_VERSION file to the data directory of a new node. - fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { + fn setup_pg_conf(&self) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); conf.append("wal_log_hints", "off"); @@ -302,29 +301,12 @@ impl PostgresNode { let config = &self.pageserver.pg_connection_config; let (host, port) = (config.host(), config.port()); - // Set up authentication - // - // $NEON_AUTH_TOKEN will be replaced with value from environment - // variable during compute pg startup. 
It is done this way because - // otherwise user will be able to retrieve the value using SHOW - // command or pg_settings - let password = if let AuthType::NeonJWT = auth_type { - "$NEON_AUTH_TOKEN" - } else { - "" - }; - // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere. - // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN - // We parse this string and build it back with token from env var, and for simplicity rebuild - // uses only needed variables namely host, port, user, password. - format!("postgresql://no_user:{password}@{host}:{port}") + // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. + format!("postgresql://no_user@{host}:{port}") }; conf.append("shared_preload_libraries", "neon"); conf.append_line(""); conf.append("neon.pageserver_connstring", &pageserver_connstr); - if let AuthType::NeonJWT = auth_type { - conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN"); - } conf.append("neon.tenant_id", &self.tenant_id.to_string()); conf.append("neon.timeline_id", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { @@ -447,6 +429,8 @@ impl PostgresNode { "DYLD_LIBRARY_PATH", self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ); + + // Pass authentication token used for the connections to pageserver and safekeepers if let Some(token) = auth_token { cmd.env("NEON_AUTH_TOKEN", token); } diff --git a/docs/authentication.md b/docs/authentication.md index dc402d1bca..f768b04c5b 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -106,20 +106,22 @@ Their authentication is just plain PostgreSQL authentication and out of scope fo There is no administrative API except those provided by PostgreSQL. #### Outgoing connections -Compute connects to Pageserver for getting pages. -The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. 
`postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`. -The environment variable inside the connection string is substituted with -the JWT token. +Compute connects to Pageserver for getting pages. The connection string is +configured by the `neon.pageserver_connstring` PostgreSQL GUC, +e.g. `postgresql://no_user@localhost:15028`. If the `$NEON_AUTH_TOKEN` +environment variable is set, it is used as the password for the connection. (The +pageserver uses JWT tokens for authentication, so the password is really a +token.) -Compute connects to Safekeepers to write and commit data. -The token is the same for all safekeepers. -It's stored in an environment variable, whose name is configured -by the `neon.safekeeper_token_env` PostgreSQL GUC. -If the GUC is unset, no token is passed. +Compute connects to Safekeepers to write and commit data. The list of safekeeper +addresses is given in the `neon.safekeepers` GUC. The connections to the +safekeepers take the password from the `$NEON_AUTH_TOKEN` environment +variable, if set. -Note that both tokens can be (and typically are) the same; -the scope is the tenant and the token is usually passed through the -`$NEON_AUTH_TOKEN` environment variable. +The `compute_ctl` binary that runs before the PostgreSQL server, and launches +PostgreSQL, also makes a connection to the pageserver. It uses it to fetch the +initial "base backup" dump, to initialize the PostgreSQL data directory. It also +uses `$NEON_AUTH_TOKEN` as the password for the connection. 
### Pageserver #### Overview diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a3f34247bb..c44e8fcda5 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -46,8 +46,12 @@ PGconn *pageserver_conn = NULL; */ WaitEventSet *pageserver_conn_wes = NULL; -char *page_server_connstring_raw; -char *safekeeper_token_env; +/* GUCs */ +char *neon_timeline; +char *neon_tenant; +int32 max_cluster_size; +char *page_server_connstring; +char *neon_auth_token; int n_unflushed_requests = 0; int flush_every_n_requests = 8; @@ -60,10 +64,37 @@ pageserver_connect(int elevel) { char *query; int ret; + const char *keywords[3]; + const char *values[3]; + int n; Assert(!connected); - pageserver_conn = PQconnectdb(page_server_connstring); + /* + * Connect using the connection string we got from the + * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment + * variable was set, use that as the password. + * + * The connection options are parsed in the order they're given, so + * when we set the password before the connection string, the + * connection string can override the password from the env variable. + * Seems useful, although we don't currently use that capability + * anywhere. 
+ */ + n = 0; + if (neon_auth_token) + { + keywords[n] = "password"; + values[n] = neon_auth_token; + n++; + } + keywords[n] = "dbname"; + values[n] = page_server_connstring; + n++; + keywords[n] = NULL; + values[n] = NULL; + n++; + pageserver_conn = PQconnectdbParams(keywords, values, 1); if (PQstatus(pageserver_conn) == CONNECTION_BAD) { @@ -125,7 +156,7 @@ pageserver_connect(int elevel) } } - neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring); connected = true; return true; @@ -354,105 +385,6 @@ check_neon_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(id, *newval, 16); } -static char * -substitute_pageserver_password(const char *page_server_connstring_raw) -{ - char *host = NULL; - char *port = NULL; - char *user = NULL; - char *auth_token = NULL; - char *err = NULL; - char *page_server_connstring = NULL; - PQconninfoOption *conn_options; - PQconninfoOption *conn_option; - MemoryContext oldcontext; - - /* - * Here we substitute password in connection string with an environment - * variable. To simplify things we construct a connection string back with - * only known options. In particular: host port user and password. We do - * not currently use other options and constructing full connstring in an - * URI shape is quite messy. - */ - - if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') - return NULL; - - /* extract the auth token from the connection string */ - conn_options = PQconninfoParse(page_server_connstring_raw, &err); - if (conn_options == NULL) - { - /* The error string is malloc'd, so we must free it explicitly */ - char *errcopy = err ? 
pstrdup(err) : "out of memory"; - - PQfreemem(err); - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid connection string syntax: %s", errcopy))); - } - - /* - * Trying to populate pageserver connection string with auth token from - * environment. We are looking for password in with placeholder value like - * $ENV_VAR_NAME, so if password field is present and starts with $ we try - * to fetch environment variable value and fail loudly if it is not set. - */ - for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) - { - if (strcmp(conn_option->keyword, "host") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - host = conn_option->val; - } - else if (strcmp(conn_option->keyword, "port") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - port = conn_option->val; - } - else if (strcmp(conn_option->keyword, "user") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - user = conn_option->val; - } - else if (strcmp(conn_option->keyword, "password") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - { - /* ensure that this is a template */ - if (strncmp(conn_option->val, "$", 1) != 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); - - neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); - auth_token = getenv(&conn_option->val[1]); - if (!auth_token) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); - } - else - { - neon_log(LOG, "using auth token from environment passed via env"); - } - } - } - } - - /* - * allocate connection string in TopMemoryContext to make sure it is not - * freed - */ - oldcontext = CurrentMemoryContext; - 
MemoryContextSwitchTo(TopMemoryContext); - page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port); - MemoryContextSwitchTo(oldcontext); - - PQconninfoFree(conn_options); - return page_server_connstring; -} - /* * Module initialization function */ @@ -462,21 +394,12 @@ pg_init_libpagestore(void) DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, - &page_server_connstring_raw, + &page_server_connstring, "", PGC_POSTMASTER, 0, /* no flags required */ NULL, NULL, NULL); - DefineCustomStringVariable("neon.safekeeper_token_env", - "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN", - NULL, - &safekeeper_token_env, - NULL, - PGC_POSTMASTER, - 0, /* no flags required */ - NULL, NULL, NULL); - DefineCustomStringVariable("neon.timeline_id", "Neon timeline_id the server is running on", NULL, @@ -533,26 +456,10 @@ pg_init_libpagestore(void) neon_log(PageStoreTrace, "libpagestore already loaded"); page_server = &api; - /* substitute password in pageserver_connstring */ - page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); - - /* retrieve the token for Safekeeper, if present */ - if (safekeeper_token_env != NULL) { - if (safekeeper_token_env[0] != '$') { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected safekeeper auth token environment variable's name starting with $ but found: %s", - safekeeper_token_env))); - } - neon_safekeeper_token = getenv(&safekeeper_token_env[1]); - if (!neon_safekeeper_token) { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get safekeeper auth token, environment variable %s is not set", - &safekeeper_token_env[1]))); - } - neon_log(LOG, "using safekeeper auth token from environment variable"); - } + /* Retrieve the auth token to use when connecting to pageserver and 
safekeepers */ + neon_auth_token = getenv("NEON_AUTH_TOKEN"); + if (neon_auth_token) + neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable"); if (page_server_connstring && page_server_connstring[0]) { diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c index 6b1e6a8bcc..9b6175a621 100644 --- a/pgxn/neon/libpqwalproposer.c +++ b/pgxn/neon/libpqwalproposer.c @@ -51,12 +51,39 @@ walprop_status(WalProposerConn *conn) } WalProposerConn * -walprop_connect_start(char *conninfo) +walprop_connect_start(char *conninfo, char *password) { WalProposerConn *conn; PGconn *pg_conn; + const char *keywords[3]; + const char *values[3]; + int n; - pg_conn = PQconnectStart(conninfo); + /* + * Connect using the given connection string. If the + * NEON_AUTH_TOKEN environment variable was set, use that as + * the password. + * + * The connection options are parsed in the order they're given, so + * when we set the password before the connection string, the + * connection string can override the password from the env variable. + * Seems useful, although we don't currently use that capability + * anywhere. + */ + n = 0; + if (password) + { + keywords[n] = "password"; + values[n] = neon_auth_token; + n++; + } + keywords[n] = "dbname"; + values[n] = conninfo; + n++; + keywords[n] = NULL; + values[n] = NULL; + n++; + pg_conn = PQconnectStartParams(keywords, values, 1); /* * Allocation of a PQconn can fail, and will return NULL. 
We want to fully diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index da441b783d..3eac8f4570 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -13,7 +13,7 @@ #define NEON_H /* GUCs */ -extern char *neon_safekeeper_token; +extern char *neon_auth_token; extern char *neon_timeline; extern char *neon_tenant; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ca91112195..5b30641856 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -92,14 +92,6 @@ const int SmgrTrace = DEBUG5; page_server_api *page_server; -/* GUCs */ -char *page_server_connstring; - -/*with substituted password*/ -char *neon_timeline; -char *neon_tenant; -int32 max_cluster_size; - /* unlogged relation build states */ typedef enum { diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index aef2465e54..b0b2a23e3c 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -510,17 +510,9 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) Safekeeper *sk = &safekeeper[n_safekeepers]; int written = 0; - if (neon_safekeeper_token != NULL) { - written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s password=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", - sk->host, sk->port, neon_safekeeper_token, neon_timeline, - neon_tenant); - } else { - written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", - sk->host, sk->port, neon_timeline, neon_tenant); - } - + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, + "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + sk->host, sk->port, neon_timeline, neon_tenant); if (written > MAXCONNINFO || written < 0) elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } @@ -696,7 +688,7 @@ ResetConnection(Safekeeper *sk) /* * Try to establish new connection */ - sk->conn = walprop_connect_start((char *) 
&sk->conninfo); + sk->conn = walprop_connect_start((char *) &sk->conninfo, neon_auth_token); /* * "If the result is null, then libpq has been unable to allocate a new diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 357d6378f8..537c733850 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -454,7 +454,7 @@ extern char *walprop_error_message(WalProposerConn *conn); extern WalProposerConnStatusType walprop_status(WalProposerConn *conn); /* Re-exported PQconnectStart */ -extern WalProposerConn * walprop_connect_start(char *conninfo); +extern WalProposerConn * walprop_connect_start(char *conninfo, char *password); /* Re-exported PQconectPoll */ extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn); From 809acb5fa93411f44091ab6a56dddeffdcee0d89 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Tue, 21 Mar 2023 19:32:36 +0200 Subject: [PATCH 178/426] Move neon-image-depot to a larger runner (#3860) ## Describe your changes https://neondb.slack.com/archives/C039YKBRZB4/p1679413279637059 ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e056cf0fcf..d50a42d83c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -552,7 +552,7 @@ jobs: neon-image-depot: # For testing this will run side-by-side for a few merges. 
# This action is not really optimized yet, but gets the job done - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned permissions: From 4158e24e60d294e0f039395ea95dd87f8ab317d9 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 21 Mar 2023 20:03:27 +0200 Subject: [PATCH 179/426] rfc: delete pageserver data from s3 (#3792) [Rendered](https://github.com/neondatabase/neon/blob/main/docs/rfcs/022-pageserver-delete-from-s3.md) --------- Co-authored-by: Joonas Koivunen --- docs/rfcs/022-pageserver-delete-from-s3.md | 269 +++++++++++++++++++++ 1 file changed, 269 insertions(+) create mode 100644 docs/rfcs/022-pageserver-delete-from-s3.md diff --git a/docs/rfcs/022-pageserver-delete-from-s3.md b/docs/rfcs/022-pageserver-delete-from-s3.md new file mode 100644 index 0000000000..260e549670 --- /dev/null +++ b/docs/rfcs/022-pageserver-delete-from-s3.md @@ -0,0 +1,269 @@ +# Deleting pageserver part of tenants data from s3 + +Created on 08.03.23 + +## Motivation + +Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). + +This RFC aims to spin a discussion to come to a robust deletion solution that wont put us in into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident) + +## Summary + +TLDR; There are two options, one based on control plane issuing actual delete requests to s3 and the other one that keeps s3 stuff bound to pageserver. Each one has its pros and cons. + +The decision is to stick with pageserver centric approach. For motivation see [Decision](#decision). 
+ +## Components + +pageserver, control-plane + +## Requirements + +Deletion should successfully finish (eventually) without leaving dangling files in the presence of: + +- component restarts +- component outage +- pageserver loss + +## Proposed implementation + +Before the options are discussed, note that deletion can be quite a long process. For deletion from s3 the obvious choice is [DeleteObjects](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) API call. It allows batching deletion of up to 1k objects in one API call. So deletion operation linearly depends on number of layer files. + +Another design limitation is that there is no cheap `mv` operation available for s3. `mv` from `aws s3 mv` uses `copy(src, dst) + delete(src)`. So `mv`-like operation is not feasible as a building block because it actually amplifies the problem with both duration and resulting cost of the operation. + +The case when there are multiple pageservers handling the same tenants is largely out of scope of the RFC. We still consider case with migration from one PS to another, but do not consider case when tenant exists on multiple pageservers for extended period of time. The case with multiple pageservers can be reduced to case with one pageserver by calling detach on all pageservers except the last one, for which the actual delete needs to be called. + +For simplicity let's look into deleting tenants. Differences in deletion process between tenants and timelines are mentioned in paragraph ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines) + +### 1. Pageserver owns deletion machinery + +#### The sequence + +TLDR; With this approach control plane needs to call delete on a tenant and poll for progress. As much as possible is handled on pageserver. Let's see the sequence.
+ +Happy path: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS as Pageserver + participant S3 + + CP->>PS: Delete tenant + PS->>S3: Create deleted mark file at
/tenant/meta/deleted + PS->>PS: Create deleted mark file locally + PS->>CP: Accepted + PS->>PS: delete local files other than deleted mark + loop Delete layers for each timeline + PS->>S3: delete(..) + CP->>PS: Finished? + PS->>CP: False + end + PS->>S3: Delete mark file + PS->>PS: Delete local mark file + + loop Poll for status + CP->>PS: Finished? + PS->>CP: True or False + end +``` + +Why two mark files? +Remote one is needed for cases when pageserver is lost during deletion so other pageserver can learn the deletion from s3 during attach. + +Why local mark file is needed? + +If we don't have one, we have two choices, delete local data before deleting the remote part or do that after. + +If we delete local data before remote then during restart pageserver won't pick up remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants). + +If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same as the situation where pageserver is just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote. + +Thus we need local record of tenant being deleted as well. + +##### Handle pageserver crashes + +Let's explore sequences with various crash points. + +Pageserver crashes before `deleted` mark file is persisted in s3: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS as Pageserver + participant S3 + + CP->>PS: Delete tenant + note over PS: Crash point 1. + CP->>PS: Retry delete request + + PS->>S3: Create deleted mark file at
/tenant/meta/deleted + PS->>PS: Create deleted mark file locally + + PS->>CP: Accepted + + PS->>PS: delete local files other than deleted mark + + loop Delete layers for each timeline + PS->>S3: delete(..) + CP->>PS: Finished? + PS->>CP: False + end + PS->>S3: Delete mark file + PS->>PS: Delete local mark file + + CP->>PS: Finished? + PS->>CP: True +``` + +Pageserver crashed when deleted mark was about to be persisted in s3, before Control Plane gets a response: + +```mermaid +sequenceDiagram + autonumber + participant CP as Control Plane + participant PS as Pageserver + participant S3 + + CP->>PS: Delete tenant + PS->>S3: Create deleted mark file at
/tenant/meta/deleted + + note over PS: Crash point 2. + note over PS: During startup we reconcile
with remote and see
whether the remote mark exists + alt Remote mark exists + PS->>PS: create local mark if it's missing + PS->>PS: delete local files other than deleted mark + loop Delete layers for each timeline + PS->>S3: delete(..) + end + + note over CP: Eventually console should
retry delete request + + CP->>PS: Retry delete tenant + PS->>CP: Not modified + else Mark is missing + note over PS: Continue to operate the tenant as if deletion didn't happen + + note over CP: Eventually console should
retry delete request + + CP->>PS: Retry delete tenant + PS->>S3: Create deleted mark file at
/tenant/meta/deleted + PS->>CP: Delete tenant + end + + PS->>PS: Continue with layer file deletions + loop Delete layers for each timeline + PS->>S3: delete(..) + CP->>PS: Finished? + PS->>CP: False + end + + PS->>S3: Delete mark file + PS->>PS: Delete local mark file + + CP->>PS: Finished? + PS->>CP: True +``` + +Similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response. + +If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success. + +The same applies if pageserver crashes in the end, when remote mark is deleted but before local one gets deleted. In this case on restart pageserver moves forward with deletion of local mark and Control Plane will receive 404. + +##### Differences between tenants and timelines + +For timeline the sequence is the same with the following differences: + +- remote delete mark file can be replaced with a boolean "deleted" flag in index_part.json +- local deletion mark is not needed, because whole tenant is kept locally so situation described in motivation for local mark is impossible + +##### Handle pageserver loss + +If pageserver is lost then the deleted tenant should be attached to different pageserver and delete request needs to be retried against new pageserver. Then attach logic is shared with one described for pageserver restarts (local deletion mark won't be available so needs to be created). + +##### Restrictions for tenant that is in progress of being deleted + +I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status. + +#### Summary + +Pros: + +- Storage is not dependent on control plane. Storage can be restarted even if control plane is not working.
+- Allows for easier dogfooding, console can use Neon backed database as primary operational data store. If storage depends on control plane and control plane depends on storage we're stuck. +- No need to share inner s3 workings with control plane. Pageserver presents api contract and S3 paths are not part of this contract. +- No need to pass list of alive timelines to attach call. This will be solved by pageserver observing deleted flag. See + +Cons: + +- Logic is tricky, needs good testing +- Anything else? + +### 2. Control plane owns deletion machinery + +In this case the only action performed on pageserver is removal of local files. + +Everything else is done by control plane. The steps are as follows: + +1. Control plane marks tenant as "delete pending" in its database +2. It lists the s3 for all the files and repeatedly calls delete until nothing is left behind +3. When no files are left it marks deletion as completed + +In case of restart it selects all tenants marked as "delete pending" and continues the deletion. + +For tenants it is simple. For timelines there are caveats. + +Assume that the same workflow is used for timelines. + +If a tenant gets relocated during timeline deletion the attach call with its current logic will pick up deleted timeline in its half deleted state. + +Available options: + +- require list of alive timelines to be passed to attach call +- use the same schema with flag in index_part.json (again part of the caveats around pageserver restart applies). In this case nothing stops pageserver from implementing deletion inside if we already have these deletion marks. + +With first option the following problem becomes apparent: + +Who is the source of truth regarding timeline liveness? + +Imagine: +PS1 fails. +PS2 gets assigned the tenant. +New branch gets created +PS1 starts up (is it possible or we just recycle it?) +PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane.
+ +So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage needs to ask control plane. + +### Summary + +Cons: + +- Potential thundering herd-like problem during storage restart (requests to control plane) +- Potential increase in storage startup time (additional request to control plane) +- Storage startup starts to depend on console +- Erroneous attach call can attach tenant in half deleted state + +Pros: + +- Easier to reason about if you don't have to account for pageserver restarts + +### Extra notes + +There was a concern that having deletion code in pageserver is a little bit scary, but we need to have this code somewhere. So to me it is equally scary to have that in whatever place it ends up at. + +Delayed deletion can be done with both approaches. As discussed with Anna (@stepashka) this is only relevant for tenants (projects) not for timelines. For first approach detach can be called immediately and deletion can be done later with attach + delete. With second approach control plane needs to start the deletion whenever necessary. + +## Decision + +After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete. + +To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge about paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there.
e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes. + +With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo. + +So the decision is to proceed with pageserver centric approach. From 6fdd9c10d18270a5e30704f17e573ea14ee978ce Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 21 Mar 2023 15:24:49 +0200 Subject: [PATCH 180/426] Read storage auth token from spec file. We read the pageserver connection string from the spec file, so let's read the auth token from the same place. We've been talking about pre-launching compute nodes that are not associated with any particular tenant at startup, so that the spec file is delivered to the compute node later. We cannot change the env variables after the process has been launched. We still pass the token to 'postgres' binary in the NEON_AUTH_TOKEN env variable, but compute_ctl is now responsible for setting it. 
--- compute_tools/src/bin/compute_ctl.rs | 2 ++ compute_tools/src/compute.rs | 29 +++++++++++++++++----------- compute_tools/src/spec.rs | 2 ++ 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a4e9262072..b96842e416 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -133,6 +133,7 @@ fn main() -> Result<()> { .settings .find("neon.pageserver_connstring") .expect("pageserver connstr should be provided"); + let storage_auth_token = spec.storage_auth_token.clone(); let tenant = spec .cluster .settings @@ -153,6 +154,7 @@ fn main() -> Result<()> { tenant, timeline, pageserver_connstr, + storage_auth_token, metrics: ComputeMetrics::default(), state: RwLock::new(ComputeState::new()), }; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 09272262de..00d1e234ab 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -45,6 +45,7 @@ pub struct ComputeNode { pub tenant: String, pub timeline: String, pub pageserver_connstr: String, + pub storage_auth_token: Option, pub metrics: ComputeMetrics, /// Volatile part of the `ComputeNode` so should be used under `RwLock` /// to allow HTTP API server to serve status requests, while configuration @@ -129,18 +130,14 @@ impl ComputeNode { let mut config = postgres::Config::from_str(&self.pageserver_connstr)?; - // Like in the neon extension, if the $NEON_AUTH_TOKEN env variable is - // set, use it as the password when connecting to pageserver. - // + // Use the storage auth token from the config file, if given. // Note: this overrides any password set in the connection string. 
- match std::env::var("NEON_AUTH_TOKEN") { - Ok(val) => { - info!("Got pageserver auth token from NEON_AUTH_TOKEN env variable"); - config.password(val); - } - Err(std::env::VarError::NotPresent) => info!("NEON_AUTH_TOKEN env variable not set"), - Err(e) => info!("could not parse NEON_AUTH_TOKEN env variable: {}", e), - }; + if let Some(storage_auth_token) = &self.storage_auth_token { + info!("Got storage auth token from spec file"); + config.password(storage_auth_token); + } else { + info!("Storage auth token not set"); + } let mut client = config.connect(NoTls)?; let basebackup_cmd = match lsn { @@ -179,6 +176,11 @@ impl ComputeNode { let sync_handle = Command::new(&self.pgbin) .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .envs(if let Some(storage_auth_token) = &self.storage_auth_token { + vec![("NEON_AUTH_TOKEN", storage_auth_token)] + } else { + vec![] + }) .stdout(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); @@ -256,6 +258,11 @@ impl ComputeNode { // Run postgres as a child process. 
let mut pg = Command::new(&self.pgbin) .args(["-D", &self.pgdata]) + .envs(if let Some(storage_auth_token) = &self.storage_auth_token { + vec![("NEON_AUTH_TOKEN", storage_auth_token)] + } else { + vec![] + }) .spawn() .expect("cannot start postgres process"); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 47f1d69cff..9694ba9a88 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -24,6 +24,8 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, + pub storage_auth_token: Option, + pub startup_tracing_context: Option>, } From dd22c871003275d1087a9a5a4948f030ad6a8eda Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 21 Mar 2023 23:33:28 +0200 Subject: [PATCH 181/426] Remove older layer metadata format support code (#3854) The PR enforces current newest `index_part.json` format in the type system (version `1`), not allowing any previous forms of it, that were used in the past. Similarly, the code to mitigate the https://github.com/neondatabase/neon/issues/3024 issue is now also removed. Current code does not produce old formats and extra files in the index_part.json, in the future we will be able to use https://github.com/neondatabase/aversion or other approach to make version transitions more explicit. See https://neondb.slack.com/archives/C033RQ5SPDH/p1679134185248119 for the justification on the breaking changes. 
--- libs/pageserver_api/src/models.rs | 4 +- pageserver/src/http/routes.rs | 4 +- .../src/tenant/remote_timeline_client.rs | 38 +-- .../tenant/remote_timeline_client/download.rs | 21 +- .../tenant/remote_timeline_client/index.rs | 219 +++++------------- .../tenant/remote_timeline_client/upload.rs | 10 +- pageserver/src/tenant/storage_layer.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 6 +- .../src/tenant/storage_layer/filename.rs | 9 + .../src/tenant/storage_layer/image_layer.rs | 6 +- .../src/tenant/storage_layer/remote_layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 88 +++---- pageserver/src/tenant/upload_queue.rs | 15 +- .../test_tenants_with_remote_storage.py | 200 ---------------- 14 files changed, 132 insertions(+), 492 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 7a43100ba5..0f860d0a6d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -346,7 +346,7 @@ pub enum InMemoryLayerInfo { pub enum HistoricLayerInfo { Delta { layer_file_name: String, - layer_file_size: Option, + layer_file_size: u64, #[serde_as(as = "DisplayFromStr")] lsn_start: Lsn, @@ -357,7 +357,7 @@ pub enum HistoricLayerInfo { }, Image { layer_file_name: String, - layer_file_size: Option, + layer_file_size: u64, #[serde_as(as = "DisplayFromStr")] lsn_start: Lsn, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 39f2776952..d91e421a52 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -185,7 +185,7 @@ fn build_timeline_info_common( None } }; - let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok()); + let current_physical_size = Some(timeline.layer_size_sum()); let state = timeline.current_state(); let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); @@ -451,7 +451,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro // Calculate total physical size of all 
timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { - current_physical_size += timeline.layer_size_sum().approximate_is_ok(); + current_physical_size += timeline.layer_size_sum(); } let state = tenant.current_state(); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index f3943298f2..28c4943dbd 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -210,7 +210,6 @@ pub use download::{is_temp_download_file, list_remote_timelines}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; -use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; use std::ops::DerefMut; use tokio::runtime::Runtime; @@ -347,7 +346,7 @@ impl RemoteTimelineClient { .layer_metadata .values() // If we don't have the file size for the layer, don't account for it in the metric. - .map(|ilmd| ilmd.file_size.unwrap_or(0)) + .map(|ilmd| ilmd.file_size) .sum() } else { 0 @@ -420,34 +419,6 @@ impl RemoteTimelineClient { .await? }; - // Update the metadata for given layer file. The remote index file - // might be missing some information for the file; this allows us - // to fill in the missing details. - if layer_metadata.file_size().is_none() { - let new_metadata = LayerFileMetadata::new(downloaded_size); - let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; - if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { - if upgraded.merge(&new_metadata) { - upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; - } - // If we don't do an index file upload inbetween here and restart, - // the value will go back down after pageserver restart, since we will - // have lost this data point. - // But, we upload index part fairly frequently, and restart pageserver rarely. 
- // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner. - self.metrics - .remote_physical_size_gauge() - .add(downloaded_size); - } else { - // The file should exist, since we just downloaded it. - warn!( - "downloaded file {:?} not found in local copy of the index file", - layer_file_name - ); - } - } - REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc(); REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size); @@ -550,13 +521,6 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - // The file size can be missing for files that were created before we tracked that - // in the metadata, but it should be present for any new files we create. - ensure!( - layer_metadata.file_size().is_some(), - "file size not initialized in metadata" - ); - upload_queue .latest_files .insert(layer_file_name.clone(), layer_metadata.clone()); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index ea8d9858c3..bda095d850 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -21,7 +21,7 @@ use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; -use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata}; +use super::index::{IndexPart, LayerFileMetadata}; use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD}; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { @@ -113,16 +113,11 @@ pub async fn download_layer_file<'a>( }) .map_err(DownloadError::Other)?; - match layer_metadata.file_size() { - Some(expected) if expected != bytes_amount => { - return Err(DownloadError::Other(anyhow!( - "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file 
'{}'", - temp_file_path.display() - ))); - } - Some(_) | None => { - // matches, or upgrading from an earlier IndexPart version - } + let expected = layer_metadata.file_size(); + if expected != bytes_amount { + return Err(DownloadError::Other(anyhow!( + "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}", + ))); } // not using sync_data because it can lose file size update @@ -261,14 +256,12 @@ pub(super) async fn download_index_part( ) .await?; - let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes) + let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| { format!("Failed to deserialize index part file into file {index_part_path:?}") }) .map_err(DownloadError::Other)?; - let index_part = index_part.remove_unclean_layer_file_names(); - Ok(index_part) } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 420edae6cd..9c84f8e977 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -6,7 +6,6 @@ use std::collections::{HashMap, HashSet}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use tracing::warn; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerFileName; @@ -20,7 +19,7 @@ use utils::lsn::Lsn; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] #[cfg_attr(test, derive(Default))] pub struct LayerFileMetadata { - file_size: Option, + file_size: u64, } impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { @@ -33,36 +32,16 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { impl LayerFileMetadata { pub fn new(file_size: u64) -> Self { - LayerFileMetadata { - file_size: Some(file_size), - } + LayerFileMetadata { file_size } } - /// This is used to initialize the metadata for remote layers, 
for which - /// the metadata was missing from the index part file. - pub const MISSING: Self = LayerFileMetadata { file_size: None }; - - pub fn file_size(&self) -> Option { + pub fn file_size(&self) -> u64 { self.file_size } - - /// Metadata has holes due to version upgrades. This method is called to upgrade self with the - /// other value. - /// - /// This is called on the possibly outdated version. Returns true if any changes - /// were made. - pub fn merge(&mut self, other: &Self) -> bool { - let mut changed = false; - - if self.file_size != other.file_size { - self.file_size = other.file_size.or(self.file_size); - changed = true; - } - - changed - } } +// TODO seems like another part of the remote storage file format +// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 /// In-memory representation of an `index_part.json` file /// /// Contains the data about all files in the timeline, present remotely and its metadata. @@ -71,10 +50,7 @@ impl LayerFileMetadata { /// remember to add a test case for the changed version. #[serde_as] #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct IndexPartImpl -where - L: std::hash::Hash + PartialEq + Eq, -{ +pub struct IndexPart { /// Debugging aid describing the version of this type. #[serde(default)] version: usize, @@ -82,14 +58,13 @@ where /// Layer names, which are stored on the remote storage. /// /// Additional metadata can might exist in `layer_metadata`. - pub timeline_layers: HashSet, + pub timeline_layers: HashSet, /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - #[serde(default = "HashMap::default")] - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated here for convenience. 
@@ -98,101 +73,6 @@ where metadata_bytes: Vec, } -// TODO seems like another part of the remote storage file format -// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 -pub type IndexPart = IndexPartImpl; - -pub type IndexPartUnclean = IndexPartImpl; - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum UncleanLayerFileName { - Clean(LayerFileName), - BackupFile(String), -} - -impl<'de> serde::Deserialize<'de> for UncleanLayerFileName { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_string(UncleanLayerFileNameVisitor) - } -} - -struct UncleanLayerFileNameVisitor; - -impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor { - type Value = UncleanLayerFileName; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - write!( - formatter, - "a string that is a valid LayerFileName or '.old' backup file name" - ) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - let maybe_clean: Result = v.parse(); - match maybe_clean { - Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)), - Err(e) => { - if v.ends_with(".old") || v == "metadata_backup" { - Ok(UncleanLayerFileName::BackupFile(v.to_owned())) - } else { - Err(E::custom(e)) - } - } - } - } -} - -impl UncleanLayerFileName { - fn into_clean(self) -> Option { - match self { - UncleanLayerFileName::Clean(clean) => Some(clean), - UncleanLayerFileName::BackupFile(_) => None, - } - } -} - -impl IndexPartUnclean { - pub fn remove_unclean_layer_file_names(self) -> IndexPart { - let IndexPartUnclean { - version, - timeline_layers, - layer_metadata, - disk_consistent_lsn, - metadata_bytes, - } = self; - - IndexPart { - version, - timeline_layers: timeline_layers - .into_iter() - .filter_map(|unclean_file_name| match unclean_file_name { - UncleanLayerFileName::Clean(clean_name) => Some(clean_name), - UncleanLayerFileName::BackupFile(backup_file_name) => { - // For 
details see https://github.com/neondatabase/neon/issues/3024 - warn!( - "got backup file on the remote storage, ignoring it {backup_file_name}" - ); - None - } - }) - .collect(), - layer_metadata: layer_metadata - .into_iter() - .filter_map(|(l, m)| l.into_clean().map(|l| (l, m))) - .collect(), - disk_consistent_lsn, - metadata_bytes, - } - } -} - impl IndexPart { /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be /// used to understand later versions. @@ -232,7 +112,7 @@ impl IndexPart { /// Serialized form of [`LayerFileMetadata`]. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] pub struct IndexLayerMetadata { - pub(super) file_size: Option, + pub(super) file_size: u64, } impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { @@ -247,27 +127,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { mod tests { use super::*; - #[test] - fn v0_indexpart_is_parsed() { - let example = r#"{ - "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "disk_consistent_lsn":"0/16960E8", - 
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] - }"#; - - let expected = IndexPart { - version: 0, - timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), - layer_metadata: HashMap::default(), - disk_consistent_lsn: "0/16960E8".parse::().unwrap(), - metadata_bytes: 
[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), - }; - - let part: IndexPartUnclean = serde_json::from_str(example).unwrap(); - let part = part.remove_unclean_layer_file_names(); - assert_eq!(part, expected); - } - #[test] fn v1_indexpart_is_parsed() { let example = r#"{ @@ -287,21 +146,19 @@ mod tests { timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { - file_size: Some(25600000), + file_size: 25600000, }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
- file_size: Some(9007199254741001), + file_size: 9007199254741001, }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), }; - let part = serde_json::from_str::(example) - .unwrap() - .remove_unclean_layer_file_names(); + let part = serde_json::from_str::(example).unwrap(); assert_eq!(part, expected); } @@ -325,20 +182,64 @@ mod tests { timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { - file_size: Some(25600000), + file_size: 25600000, }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should 
always parse this but this might be a double with jq for // example. - file_size: Some(9007199254741001), + file_size: 9007199254741001, }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), }; - let part = serde_json::from_str::(example).unwrap(); - let part = part.remove_unclean_layer_file_names(); + let part = serde_json::from_str::(example).unwrap(); assert_eq!(part, expected); } + + #[test] + fn empty_layers_are_parsed() { + let empty_layers_json = r#"{ + "version":1, + "timeline_layers":[], + "layer_metadata":{}, + "disk_consistent_lsn":"0/2532648", + 
"metadata_bytes":[136,151,49,208,0,70,0,4,0,0,0,0,2,83,38,72,1,0,0,0,0,2,83,38,32,1,87,198,240,135,97,119,45,125,38,29,155,161,140,141,255,210,0,0,0,0,2,83,38,72,0,0,0,0,1,73,240,192,0,0,0,0,1,73,240,192,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + version: 1, + timeline_layers: HashSet::new(), + layer_metadata: HashMap::new(), + disk_consistent_lsn: "0/2532648".parse::().unwrap(), + metadata_bytes: [ + 136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83, + 38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255, + 210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73, + 240, 192, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, + ] + .to_vec(), + }; + + let empty_layers_parsed = serde_json::from_str::(empty_layers_json).unwrap(); + + assert_eq!(empty_layers_parsed, expected); + } } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 5082fa1634..ce9f4d9bf8 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -64,13 +64,9 @@ pub(super) async fn upload_timeline_layer<'a>( })? 
.len(); - // FIXME: this looks bad - if let Some(metadata_size) = known_metadata.file_size() { - if metadata_size != fs_size { - bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); - } - } else { - // this is a silly state we would like to avoid + let metadata_size = known_metadata.file_size(); + if metadata_size != fs_size { + bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); } let fs_size = usize::try_from(fs_size).with_context(|| { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 52ce2cab42..c36b6121c0 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -378,7 +378,7 @@ pub trait PersistentLayer: Layer { /// /// Should not change over the lifetime of the layer object because /// current_physical_size is computed as the som of this value. - fn file_size(&self) -> Option; + fn file_size(&self) -> u64; fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 37719dfce5..98cbcc5f07 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -444,8 +444,8 @@ impl PersistentLayer for DeltaLayer { Ok(()) } - fn file_size(&self) -> Option { - Some(self.file_size) + fn file_size(&self) -> u64 { + self.file_size } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { @@ -456,7 +456,7 @@ impl PersistentLayer for DeltaLayer { HistoricLayerInfo::Delta { layer_file_name, - layer_file_size: Some(self.file_size), + layer_file_size: self.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, remote: false, diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index 
efd0769886..e2112fc388 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -258,6 +258,15 @@ impl serde::Serialize for LayerFileName { } } +impl<'de> serde::Deserialize<'de> for LayerFileName { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_string(LayerFileNameVisitor) + } +} + struct LayerFileNameVisitor; impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index e37e001eda..a99b1b491f 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -258,8 +258,8 @@ impl PersistentLayer for ImageLayer { Ok(()) } - fn file_size(&self) -> Option { - Some(self.file_size) + fn file_size(&self) -> u64 { + self.file_size } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { @@ -268,7 +268,7 @@ impl PersistentLayer for ImageLayer { HistoricLayerInfo::Image { layer_file_name, - layer_file_size: Some(self.file_size), + layer_file_size: self.file_size, lsn_start: lsn_range.start, remote: false, access_stats: self.access_stats.as_api_model(reset), diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index dbce2e7888..2eb7eb0cb6 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -167,7 +167,7 @@ impl PersistentLayer for RemoteLayer { true } - fn file_size(&self) -> Option { + fn file_size(&self) -> u64 { self.layer_metadata.file_size() } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f5dbe63b0b..4d03a78883 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -334,25 +334,6 @@ impl LogicalSize { } } -/// Returned by 
[`Timeline::layer_size_sum`] -pub enum LayerSizeSum { - /// The result is accurate. - Accurate(u64), - // We don't know the layer file size of one or more layers. - // They contribute to the sum with a value of 0. - // Hence, the sum is a lower bound for the actualy layer file size sum. - ApproximateLowerBound(u64), -} - -impl LayerSizeSum { - pub fn approximate_is_ok(self) -> u64 { - match self { - LayerSizeSum::Accurate(v) => v, - LayerSizeSum::ApproximateLowerBound(v) => v, - } - } -} - pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, @@ -550,20 +531,13 @@ impl Timeline { /// The sum of the file size of all historic layers in the layer map. /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. - pub fn layer_size_sum(&self) -> LayerSizeSum { + pub fn layer_size_sum(&self) -> u64 { let layer_map = self.layers.read().unwrap(); let mut size = 0; - let mut no_size_cnt = 0; for l in layer_map.iter_historic_layers() { - let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); - size += l_size; - no_size_cnt += l_no_size; - } - if no_size_cnt == 0 { - LayerSizeSum::Accurate(size) - } else { - LayerSizeSum::ApproximateLowerBound(size) + size += l.file_size(); } + size } pub fn get_resident_physical_size(&self) -> u64 { @@ -1047,9 +1021,7 @@ impl Timeline { return Ok(false); } - let layer_file_size = local_layer - .file_size() - .expect("Local layer should have a file size"); + let layer_file_size = local_layer.file_size(); let local_layer_mtime = local_layer .local_path() @@ -1514,7 +1486,12 @@ impl Timeline { .layer_metadata .get(remote_layer_name) .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); + .with_context(|| { + format!( + "No remote layer metadata found for layer {}", + remote_layer_name.file_name() + ) + })?; // Is the local layer's size different from the size stored in the // 
remote index file? @@ -1530,34 +1507,27 @@ impl Timeline { local_layer_path.display() ); - if let Some(remote_size) = remote_layer_metadata.file_size() { - let metadata = local_layer_path.metadata().with_context(|| { - format!( - "get file size of local layer {}", - local_layer_path.display() - ) - })?; - let local_size = metadata.len(); - if local_size != remote_size { - warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); - if let Err(err) = rename_to_backup(&local_layer_path) { - assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); - anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); - } else { - self.metrics.resident_physical_size_gauge.sub(local_size); - updates.remove_historic(local_layer); - // fall-through to adding the remote layer - } + let remote_size = remote_layer_metadata.file_size(); + let metadata = local_layer_path.metadata().with_context(|| { + format!( + "get file size of local layer {}", + local_layer_path.display() + ) + })?; + let local_size = metadata.len(); + if local_size != remote_size { + warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); + if let Err(err) = rename_to_backup(&local_layer_path) { + assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); + anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { - debug!( - "layer is present locally and file size matches remote, using it: {}", - local_layer_path.display() - ); - continue; + self.metrics.resident_physical_size_gauge.sub(local_size); + updates.remove_historic(local_layer); + // fall-through to adding the remote layer } } else { debug!( - "layer is present locally and remote does not have file size, 
using it: {}", + "layer is present locally and file size matches remote, using it: {}", local_layer_path.display() ); continue; @@ -1984,9 +1954,7 @@ impl Timeline { ) -> anyhow::Result<()> { if !layer.is_remote_layer() { layer.delete_resident_layer_file()?; - let layer_file_size = layer - .file_size() - .expect("Local layer should have a file size"); + let layer_file_size = layer.file_size(); self.metrics .resident_physical_size_gauge .sub(layer_file_size); diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 790b2f59aa..08bc1f219d 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -127,12 +127,21 @@ impl UploadQueue { let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); for layer_name in &index_part.timeline_layers { - let layer_metadata = index_part + match index_part .layer_metadata .get(layer_name) .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); - files.insert(layer_name.to_owned(), layer_metadata); + { + Some(layer_metadata) => { + files.insert(layer_name.to_owned(), layer_metadata); + } + None => { + anyhow::bail!( + "No remote layer metadata found for layer {}", + layer_name.file_name() + ); + } + } } let index_part_metadata = index_part.parse_metadata()?; diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 769bc10280..c786f8a8e1 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -9,7 +9,6 @@ import asyncio import json import os -import shutil from pathlib import Path from typing import List, Tuple @@ -217,208 +216,9 @@ def test_tenants_attached_after_download( assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*") -@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) -def test_tenant_upgrades_index_json_from_v0( 
- neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind -): - # the "image" for the v0 index_part.json. the fields themselves are - # replaced with values read from the later version because of #2592 (initdb - # lsn not reproducible). - v0_skeleton = json.loads( - """{ - "timeline_layers":[ - "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9" - ], - "missing_layers":["This should not fail as its not used anymore"], - "disk_consistent_lsn":"0/16960E8", - "metadata_bytes":[] - }""" - ) - - # getting a too eager compaction happening for this test would not play - # well with the strict assertions. - neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'" - - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_tenant_upgrades_index_json_from_v0" - ) - - # launch pageserver, populate the default tenants timeline, wait for it to be uploaded, - # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade - env = neon_env_builder.init_start() - - pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") - - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) - - with pg.cursor() as cur: - cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - - # flush, wait until in remote storage - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) - env.postgres.stop_all() - env.pageserver.stop() - - # remove all local data for the tenant to force redownloading and subsequent upgrade - shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id)) - - # downgrade the 
remote file - timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) - with open(timeline_path, "r+") as timeline_file: - # keep the deserialized for later inspection - orig_index_part = json.load(timeline_file) - - v0_index_part = { - key: orig_index_part[key] - for key in v0_skeleton.keys() - ["missing_layers"] # pgserver doesn't have it anymore - } - - timeline_file.seek(0) - json.dump(v0_index_part, timeline_file) - timeline_file.truncate(timeline_file.tell()) - - env.pageserver.start() - pageserver_http = env.pageserver.http_client() - pageserver_http.tenant_attach(tenant_id) - - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), - ) - - pg = env.postgres.create_start("main") - - with pg.cursor() as cur: - cur.execute("INSERT INTO t0 VALUES (234, 'test data');") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) - - # not needed anymore - env.postgres.stop_all() - env.pageserver.stop() - - # make sure the file has been upgraded back to how it started - index_part = local_fs_index_part(env, tenant_id, timeline_id) - assert index_part["version"] == orig_index_part["version"] - assert "missing_layers" not in index_part.keys() - - # expect one more layer because of the forced checkpoint - assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1 - - # all of the same layer files are there, but they might be shuffled around - orig_layers = set(orig_index_part["timeline_layers"]) - later_layers = set(index_part["timeline_layers"]) - assert later_layers.issuperset(orig_layers) - - added_layers = later_layers - orig_layers - assert len(added_layers) == 1 - - # all of metadata has been regenerated (currently 
just layer file size) - all_metadata_keys = set() - for layer in orig_layers: - orig_metadata = orig_index_part["layer_metadata"][layer] - new_metadata = index_part["layer_metadata"][layer] - assert ( - orig_metadata == new_metadata - ), f"metadata for layer {layer} should not have changed {orig_metadata} vs. {new_metadata}" - all_metadata_keys |= set(orig_metadata.keys()) - - one_new_layer = next(iter(added_layers)) - assert one_new_layer in index_part["layer_metadata"], "new layer should have metadata" - - only_new_metadata = index_part["layer_metadata"][one_new_layer] - - assert ( - set(only_new_metadata.keys()).symmetric_difference(all_metadata_keys) == set() - ), "new layer metadata has same metadata as others" - - # FIXME: test index_part.json getting downgraded from imaginary new version -@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) -def test_tenant_ignores_backup_file( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind -): - # getting a too eager compaction happening for this test would not play - # well with the strict assertions. 
- neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'" - - neon_env_builder.enable_remote_storage(remote_storage_kind, "test_tenant_ignores_backup_file") - - # launch pageserver, populate the default tenants timeline, wait for it to be uploaded, - # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade - env = neon_env_builder.init_start() - - env.pageserver.allowed_errors.append(".*got backup file on the remote storage, ignoring it.*") - - pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") - - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) - - with pg.cursor() as cur: - cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - - # flush, wait until in remote storage - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) - - env.postgres.stop_all() - env.pageserver.stop() - - # change the remote file to have entry with .0.old suffix - timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) - with open(timeline_path, "r+") as timeline_file: - # keep the deserialized for later inspection - orig_index_part = json.load(timeline_file) - backup_layer_name = orig_index_part["timeline_layers"][0] + ".0.old" - orig_index_part["timeline_layers"].append(backup_layer_name) - - timeline_file.seek(0) - json.dump(orig_index_part, timeline_file) - - env.pageserver.start() - pageserver_http = env.pageserver.http_client() - - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), - ) - - pg = env.postgres.create_start("main") - - with pg.cursor() as cur: - 
cur.execute("INSERT INTO t0 VALUES (234, 'test data');") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) - - # not needed anymore - env.postgres.stop_all() - env.pageserver.stop() - - # the .old file is gone from newly serialized index_part - new_index_part = local_fs_index_part(env, tenant_id, timeline_id) - backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"]) - assert len(list(backup_layers)) == 0 - - @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_tenant_redownloads_truncated_file_on_startup( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind From 0f7de847856510807b5794d5c3903e058b07d066 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Wed, 22 Mar 2023 09:17:00 +0200 Subject: [PATCH 182/426] Allow calling detach on ignored tenant (#3834) ## Describe your changes Added a query param to detach API Allow to remove local state of a tenant even if its not in the memory (following ignore API) ## Issue ticket number and link #3828 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--------- Co-authored-by: Kirill Bulatov --- pageserver/src/http/openapi_spec.yml | 7 ++ pageserver/src/http/routes.rs | 3 +- pageserver/src/tenant/mgr.rs | 32 +++++--- test_runner/fixtures/neon_fixtures.py | 14 +++- test_runner/regress/test_tenant_detach.py | 90 ++++++++++++++++++++++- 5 files changed, 130 insertions(+), 16 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 3d3a9892bf..2098f848d5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -351,6 +351,13 @@ paths: schema: type: string format: hex + - name: detach_ignored + in: query + required: false + schema: + type: boolean + description: | + When true, allow to detach a tenant which state is ignored. post: description: | Remove tenant data (including all corresponding timelines) from pageserver's memory and file system. diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d91e421a52..04b7928d31 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -384,10 +384,11 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; let state = get_state(&request); let conf = state.conf; - mgr::detach_tenant(conf, tenant_id) + mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false)) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) .await?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a4212ea8a6..26a2bb972c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -315,10 +315,6 @@ pub async fn get_tenant( .get(&tenant_id) .ok_or(TenantStateError::NotFound(tenant_id))?; if active_only && !tenant.is_active() { - tracing::warn!( - "Tenant {tenant_id} is not 
active. Current state: {:?}", - tenant.current_state() - ); Err(TenantStateError::NotActive(tenant_id)) } else { Ok(Arc::clone(tenant)) @@ -350,17 +346,35 @@ pub enum TenantStateError { pub async fn detach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + detach_ignored: bool, ) -> Result<(), TenantStateError> { - remove_tenant_from_memory(tenant_id, async { - let local_tenant_directory = conf.tenant_path(&tenant_id); + let local_files_cleanup_operation = |tenant_id_to_clean| async move { + let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); fs::remove_dir_all(&local_tenant_directory) .await .with_context(|| { - format!("Failed to remove local tenant directory {local_tenant_directory:?}") + format!("local tenant directory {local_tenant_directory:?} removal") })?; Ok(()) - }) - .await + }; + + let removal_result = + remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await; + + // Ignored tenants are not present in memory and will bail the removal from memory operation. + // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. 
+ if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) { + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); + if tenant_ignore_mark.exists() { + info!("Detaching an ignored tenant"); + local_files_cleanup_operation(tenant_id) + .await + .with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?; + return Ok(()); + } + } + + removal_result } pub async fn load_tenant( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6429b1e940..9929d3e66b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1119,7 +1119,9 @@ def neon_env_builder( class PageserverApiException(Exception): - pass + def __init__(self, message, status_code: int): + super().__init__(message) + self.status_code = status_code class PageserverHttpClient(requests.Session): @@ -1140,7 +1142,7 @@ class PageserverHttpClient(requests.Session): msg = res.json()["msg"] except: # noqa: E722 msg = "" - raise PageserverApiException(msg) from e + raise PageserverApiException(msg, res.status_code) from e def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() @@ -1190,8 +1192,12 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") self.verbose_error(res) - def tenant_detach(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") + def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): + params = {} + if detach_ignored: + params["detach_ignored"] = "true" + + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) self.verbose_error(res) def tenant_load(self, tenant_id: TenantId): diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index e061ab92a4..5db79eef4a 100644 --- 
a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -264,9 +264,11 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): with pytest.raises( expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}", - ): + ) as excinfo: pageserver_http.tenant_detach(tenant_id) + assert excinfo.value.status_code == 404 + # the error will be printed to the log too env.pageserver.allowed_errors.append(".*NotFound: tenant *") @@ -325,7 +327,91 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) -# +# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail), +# then with parameters to force ignored tenant detach (should not fail). +def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): + env = neon_simple_env + client = env.pageserver.http_client() + + # create a new tenant + tenant_id, _ = env.neon_cli.create_tenant() + + # assert tenant exists on disk + assert (env.repo_dir / "tenants" / str(tenant_id)).exists() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + # we rely upon autocommit after each statement + pg.safe_psql_many( + queries=[ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ] + ) + + # ignore tenant + client.tenant_ignore(tenant_id) + env.pageserver.allowed_errors.append(".*NotFound: tenant .*") + # ensure tenant couldn't be detached without the special flag for ignored tenant + log.info("detaching ignored tenant WITHOUT required flag") + with pytest.raises( + expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}" + ): + client.tenant_detach(tenant_id) + + log.info("tenant detached failed as expected") + + # ensure tenant is detached with ignore state + log.info("detaching ignored tenant with required flag") + client.tenant_detach(tenant_id, True) + log.info("ignored tenant 
detached without error") + + # check that nothing is left on disk for deleted tenant + assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() + + # assert the tenant does not exist in the Pageserver + tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] + assert ( + tenant_id not in tenants_after_detach + ), f"Ignored and then detached tenant {tenant_id} \ + should not be present in pageserver's memory" + + +# Creates a tenant, and detaches it with extra parameter that forces ignored tenant detach. +# Tenant should be detached without issues. +def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): + env = neon_simple_env + client = env.pageserver.http_client() + + # create a new tenant + tenant_id, _ = env.neon_cli.create_tenant() + + # assert tenant exists on disk + assert (env.repo_dir / "tenants" / str(tenant_id)).exists() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + # we rely upon autocommit after each statement + pg.safe_psql_many( + queries=[ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ] + ) + + log.info("detaching regular tenant with detach ignored flag") + client.tenant_detach(tenant_id, True) + log.info("regular tenant detached without error") + + # check that nothing is left on disk for deleted tenant + assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() + + # assert the tenant does not exist in the Pageserver + tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] + assert ( + tenant_id not in tenants_after_detach + ), f"Ignored and then detached tenant {tenant_id} \ + should not be present in pageserver's memory" + + @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_detach_while_attaching( neon_env_builder: NeonEnvBuilder, From 14a40c9ca67568beac3783f597e44990c2c9a9e4 Mon Sep 17 00:00:00 2001 From: mikecaat <35882227+mikecaat@users.noreply.github.com> 
Date: Wed, 22 Mar 2023 17:10:53 +0900 Subject: [PATCH 183/426] Fix minor things for the docker-compose file (#3862) * Add the REPOSITORY env to build args to avoid the following error when executing without the credentials for the repository. ``` ERROR: Service 'compute' failed to build: Head "https://369495373322.dkr.ecr.eu-central-1.amazonaws.com/v2/compute-node-v15/manifests/2221": no basic auth credentials ``` * update the tag version in the documentation to support storage broker --- docker-compose/docker-compose.yml | 1 + docs/docker.md | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index b24cb80ce4..4926dad932 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -160,6 +160,7 @@ services: build: context: ./compute_wrapper/ args: + - REPOSITORY=${REPOSITORY:-neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14} - TAG=${TAG:-latest} - http_proxy=$http_proxy diff --git a/docs/docker.md b/docs/docker.md index d264a1a748..704044377f 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -37,9 +37,9 @@ You can specify version of neon cluster using following environment values. - PG_VERSION: postgres version for compute (default is 14) - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) ``` -$ cd docker-compose/docker-compose.yml +$ cd docker-compose/ $ docker-compose down # remove the conainers if exists -$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version +$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done (...omit...) 
From 6033dfdf4a9cbc5f81db551d6a8b259445f390c5 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 22 Mar 2023 16:26:27 +0200 Subject: [PATCH 184/426] Re-access layers before threshold eviction (#3867) To avoid re-downloading evicted files on restart, re-compute logical size and partitioning before each threshold based eviction run. Cc: #3802 Co-authored-by: Christian Schwarz --- .../src/tenant/timeline/eviction_task.rs | 81 +++++++++++++++++-- 1 file changed, 74 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 2aad0ef0f3..666768ff87 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -1,5 +1,18 @@ -//! The per-timeline layer eviction task. - +//! The per-timeline layer eviction task, which evicts data which has not been accessed for more +//! than a given threshold. +//! +//! Data includes all kinds of caches, namely: +//! - (in-memory layers) +//! - on-demand downloaded layer files on disk +//! - (cached layer file pages) +//! - derived data from layer file contents, namely: +//! - initial logical size +//! - partitioning +//! - (other currently missing unknowns) +//! +//! Items with parentheses are not (yet) touched by this task. +//! +//! 
See write-up on restart on-demand download spike: use std::{ ops::ControlFlow, sync::Arc, @@ -12,6 +25,7 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; use crate::{ + context::{DownloadBehavior, RequestContext}, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, @@ -54,9 +68,10 @@ impl Timeline { } } + let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn); loop { let policy = self.get_eviction_policy(); - let cf = self.eviction_iteration(&policy, cancel.clone()).await; + let cf = self.eviction_iteration(&policy, &cancel, &ctx).await; match cf { ControlFlow::Break(()) => break, @@ -77,7 +92,8 @@ impl Timeline { async fn eviction_iteration( self: &Arc, policy: &EvictionPolicy, - cancel: CancellationToken, + cancel: &CancellationToken, + ctx: &RequestContext, ) -> ControlFlow<(), Instant> { debug!("eviction iteration: {policy:?}"); match policy { @@ -87,7 +103,7 @@ impl Timeline { } EvictionPolicy::LayerAccessThreshold(p) => { let start = Instant::now(); - match self.eviction_iteration_threshold(p, cancel).await { + match self.eviction_iteration_threshold(p, cancel, ctx).await { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } @@ -101,7 +117,8 @@ impl Timeline { async fn eviction_iteration_threshold( self: &Arc, p: &EvictionPolicyLayerAccessThreshold, - cancel: CancellationToken, + cancel: &CancellationToken, + ctx: &RequestContext, ) -> ControlFlow<()> { let now = SystemTime::now(); @@ -114,6 +131,20 @@ impl Timeline { not_evictable: usize, skipped_for_shutdown: usize, } + + // what we want is to invalidate any caches which haven't been accessed for `p.threshold`, + // but we cannot actually do it for current limitations except by restarting pageserver. we + // just recompute the values which would be recomputed on startup. 
+ // + // for active tenants this will likely materialized page cache or in-memory layers. for + // inactive tenants it will refresh the last_access timestamps so that we will not evict + // and re-download on restart these layers. + self.refresh_layers_required_in_restart(cancel, ctx).await; + + if cancel.is_cancelled() { + return ControlFlow::Break(()); + } + let mut stats = EvictionStats::default(); // Gather layers for eviction. // NB: all the checks can be invalidated as soon as we release the layer map lock. @@ -174,7 +205,7 @@ impl Timeline { }; let results = match self - .evict_layer_batch(remote_client, &candidates[..], cancel) + .evict_layer_batch(remote_client, &candidates[..], cancel.clone()) .await { Err(pre_err) => { @@ -216,4 +247,40 @@ impl Timeline { } ControlFlow::Continue(()) } + + /// Recompute the values which would cause on-demand downloads during restart. + async fn refresh_layers_required_in_restart( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) { + let lsn = self.get_last_record_lsn(); + + // imitate on-restart initial logical size + let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await; + + match &size { + Ok(_size) => { + // good, don't log it to avoid confusion + } + Err(_) => { + // we have known issues for which we already log this on consumption metrics, + // gc, and compaction. leave logging out for now. 
+ // + // https://github.com/neondatabase/neon/issues/2539 + } + } + + // imitate repartitioning on first compaction + if let Err(e) = self.collect_keyspace(lsn, ctx).await { + // if this failed, we probably failed logical size because these use the same keys + if size.is_err() { + // ignore, see above comment + } else { + warn!( + "failed to collect keyspace but succeeded in calculating logical size: {e:#}" + ); + } + } + } } From 8bd565e09ede7b3af0e0dddce68968ba02ac8b54 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 22 Mar 2023 17:42:31 +0200 Subject: [PATCH 185/426] Ensure branches with no layers have their remote storage counterpart created eventually (#3857) Discovered during writing a test for https://github.com/neondatabase/neon/pull/3843 --- pageserver/src/tenant.rs | 39 +++-- pageserver/src/tenant/timeline.rs | 51 +++++-- test_runner/regress/test_remote_storage.py | 164 ++++++++++++++++++--- 3 files changed, 205 insertions(+), 49 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5f1e23b873..b462c93b2d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -478,7 +478,7 @@ impl Tenant { let dummy_timeline = self.create_timeline_data( timeline_id, - up_to_date_metadata.clone(), + up_to_date_metadata, ancestor.clone(), remote_client, )?; @@ -503,7 +503,7 @@ impl Tenant { let broken_timeline = self .create_timeline_data( timeline_id, - up_to_date_metadata.clone(), + up_to_date_metadata, ancestor.clone(), None, ) @@ -1142,7 +1142,7 @@ impl Tenant { ); self.prepare_timeline( new_timeline_id, - new_metadata, + &new_metadata, timeline_uninit_mark, true, None, @@ -1700,7 +1700,7 @@ impl Tenant { fn create_timeline_data( &self, new_timeline_id: TimelineId, - new_metadata: TimelineMetadata, + new_metadata: &TimelineMetadata, ancestor: Option>, remote_client: Option, ) -> anyhow::Result> { @@ -2160,13 +2160,25 @@ impl Tenant { let new_timeline = self .prepare_timeline( dst_id, - metadata, + &metadata, 
timeline_uninit_mark, false, Some(Arc::clone(src_timeline)), )? .initialize_with_lock(&mut timelines, true, true)?; drop(timelines); + + // Root timeline gets its layers during creation and uploads them along with the metadata. + // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. + // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC + // could get incorrect information and remove more layers, than needed. + // See also https://github.com/neondatabase/neon/issues/3865 + if let Some(remote_client) = new_timeline.remote_client.as_ref() { + remote_client + .schedule_index_upload_for_metadata_update(&metadata) + .context("branch initial metadata upload")?; + } + info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); Ok(new_timeline) @@ -2229,7 +2241,7 @@ impl Tenant { pg_version, ); let raw_timeline = - self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?; + self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?; let tenant_id = raw_timeline.owning_tenant.tenant_id; let unfinished_timeline = raw_timeline.raw_timeline()?; @@ -2283,7 +2295,7 @@ impl Tenant { fn prepare_timeline( &self, new_timeline_id: TimelineId, - new_metadata: TimelineMetadata, + new_metadata: &TimelineMetadata, uninit_mark: TimelineUninitMark, init_layers: bool, ancestor: Option>, @@ -2297,7 +2309,7 @@ impl Tenant { tenant_id, new_timeline_id, ); - remote_client.init_upload_queue_for_empty_remote(&new_metadata)?; + remote_client.init_upload_queue_for_empty_remote(new_metadata)?; Some(remote_client) } else { None @@ -2336,17 +2348,12 @@ impl Tenant { &self, timeline_path: &Path, new_timeline_id: TimelineId, - new_metadata: TimelineMetadata, + new_metadata: &TimelineMetadata, ancestor: Option>, remote_client: Option, ) -> anyhow::Result> { let timeline_data = self - .create_timeline_data( - new_timeline_id, - 
new_metadata.clone(), - ancestor, - remote_client, - ) + .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client) .context("Failed to create timeline data structure")?; crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; @@ -2358,7 +2365,7 @@ impl Tenant { self.conf, new_timeline_id, self.tenant_id, - &new_metadata, + new_metadata, true, ) .context("Failed to create timeline metadata")?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4d03a78883..33909e749b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1163,7 +1163,7 @@ impl Timeline { pub(super) fn new( conf: &'static PageServerConf, tenant_conf: Arc>, - metadata: TimelineMetadata, + metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, tenant_id: TenantId, @@ -1629,6 +1629,8 @@ impl Timeline { .map(|l| (l.filename(), l)) .collect::>(); + // If no writes happen, new branches do not have any layers, only the metadata file. + let has_local_layers = !local_layers.is_empty(); let local_only_layers = match index_part { Some(index_part) => { info!( @@ -1646,21 +1648,40 @@ impl Timeline { } }; - // Are there local files that don't exist remotely? Schedule uploads for them - for (layer_name, layer) in &local_only_layers { - // XXX solve this in the type system - let layer_path = layer - .local_path() - .expect("local_only_layers only contains local layers"); - let layer_size = layer_path - .metadata() - .with_context(|| format!("failed to get file {layer_path:?} metadata"))? - .len(); - info!("scheduling {layer_path:?} for upload"); - remote_client - .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; + if has_local_layers { + // Are there local files that don't exist remotely? Schedule uploads for them. + // Local timeline metadata will get uploaded to remote along with the layers. 
+ for (layer_name, layer) in &local_only_layers { + // XXX solve this in the type system + let layer_path = layer + .local_path() + .expect("local_only_layers only contains local layers"); + let layer_size = layer_path + .metadata() + .with_context(|| format!("failed to get file {layer_path:?} metadata"))? + .len(); + info!("scheduling {layer_path:?} for upload"); + remote_client + .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; + } + remote_client.schedule_index_upload_for_file_changes()?; + } else if index_part.is_none() { + // No data on the remote storage, no local layers, local metadata file. + // + // TODO https://github.com/neondatabase/neon/issues/3865 + // Currently, console does not wait for the timeline data upload to the remote storage + // and considers the timeline created, expecting other pageserver nodes to work with it. + // Branch metadata upload could get interrupted (e.g pageserver got killed), + // hence any locally existing branch metadata with no remote counterpart should be uploaded, + // otherwise any other pageserver won't see the branch on `attach`. + // + // After the issue gets implemented, pageserver should rather remove the branch, + // since absence on S3 means we did not acknowledge the branch creation and console will have to retry, + // no need to keep the old files. + remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?; + } else { + // Local timeline has a metadata file, remote one too, both have no layers to sync. 
} - remote_client.schedule_index_upload_for_file_changes()?; info!("Done"); diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 1f6f0c67cc..f6600e8974 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -11,8 +11,10 @@ from typing import Dict, List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LocalFsStorage, NeonEnvBuilder, PageserverApiException, + PageserverHttpClient, RemoteStorageKind, available_remote_storages, wait_for_last_flush_lsn, @@ -421,23 +423,6 @@ def test_remote_timeline_client_calls_started_metric( ) wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, - ) - if val is None: - return val - return int(val) - - def wait_upload_queue_empty(): - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) - calls_started: Dict[Tuple[str, str], List[int]] = { ("layer", "upload"): [0], ("index", "upload"): [0], @@ -478,7 +463,7 @@ def test_remote_timeline_client_calls_started_metric( # create some layers & wait for uploads to finish churn("a", "b") - wait_upload_queue_empty() + wait_upload_queue_empty(client, tenant_id, timeline_id) # ensure that we updated the calls_started metric fetch_calls_started() @@ -637,4 +622,147 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( time.sleep(10) +# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only. 
+# Ensures that such branch is still persisted on the remote storage, and can be restored during tenant (re)attach. +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_empty_branch_remote_storage_upload( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_empty_branch_remote_storage_upload", + ) + + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + new_branch_name = "new_branch" + new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant) + + with env.postgres.create_start(new_branch_name, tenant_id=env.initial_tenant) as pg: + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_branch_timeline_id) + wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id) + + timelines_before_detach = set( + map( + lambda t: TimelineId(t["timeline_id"]), + client.timeline_list(env.initial_tenant), + ) + ) + expected_timelines = set([env.initial_timeline, new_branch_timeline_id]) + assert ( + timelines_before_detach == expected_timelines + ), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}" + + client.tenant_detach(env.initial_tenant) + client.tenant_attach(env.initial_tenant) + wait_until_tenant_state(client, env.initial_tenant, "Active", 5) + + timelines_after_detach = set( + map( + lambda t: TimelineId(t["timeline_id"]), + client.timeline_list(env.initial_tenant), + ) + ) + + assert ( + timelines_before_detach == timelines_after_detach + ), f"Expected to have same timelines after reattach, but got {timelines_after_detach}" + + +# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only. +# Ensures the branch is not on the remote storage and restarts the pageserver — the branch should be uploaded after the restart. 
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_empty_branch_remote_storage_upload_on_restart( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_empty_branch_remote_storage_upload_on_restart", + ) + + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + new_branch_name = "new_branch" + new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant) + + with env.postgres.create_start(new_branch_name, tenant_id=env.initial_tenant) as pg: + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_branch_timeline_id) + wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id) + + env.pageserver.stop() + + # Remove new branch from the remote storage + assert isinstance(env.remote_storage, LocalFsStorage) + new_branch_on_remote_storage = ( + env.remote_storage.root + / "tenants" + / str(env.initial_tenant) + / "timelines" + / str(new_branch_timeline_id) + ) + assert ( + new_branch_on_remote_storage.is_dir() + ), f"'{new_branch_on_remote_storage}' path does not exist on the remote storage" + shutil.rmtree(new_branch_on_remote_storage) + + env.pageserver.start() + + wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id) + assert ( + new_branch_on_remote_storage.is_dir() + ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'" + + +def wait_upload_queue_empty( + client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): + wait_until( + 2, + 1, + lambda: get_queued_count( + client, tenant_id, timeline_id, file_kind="layer", op_kind="upload" + ) + == 0, + ) + wait_until( + 2, + 1, + lambda: get_queued_count( + client, tenant_id, timeline_id, file_kind="index", op_kind="upload" + ) + == 0, + ) + wait_until( + 2, + 1, 
+ lambda: get_queued_count( + client, tenant_id, timeline_id, file_kind="layer", op_kind="delete" + ) + == 0, + ) + + +def get_queued_count( + client: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + file_kind: str, + op_kind: str, +): + val = client.get_remote_timeline_client_metric( + "pageserver_remote_timeline_client_calls_unfinished", + tenant_id, + timeline_id, + file_kind, + op_kind, + ) + if val is None: + return val + return int(val) + + # TODO Test that we correctly handle GC of files that are stuck in upload queue. From f5ca897292d78faf6a447a8f4542b1b1400e6dd1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 23 Mar 2023 12:00:52 +0200 Subject: [PATCH 186/426] fix: less logging at shutdown (#3866) Log less during shutdown; don't log anything for quickly (less than 1s) exiting tasks. --- pageserver/src/task_mgr.rs | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 2734031a09..44b1bbb06d 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -481,13 +481,25 @@ pub async fn shutdown_tasks( for task in victim_tasks { let join_handle = { let mut task_mut = task.mutable.lock().unwrap(); - info!("waiting for {} to shut down", task.name); - let join_handle = task_mut.join_handle.take(); - drop(task_mut); - join_handle + task_mut.join_handle.take() }; - if let Some(join_handle) = join_handle { - let _ = join_handle.await; + if let Some(mut join_handle) = join_handle { + let completed = tokio::select! { + _ = &mut join_handle => { true }, + _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => { + // allow some time to elapse before logging to cut down the number of log + // lines. 
+ info!("waiting for {} to shut down", task.name); + false + } + }; + if !completed { + // we never handled this return value, but: + // - we don't deschedule which would lead to is_cancelled + // - panics are already logged (is_panicked) + // - task errors are already logged in the wrapper + let _ = join_handle.await; + } } else { // Possibly one of: // * The task had not even fully started yet. From 870ba43a1ff6840b56d64709a59e3616e4040870 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 24 Mar 2023 20:25:39 +0300 Subject: [PATCH 187/426] return proper http codes in timeline delete endpoint (#3876) return proper http codes in timeline delete endpoint + fix openapi spec for detach to include 404 responses --- pageserver/src/http/openapi_spec.yml | 12 +++++++++++ pageserver/src/http/routes.rs | 23 +++++++++++++++++++++ pageserver/src/tenant.rs | 22 ++++++++++++++------ pageserver/src/tenant/mgr.rs | 11 +++++++++- test_runner/regress/test_timeline_delete.py | 12 ++++++++--- 5 files changed, 70 insertions(+), 10 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2098f848d5..b8c3bffcd5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -183,6 +183,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "404": + description: Timeline not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" "500": description: Generic operation error content: @@ -383,6 +389,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "404": + description: Tenant not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" "500": description: Generic operation error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 04b7928d31..ba53729ea9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs 
@@ -131,6 +131,29 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(value: crate::tenant::DeleteTimelineError) -> Self { + use crate::tenant::DeleteTimelineError::*; + match value { + NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")), + HasChildren => ApiError::BadRequest(anyhow::anyhow!( + "Cannot delete timeline which has child timelines" + )), + Other(e) => ApiError::InternalServerError(e), + } + } +} + +impl From for ApiError { + fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self { + use crate::tenant::mgr::DeleteTimelineError::*; + match value { + Tenant(t) => ApiError::from(t), + Timeline(t) => ApiError::from(t), + } + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b462c93b2d..0a167fd787 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -431,6 +431,16 @@ remote: } } +#[derive(Debug, thiserror::Error)] +pub enum DeleteTimelineError { + #[error("NotFound")] + NotFound, + #[error("HasChildren")] + HasChildren, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, @@ -1307,7 +1317,7 @@ impl Tenant { &self, timeline_id: TimelineId, _ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), DeleteTimelineError> { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. 
let timeline = { @@ -1319,13 +1329,13 @@ impl Tenant { .iter() .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - anyhow::ensure!( - !children_exist, - "Cannot delete timeline which has child timelines" - ); + if children_exist { + return Err(DeleteTimelineError::HasChildren); + } + let timeline_entry = match timelines.entry(timeline_id) { Entry::Occupied(e) => e, - Entry::Vacant(_) => bail!("timeline not found"), + Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), }; let timeline = Arc::clone(timeline_entry.get()); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 26a2bb972c..4971186206 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -321,11 +321,20 @@ pub async fn get_tenant( } } +#[derive(Debug, thiserror::Error)] +pub enum DeleteTimelineError { + #[error("Tenant {0}")] + Tenant(#[from] TenantStateError), + + #[error("Timeline {0}")] + Timeline(#[from] crate::tenant::DeleteTimelineError), +} + pub async fn delete_timeline( tenant_id: TenantId, timeline_id: TimelineId, ctx: &RequestContext, -) -> Result<(), TenantStateError> { +) -> Result<(), DeleteTimelineError> { let tenant = get_tenant(tenant_id, true).await?; tenant.delete_timeline(timeline_id, ctx).await?; Ok(()) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index b9c4f5b83f..30d894e04c 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -25,9 +25,11 @@ def test_timeline_delete(neon_simple_env: NeonEnv): with pytest.raises( PageserverApiException, match=f"NotFound: tenant {invalid_tenant_id}", - ): + ) as exc: ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) + assert exc.value.status_code == 404 + # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches parent_timeline_id = 
env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") @@ -39,7 +41,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http = env.pageserver.http_client() with pytest.raises( PageserverApiException, match="Cannot delete timeline which has child timelines" - ): + ) as exc: timeline_path = ( env.repo_dir / "tenants" @@ -53,6 +55,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert not timeline_path.exists() + assert exc.value.status_code == 400 + timeline_path = ( env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id) ) @@ -71,7 +75,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): with pytest.raises( PageserverApiException, match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found", - ): + ) as exc: ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? @@ -80,3 +84,5 @@ def test_timeline_delete(neon_simple_env: NeonEnv): interval=0.2, func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id), ) + + assert exc.value.status_code == 404 From 4071ff8c7b699565d79f772e44ca2423e00a6a3b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 25 Mar 2023 12:33:39 +0000 Subject: [PATCH 188/426] Bump openssl from 0.10.45 to 0.10.48 in /test_runner/pg_clients/rust/tokio-postgres (#3879) --- test_runner/pg_clients/rust/tokio-postgres/Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 96989ee5ee..a0067b183e 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -389,9 +389,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "openssl" -version = 
"0.10.45" +version = "0.10.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b102428fd03bc5edf97f62620f7298614c45cedf287c271e7ed450bbaf83f2e1" +checksum = "518915b97df115dd36109bfa429a48b8f737bd05508cf9588977b599648926d2" dependencies = [ "bitflags", "cfg-if", @@ -421,9 +421,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.80" +version = "0.9.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23bbbf7854cd45b83958ebe919f0e8e516793727652e27fda10a8384cfc790b7" +checksum = "666416d899cf077260dac8698d60a60b435a46d57e82acb1be3d0dad87284e5b" dependencies = [ "autocfg", "cc", From 4d8c7654852cdae2375243aabd02e2f8a183e026 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 27 Mar 2023 12:04:48 +0300 Subject: [PATCH 189/426] remove redundant dyn (#3878) remove redundant dyn --- libs/utils/src/id.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index f84bcb793f..b27c5cda35 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -23,7 +23,7 @@ pub enum IdError { struct Id([u8; 16]); impl Id { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id { + pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id { let mut arr = [0u8; 16]; buf.copy_to_slice(&mut arr); Id::from(arr) @@ -112,7 +112,7 @@ impl fmt::Debug for Id { macro_rules! id_newtype { ($t:ident) => { impl $t { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t { + pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t { $t(Id::get_from_buf(buf)) } From 8d783299919ac4c8d86fb5c5187450cc49c0108a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 25 Mar 2023 13:43:04 +0200 Subject: [PATCH 190/426] Remove some dead code. whoami() was never called, 'is_test' was never set. 'restart()' might be useful, but it wasn't hooked up the CLI so it was dead code. 
It's not clear what kind of a restart it should perform, anyway: just restart Postgres, or re-initialize the data directory from a fresh basebackup like "stop"+"start" does. --- control_plane/src/compute.rs | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 730cacf40b..46f0ad8d4f 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -87,7 +87,6 @@ impl ComputeControlPlane { address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), - is_test: false, timeline_id, lsn, tenant_id, @@ -113,7 +112,6 @@ pub struct PostgresNode { name: String, pub env: LocalEnv, pageserver: Arc, - is_test: bool, pub timeline_id: TimelineId, pub lsn: Option, // if it's a read-only node. None for primary pub tenant_id: TenantId, @@ -171,7 +169,6 @@ impl PostgresNode { name, env: env.clone(), pageserver: Arc::clone(pageserver), - is_test: false, timeline_id, lsn: recovery_target_lsn, tenant_id, @@ -480,10 +477,6 @@ impl PostgresNode { self.pg_ctl(&["start"], auth_token) } - pub fn restart(&self, auth_token: &Option) -> Result<()> { - self.pg_ctl(&["restart"], auth_token) - } - pub fn stop(&self, destroy: bool) -> Result<()> { // If we are going to destroy data directory, // use immediate shutdown mode, otherwise, @@ -514,26 +507,4 @@ impl PostgresNode { "postgres" ) } - - // XXX: cache that in control plane - pub fn whoami(&self) -> String { - let output = Command::new("whoami") - .output() - .expect("failed to execute whoami"); - - assert!(output.status.success(), "whoami failed"); - - String::from_utf8(output.stdout).unwrap().trim().to_string() - } -} - -impl Drop for PostgresNode { - // destructor to clean up state after test is done - // XXX: we may detect failed test by setting some flag in catch_unwind() - // and checking it here. But let just clean datadirs on start. 
- fn drop(&mut self) { - if self.is_test { - let _ = self.stop(true); - } - } } From e3cbcc2ea759a58f90c07e6ece984310bbb43492 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Mon, 27 Mar 2023 10:13:34 +0200 Subject: [PATCH 191/426] Revert "Add `neondatabase/release` team as a default reviewers for storage" This reverts commit daeaa767c405532f0c8bdb8a5765f0c13fd83aee. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 014084c410..4bce9cdd1e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,4 +31,3 @@ jobs: head: releases/${{ steps.date.outputs.date }} base: release title: Release ${{ steps.date.outputs.date }} - team_reviewers: release From ff51e96fbd864504494ab301edfe955a2f030d47 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 27 Mar 2023 12:45:10 +0200 Subject: [PATCH 192/426] fix synthetic size for (last_record_lsn - gc_horizon) < initdb_lsn (#3874) fix synthetic size for (last_record_lsn - gc_horizon) < initdb_lsn Assume a single-timeline project. If the gc_horizon covers all WAL (last_record_lsn < gc_horizon) but we have written more data than just initdb, the synthetic size calculation worker needs to calculate the logical size at LSN initdb_lsn (Segment BranchStart). Before this patch, that calculation would incorrectly return the initial logical size calculation result that we cache in the Timeline::initial_logical_size. Presumably, because there was confusion around initdb_lsn vs. initial size calculation. The fix is to hand out the initialized_size() only if the LSN matches. The distinction in the metrics between "init logical size" and "logical size" was also incorrect because of the above. So, remove it. There was a special case for `size != 0`.
This was to cover the case of LogicalSize::empty_initial(), but `initial_part_end` is `None` in that case, so the new `LogicalSize::initialized_size()` will return None in that case as well. Lastly, to prevent confusion like this in the future, rename all occurrences of `init_lsn` to either just `lsn` or a more specific name. Co-authored-by: Joonas Koivunen Co-authored-by: Heikki Linnakangas --- pageserver/src/metrics.rs | 4 --- pageserver/src/tenant/timeline.rs | 43 ++++++++++++++----------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b5563ad186..6cb245aed7 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -586,7 +586,6 @@ pub struct TimelineMetrics { pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, - pub init_logical_size_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, @@ -619,8 +618,6 @@ impl TimelineMetrics { let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id); let create_images_time_histo = StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); - let init_logical_size_histo = - StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id); let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); let load_layer_map_histo = StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id); @@ -657,7 +654,6 @@ impl TimelineMetrics { flush_time_histo, compact_time_histo, create_images_time_histo, - init_logical_size_histo, logical_size_histo, garbage_collect_histo, load_layer_map_histo, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 33909e749b..5fde1a77e0 100644 --- a/pageserver/src/tenant/timeline.rs +++ 
b/pageserver/src/tenant/timeline.rs @@ -328,9 +328,13 @@ impl LogicalSize { .fetch_add(delta, AtomicOrdering::SeqCst); } - /// Returns the initialized (already calculated) value, if any. - fn initialized_size(&self) -> Option { - self.initial_logical_size.get().copied() + /// Make the value computed by initial logical size computation + /// available for re-use. This doesn't contain the incremental part. + fn initialized_size(&self, lsn: Lsn) -> Option { + match self.initial_part_end { + Some(v) if v == lsn => self.initial_logical_size.get().copied(), + _ => None, + } } } @@ -806,11 +810,11 @@ impl Timeline { let mut is_exact = true; let size = current_size.size(); - if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = + if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) = (current_size, self.current_logical_size.initial_part_end) { is_exact = false; - self.try_spawn_size_init_task(init_lsn, ctx); + self.try_spawn_size_init_task(initial_part_end, ctx); } Ok((size, is_exact)) @@ -1688,7 +1692,7 @@ impl Timeline { Ok(()) } - fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn, ctx: &RequestContext) { + fn try_spawn_size_init_task(self: &Arc, lsn: Lsn, ctx: &RequestContext) { let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { @@ -1726,7 +1730,7 @@ impl Timeline { // NB: don't log errors here, task_mgr will do that. 
async move { let calculated_size = match self_clone - .logical_size_calculation_task(init_lsn, &background_ctx) + .logical_size_calculation_task(lsn, &background_ctx) .await { Ok(s) => s, @@ -1811,7 +1815,7 @@ impl Timeline { #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] async fn logical_size_calculation_task( self: &Arc, - init_lsn: Lsn, + lsn: Lsn, ctx: &RequestContext, ) -> Result { let mut timeline_state_updates = self.subscribe_for_state_updates(); @@ -1822,7 +1826,7 @@ impl Timeline { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(init_lsn, cancel, &ctx) + .calculate_logical_size(lsn, cancel, &ctx) .await }; let timeline_state_cancellation = async { @@ -1906,21 +1910,12 @@ impl Timeline { // need to return something Ok(0) }); - let timer = if up_to_lsn == self.initdb_lsn { - if let Some(size) = self.current_logical_size.initialized_size() { - if size != 0 { - // non-zero size means that the size has already been calculated by this method - // after startup. if the logical size is for a new timeline without layers the - // size will be zero, and we cannot use that, or this caching strategy until - // pageserver restart. - return Ok(size); - } - } - - self.metrics.init_logical_size_histo.start_timer() - } else { - self.metrics.logical_size_histo.start_timer() - }; + // See if we've already done the work for initial size calculation. + // This is a short-cut for timelines that are mostly unused. 
+ if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) { + return Ok(size); + } + let timer = self.metrics.logical_size_histo.start_timer(); let logical_size = self .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; From fe156245708525d87bd3682595a3383e389efc65 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 27 Mar 2023 13:33:40 +0200 Subject: [PATCH 193/426] eviction_task: only refresh layer accesses once per p.threshold (#3877) Without this, we run it every p.period, which can be quite low. For example, the running experiment with 3000 tenants in prod uses a period of 1 minute. Doing it once per p.threshold is enough to prevent eviction. --- pageserver/src/tenant/timeline.rs | 8 ++++++++ pageserver/src/tenant/timeline/eviction_task.rs | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5fde1a77e0..dfa0e842f1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -71,6 +71,8 @@ use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; use walreceiver::spawn_connection_manager_task; +use self::eviction_task::EvictionTaskTimelineState; + use super::layer_map::BatchedUpdates; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -216,6 +218,8 @@ pub struct Timeline { download_all_remote_layers_task_info: RwLock>, state: watch::Sender, + + eviction_task_timeline_state: tokio::sync::Mutex, } /// Internal structure to hold all data needed for logical size calculation. 
@@ -1252,6 +1256,10 @@ impl Timeline { download_all_remote_layers_task_info: RwLock::new(None), state, + + eviction_task_timeline_state: tokio::sync::Mutex::new( + EvictionTaskTimelineState::default(), + ), }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 666768ff87..06dfe7a0b9 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -35,6 +35,11 @@ use crate::{ use super::Timeline; +#[derive(Default)] +pub struct EvictionTaskTimelineState { + last_refresh_required_in_restart: Option, +} + impl Timeline { pub(super) fn launch_eviction_task(self: &Arc) { let self_clone = Arc::clone(self); @@ -139,7 +144,15 @@ impl Timeline { // for active tenants this will likely materialized page cache or in-memory layers. for // inactive tenants it will refresh the last_access timestamps so that we will not evict // and re-download on restart these layers. - self.refresh_layers_required_in_restart(cancel, ctx).await; + let mut state = self.eviction_task_timeline_state.lock().await; + match state.last_refresh_required_in_restart { + Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ } + _ => { + self.refresh_layers_required_in_restart(cancel, ctx).await; + state.last_refresh_required_in_restart = Some(tokio::time::Instant::now()) + } + } + drop(state); if cancel.is_cancelled() { return ControlFlow::Break(()); From f14895b48ed6bea51b513f6650f89338d931974d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 27 Mar 2023 20:20:23 +0300 Subject: [PATCH 194/426] eviction: avoid post-restart download by synthetic_size (#3871) As of #3867, we do artificial layer accesses to layers that will be needed after the next restart, but not until then because of caches. 
With this patch, we also do that for the accesses that the synthetic size calculation worker does if consumption metrics are enabled. The actual size calculation is not of importance, but we need to calculate all of the sizes, so we only call tenant::size::gather_inputs. Co-authored-by: Christian Schwarz --- pageserver/src/config.rs | 42 +++-- pageserver/src/tenant.rs | 4 + pageserver/src/tenant/size.rs | 9 +- pageserver/src/tenant/timeline.rs | 13 +- .../src/tenant/timeline/eviction_task.rs | 166 +++++++++++++++--- 5 files changed, 197 insertions(+), 37 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 39282ce320..58a6056385 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -165,6 +165,10 @@ pub struct PageServerConf { /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, + /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. + /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`. + /// See the comment in `eviction_task` for details. + pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore, // How often to collect metrics and send them to the metrics endpoint. 
pub metric_collection_interval: Duration, @@ -239,7 +243,7 @@ struct PageServerConfigBuilder { log_format: BuilderValue, - concurrent_tenant_size_logical_size_queries: BuilderValue, + concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, cached_metric_collection_interval: BuilderValue, @@ -286,7 +290,9 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default keepalive interval")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), - concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), + concurrent_tenant_size_logical_size_queries: Set( + ConfigurableSemaphore::DEFAULT_INITIAL, + ), metric_collection_interval: Set(humantime::parse_duration( DEFAULT_METRIC_COLLECTION_INTERVAL, ) @@ -389,7 +395,7 @@ impl PageServerConfigBuilder { self.log_format = BuilderValue::Set(log_format) } - pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) { + pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } @@ -434,6 +440,11 @@ impl PageServerConfigBuilder { } pub fn build(self) -> anyhow::Result { + let concurrent_tenant_size_logical_size_queries = self + .concurrent_tenant_size_logical_size_queries + .ok_or(anyhow!( + "missing concurrent_tenant_size_logical_size_queries" + ))?; Ok(PageServerConf { listen_pg_addr: self .listen_pg_addr @@ -481,11 +492,12 @@ impl PageServerConfigBuilder { .broker_keepalive_interval .ok_or(anyhow!("No broker keepalive interval provided"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, - concurrent_tenant_size_logical_size_queries: self - .concurrent_tenant_size_logical_size_queries - .ok_or(anyhow!( - "missing concurrent_tenant_size_logical_size_queries" - ))?, + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), + 
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), metric_collection_interval: self .metric_collection_interval .ok_or(anyhow!("missing metric_collection_interval"))?, @@ -680,8 +692,7 @@ impl PageServerConf { "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ let input = parse_toml_string(key, item)?; let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; - ConfigurableSemaphore::new(permits) + NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? }), "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?), @@ -829,6 +840,8 @@ impl PageServerConf { broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( + ), metric_collection_interval: Duration::from_secs(60), cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, @@ -921,6 +934,11 @@ impl ConfigurableSemaphore { inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())), } } + + /// Returns the configured amount of permits. 
+ pub fn initial_permits(&self) -> NonZeroUsize { + self.initial_permits + } } impl Default for ConfigurableSemaphore { @@ -1025,6 +1043,8 @@ log_format = 'json' )?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + eviction_task_immitated_concurrent_logical_size_queries: + ConfigurableSemaphore::default(), metric_collection_interval: humantime::parse_duration( defaults::DEFAULT_METRIC_COLLECTION_INTERVAL )?, @@ -1085,6 +1105,8 @@ log_format = 'json' broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + eviction_task_immitated_concurrent_logical_size_queries: + ConfigurableSemaphore::default(), metric_collection_interval: Duration::from_secs(222), cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0a167fd787..2c5226e5bc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -46,6 +46,7 @@ use std::time::{Duration, Instant}; use self::config::TenantConf; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::EvictionTaskTenantState; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir; @@ -142,6 +143,8 @@ pub struct Tenant { /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, + + eviction_task_tenant_state: tokio::sync::Mutex, } /// A timeline with some of its files on disk, being initialized. 
@@ -1781,6 +1784,7 @@ impl Tenant { state, cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), + eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), } } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index a41889f16d..77275f96bd 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; @@ -352,6 +353,10 @@ async fn fill_logical_sizes( // our advantage with `?` error handling. let mut joinset = tokio::task::JoinSet::new(); + let cancel = tokio_util::sync::CancellationToken::new(); + // be sure to cancel all spawned tasks if we are dropped + let _dg = cancel.clone().drop_guard(); + // For each point that would benefit from having a logical size available, // spawn a Task to fetch it, unless we have it cached already. 
for seg in segments.iter() { @@ -373,6 +378,7 @@ async fn fill_logical_sizes( timeline, lsn, ctx, + cancel.child_token(), )); } e.insert(cached_size); @@ -477,13 +483,14 @@ async fn calculate_logical_size( timeline: Arc, lsn: utils::lsn::Lsn, ctx: RequestContext, + cancel: CancellationToken, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn, ctx) + .spawn_ondemand_logical_size_calculation(lsn, ctx, cancel) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index dfa0e842f1..611c2c27d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -71,6 +71,7 @@ use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; use walreceiver::spawn_connection_manager_task; +pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; use super::layer_map::BatchedUpdates; @@ -1737,8 +1738,11 @@ impl Timeline { false, // NB: don't log errors here, task_mgr will do that. async move { + // no cancellation here, because nothing really waits for this to complete compared + // to spawn_ondemand_logical_size_calculation. 
+ let cancel = CancellationToken::new(); let calculated_size = match self_clone - .logical_size_calculation_task(lsn, &background_ctx) + .logical_size_calculation_task(lsn, &background_ctx, cancel) .await { Ok(s) => s, @@ -1793,6 +1797,7 @@ impl Timeline { self: &Arc, lsn: Lsn, ctx: RequestContext, + cancel: CancellationToken, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); @@ -1812,7 +1817,9 @@ impl Timeline { "ondemand logical size calculation", false, async move { - let res = self_clone.logical_size_calculation_task(lsn, &ctx).await; + let res = self_clone + .logical_size_calculation_task(lsn, &ctx, cancel) + .await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors }, @@ -1825,10 +1832,10 @@ impl Timeline { self: &Arc, lsn: Lsn, ctx: &RequestContext, + cancel: CancellationToken, ) -> Result { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); - let cancel = CancellationToken::new(); let calculation = async { let cancel = cancel.child_token(); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 06dfe7a0b9..3ec8c30d70 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -14,6 +14,7 @@ //! //! 
See write-up on restart on-demand download spike: use std::{ + collections::HashMap, ops::ControlFlow, sync::Arc, time::{Duration, SystemTime}, @@ -30,6 +31,7 @@ use crate::{ tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, storage_layer::PersistentLayer, + Tenant, }, }; @@ -37,7 +39,12 @@ use super::Timeline; #[derive(Default)] pub struct EvictionTaskTimelineState { - last_refresh_required_in_restart: Option, + last_layer_access_imitation: Option, +} + +#[derive(Default)] +pub struct EvictionTaskTenantState { + last_layer_access_imitation: Option, } impl Timeline { @@ -127,6 +134,35 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); + // If we evict layers but keep cached values derived from those layers, then + // we face a storm of on-demand downloads after pageserver restart. + // The reason is that the restart empties the caches, and so, the values + // need to be re-computed by accessing layers, which we evicted while the + // caches were filled. + // + // Solutions here would be one of the following: + // 1. Have a persistent cache. + // 2. Count every access to a cached value to the access stats of all layers + // that were accessed to compute the value in the first place. + // 3. Invalidate the caches at a period of < p.threshold/2, so that the values + // get re-computed from layers, thereby counting towards layer access stats. + // 4. Make the eviction task imitate the layer accesses that typically hit caches. 
+ // + // We follow approach (4) here because in Neon prod deployment: + // - page cache is quite small => high churn => low hit rate + // => eviction gets correct access stats + // - value-level caches such as logical size & repatition have a high hit rate, + // especially for inactive tenants + // => eviction sees zero accesses for these + // => they cause the on-demand download storm on pageserver restart + // + // We should probably move to persistent caches in the future, or avoid + // having inactive tenants attached to pageserver in the first place. + match self.imitate_layer_accesses(p, cancel, ctx).await { + ControlFlow::Break(()) => return ControlFlow::Break(()), + ControlFlow::Continue(()) => (), + } + #[allow(dead_code)] #[derive(Debug, Default)] struct EvictionStats { @@ -137,27 +173,6 @@ impl Timeline { skipped_for_shutdown: usize, } - // what we want is to invalidate any caches which haven't been accessed for `p.threshold`, - // but we cannot actually do it for current limitations except by restarting pageserver. we - // just recompute the values which would be recomputed on startup. - // - // for active tenants this will likely materialized page cache or in-memory layers. for - // inactive tenants it will refresh the last_access timestamps so that we will not evict - // and re-download on restart these layers. - let mut state = self.eviction_task_timeline_state.lock().await; - match state.last_refresh_required_in_restart { - Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ } - _ => { - self.refresh_layers_required_in_restart(cancel, ctx).await; - state.last_refresh_required_in_restart = Some(tokio::time::Instant::now()) - } - } - drop(state); - - if cancel.is_cancelled() { - return ControlFlow::Break(()); - } - let mut stats = EvictionStats::default(); // Gather layers for eviction. // NB: all the checks can be invalidated as soon as we release the layer map lock. 
@@ -261,8 +276,55 @@ impl Timeline { ControlFlow::Continue(()) } + async fn imitate_layer_accesses( + &self, + p: &EvictionPolicyLayerAccessThreshold, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<()> { + let mut state = self.eviction_task_timeline_state.lock().await; + match state.last_layer_access_imitation { + Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ } + _ => { + self.imitate_timeline_cached_layer_accesses(cancel, ctx) + .await; + state.last_layer_access_imitation = Some(tokio::time::Instant::now()) + } + } + drop(state); + + if cancel.is_cancelled() { + return ControlFlow::Break(()); + } + + // This task is timeline-scoped, but the synthetic size calculation is tenant-scoped. + // Make one of the tenant's timelines draw the short straw and run the calculation. + // The others wait until the calculation is done so that they take into account the + // imitated accesses that the winner made. + let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else { + // likely, we're shutting down + return ControlFlow::Break(()); + }; + let mut state = tenant.eviction_task_tenant_state.lock().await; + match state.last_layer_access_imitation { + Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ } + _ => { + self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel) + .await; + state.last_layer_access_imitation = Some(tokio::time::Instant::now()); + } + } + drop(state); + + if cancel.is_cancelled() { + return ControlFlow::Break(()); + } + + ControlFlow::Continue(()) + } + /// Recompute the values which would cause on-demand downloads during restart. - async fn refresh_layers_required_in_restart( + async fn imitate_timeline_cached_layer_accesses( &self, cancel: &CancellationToken, ctx: &RequestContext, @@ -296,4 +358,62 @@ impl Timeline { } } } + + // Imitate the synthetic size calculation done by the consumption_metrics module. 
+ async fn imitate_synthetic_size_calculation_worker( + &self, + tenant: &Arc, + ctx: &RequestContext, + cancel: &CancellationToken, + ) { + if self.conf.metric_collection_endpoint.is_none() { + // We don't start the consumption metrics task if this is not set in the config. + // So, no need to imitate the accesses in that case. + return; + } + + // The consumption metrics are collected on a per-tenant basis, by a single + // global background loop. + // It limits the number of synthetic size calculations using the global + // `concurrent_tenant_size_logical_size_queries` semaphore to not overload + // the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs). + // + // If we used that same semaphore here, then we'd compete for the + // same permits, which may impact timeliness of consumption metrics. + // That is a no-go, as consumption metrics are much more important + // than what we do here. + // + // So, we have a separate semaphore, initialized to the same + // number of permits as the `concurrent_tenant_size_logical_size_queries`. + // In the worst, we would have twice the amount of concurrenct size calculations. + // But in practice, the `p.threshold` >> `consumption metric interval`, and + // we spread out the eviction task using `random_init_delay`. + // So, the chance of the worst case is quite low in practice. + // It runs as a per-tenant task, but the eviction_task.rs is per-timeline. + // So, we must coordinate with other with other eviction tasks of this tenant. + let limit = self + .conf + .eviction_task_immitated_concurrent_logical_size_queries + .inner(); + + let mut throwaway_cache = HashMap::new(); + let gather = + crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx); + tokio::pin!(gather); + + tokio::select! 
{ + _ = cancel.cancelled() => {} + gather_result = gather => { + match gather_result { + Ok(_) => {}, + Err(e) => { + // We don't care about the result, but, if it failed, we should log it, + // since consumption metric might be hitting the cached value and + // thus not encountering this error. + warn!("failed to imitate synthetic size calculation accesses: {e:#}") + } + } + } + } + } } From 6efea4344913bb4b7a135debf069efe1404aa2b9 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 27 Mar 2023 21:01:46 +0300 Subject: [PATCH 195/426] Use precondition failed code in delete_timeline when tenant is missing (#3884) This allows client to differentiate between missing tenant and missing timeline cases --- libs/utils/src/http/error.rs | 7 +++++++ pageserver/src/http/openapi_spec.yml | 14 ++++++++++++++ pageserver/src/http/routes.rs | 5 +++++ test_runner/regress/test_timeline_delete.py | 6 +++--- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 1ba0422993..3c6023eb80 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -20,6 +20,9 @@ pub enum ApiError { #[error("Conflict: {0}")] Conflict(String), + #[error("Precondition failed: {0}")] + PreconditionFailed(&'static str), + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -44,6 +47,10 @@ impl ApiError { ApiError::Conflict(_) => { HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT) } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + self.to_string(), + StatusCode::PRECONDITION_FAILED, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b8c3bffcd5..795c0cd3c4 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ 
-189,6 +189,13 @@ paths: application/json: schema: $ref: "#/components/schemas/NotFoundError" + "412": + description: Tenant is missing + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" + "500": description: Generic operation error content: @@ -958,6 +965,13 @@ components: properties: msg: type: string + PreconditionFailedError: + type: object + required: + - msg + properties: + msg: + type: string security: - JWT: [] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ba53729ea9..b0addc82f1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -148,6 +148,11 @@ impl From for ApiError { fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self { use crate::tenant::mgr::DeleteTimelineError::*; match value { + // Report Precondition failed so client can distinguish between + // "tenant is missing" case from "timeline is missing" + Tenant(TenantStateError::NotFound(..)) => { + ApiError::PreconditionFailed("Requested tenant is missing") + } Tenant(t) => ApiError::from(t), Timeline(t) => ApiError::from(t), } diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 30d894e04c..93fafff934 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -10,7 +10,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*Timeline .* was not found.*") env.pageserver.allowed_errors.append(".*timeline not found.*") env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*") - env.pageserver.allowed_errors.append(".*NotFound: tenant .*") + env.pageserver.allowed_errors.append(".*Precondition failed: Requested tenant is missing.*") ps_http = env.pageserver.http_client() @@ -24,11 +24,11 @@ def test_timeline_delete(neon_simple_env: NeonEnv): invalid_tenant_id = TenantId.generate() with pytest.raises( 
PageserverApiException, - match=f"NotFound: tenant {invalid_tenant_id}", + match="Precondition failed: Requested tenant is missing", ) as exc: ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) - assert exc.value.status_code == 404 + assert exc.value.status_code == 412 # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches From 82a47770467553a2950cc46dea61d54464e29bca Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 28 Mar 2023 08:27:50 +0300 Subject: [PATCH 196/426] Add local free space monitor (#3832) ## Describe your changes Monitor free space in local file system and shrink local file cache size if it is under watermark. Neon is using local storage for temp files (temp table + intermediate results), unlogged relations and local file cache. Ideally all space not used for temporary files should be used for local file cache. Temporary files and even unlogged relations are intended to have a short lifetime (because they can be lost at any moment in case of compute restart). So the policy is to overcommit local cache size and shrink it if there is not enough free space. Since temporary files are expected to be needed only for a short time, there is no need to permanently shrink local file cache size. Instead, we just throw away least recently accessed elements from local file cache, releasing some space on the local disk. ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
--------- Co-authored-by: sharnoff --- pgxn/neon/file_cache.c | 110 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 3a2ac380f9..143ad4bf67 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -14,6 +14,7 @@ */ #include +#include #include #include @@ -34,6 +35,9 @@ #include "storage/fd.h" #include "storage/pg_shmem.h" #include "storage/buf_internals.h" +#include "storage/procsignal.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" /* * Local file cache is used to temporary store relations pages in local file system. @@ -59,6 +63,9 @@ #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) +#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */ +#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */ + typedef struct FileCacheEntry { BufferTag key; @@ -71,6 +78,7 @@ typedef struct FileCacheEntry typedef struct FileCacheControl { uint32 size; /* size of cache file in chunks */ + uint32 used; /* number of used chunks */ dlist_head lru; /* double linked list for LRU replacement algorithm */ } FileCacheControl; @@ -79,12 +87,14 @@ static int lfc_desc; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; +static int lfc_free_space_watermark; static char* lfc_path; static FileCacheControl* lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 static shmem_request_hook_type prev_shmem_request_hook; #endif +static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */ static void lfc_shmem_startup(void) @@ -112,6 +122,7 @@ lfc_shmem_startup(void) &info, HASH_ELEM | HASH_BLOBS); lfc_ctl->size = 0; + lfc_ctl->used = 0; dlist_init(&lfc_ctl->lru); /* Remove file cache on restart */ @@ -165,7 +176,7 @@ lfc_change_limit_hook(int newval, void *extra) } } LWLockAcquire(lfc_lock, 
LW_EXCLUSIVE); - while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru)) + while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru)) { /* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */ FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); @@ -175,12 +186,86 @@ lfc_change_limit_hook(int newval, void *extra) elog(LOG, "Failed to punch hole in file: %m"); #endif hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); - lfc_ctl->size -= 1; + lfc_ctl->used -= 1; } elog(LOG, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); } +/* + * Local file system state monitor check available free space. + * If it is lower than lfc_free_space_watermark then we shrink size of local cache + * but throwing away least recently accessed chunks. + * First time low space watermark is reached cache size is divided by two, + * second time by four,... Finally we remove all chunks from local cache. + * + * Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler. + * We only throw away cached chunks but do not prevent from filling cache by new chunks. + * + * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark + * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second. + * Callinng statfs each second should not add some noticable overhead. + */ +void +FileCacheMonitorMain(Datum main_arg) +{ + /* + * Choose file system state monitor interval so that space can not be exosted + * during this period but not longer than MAX_MONITOR_INTERVAL (10 sec) + */ + uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE); + + /* Establish signal handlers. 
*/ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + BackgroundWorkerUnblockSignals(); + + /* Periodically dump buffers until terminated. */ + while (!ShutdownRequestPending) + { + if (lfc_size_limit != 0) + { + struct statfs sfs; + if (statfs(lfc_path, &sfs) < 0) + { + elog(WARNING, "Failed to obtain status of %s: %m", lfc_path); + } + else + { + if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB) + { + if (lfc_shrinking_factor < 31) { + lfc_shrinking_factor += 1; + } + lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL); + } + else + lfc_shrinking_factor = 0; /* reset to initial value */ + } + } + pg_usleep(monitor_interval); + } +} + +static void +lfc_register_free_space_monitor(void) +{ + BackgroundWorker bgw; + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + void lfc_init(void) { @@ -217,6 +302,19 @@ lfc_init(void) lfc_change_limit_hook, NULL); + DefineCustomIntVariable("neon.free_space_watermark", + "Minimal free space in local file system after reaching which local file cache will be truncated", + NULL, + &lfc_free_space_watermark, + 1024, /* 1GB */ + 0, + INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MB, + NULL, + NULL, + NULL); + DefineCustomStringVariable("neon.file_cache_path", "Path to local file cache (can be raw device)", NULL, @@ -231,6 +329,9 @@ lfc_init(void) if (lfc_max_size == 0) return; + if (lfc_free_space_watermark != 0) + lfc_register_free_space_monitor(); + 
prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = lfc_shmem_startup; #if PG_VERSION_NUM>=150000 @@ -380,7 +481,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, * there are should be very large number of concurrent IO operations and them are limited by max_connections, * we prefer not to complicate code and use second approach. */ - if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru)) + if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru)) { /* Cache overflow: evict least recently used chunk */ FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); @@ -390,7 +491,10 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, elog(LOG, "Swap file cache page"); } else + { + lfc_ctl->used += 1; entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */ + } entry->access_count = 1; memset(entry->bitmap, 0, sizeof entry->bitmap); } From c30b9e6eb125c270d045030e544863bc69522003 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 24 Mar 2023 12:56:21 +0400 Subject: [PATCH 197/426] Show full path to pg_ctl invocation when it fails.
--- control_plane/src/compute.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 46f0ad8d4f..ee504bfaa6 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -403,7 +403,7 @@ impl PostgresNode { fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl"); - let mut cmd = Command::new(pg_ctl_path); + let mut cmd = Command::new(&pg_ctl_path); cmd.args( [ &[ @@ -432,7 +432,9 @@ impl PostgresNode { cmd.env("NEON_AUTH_TOKEN", token); } - let pg_ctl = cmd.output().context("pg_ctl failed")?; + let pg_ctl = cmd + .output() + .context(format!("{} failed", pg_ctl_path.display()))?; if !pg_ctl.status.success() { anyhow::bail!( "pg_ctl failed, exit code: {}, stdout: {}, stderr: {}", From 278d0f117d3b73ee302dd8b09388f5988f7d5011 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 Mar 2023 13:46:47 +0400 Subject: [PATCH 198/426] Rename neon_local sk logs s/safekeeper 1.log/safekeeper-1.log. I don't like spaces in file names. 
--- control_plane/src/safekeeper.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 84d6320573..d358f73343 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -156,7 +156,7 @@ impl SafekeeperNode { } background_process::start_process( - &format!("safekeeper {id}"), + &format!("safekeeper-{id}"), &datadir, &self.env.safekeeper_bin(), &args, From 35ecb139dc16f3c92b2cf6ca0afe0fd28fd86816 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 28 Mar 2023 09:14:35 +0300 Subject: [PATCH 199/426] Use statvfs instead of statfs to fix MacOS build --- pgxn/neon/file_cache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 143ad4bf67..8fe53b1e7d 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -14,7 +14,7 @@ */ #include -#include +#include #include #include @@ -204,7 +204,7 @@ lfc_change_limit_hook(int newval, void *extra) * * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second. - * Callinng statfs each second should not add some noticable overhead. + * Callinng statvfs each second should not add some noticable overhead.
*/ void FileCacheMonitorMain(Datum main_arg) @@ -226,8 +226,8 @@ FileCacheMonitorMain(Datum main_arg) { if (lfc_size_limit != 0) { - struct statfs sfs; - if (statfs(lfc_path, &sfs) < 0) + struct statvfs sfs; + if (statvfs(lfc_path, &sfs) < 0) { elog(WARNING, "Failed to obtain status of %s: %m", lfc_path); } From 9798737ec65961351b55de07b3852a46d43052bf Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 28 Mar 2023 10:24:31 +0300 Subject: [PATCH 200/426] Update pgxn/neon/file_cache.c Co-authored-by: Heikki Linnakangas --- pgxn/neon/file_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 8fe53b1e7d..8dff259f02 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -204,7 +204,7 @@ lfc_change_limit_hook(int newval, void *extra) * * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second. - * Callinng statvfs each second should not add some noticable overhead. + * Calling statvfs each second should not add any noticeable overhead. */ void FileCacheMonitorMain(Datum main_arg) From 7456e5b71cf5a09b1109a3d6c04eadda73457069 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 28 Mar 2023 17:04:02 +0300 Subject: [PATCH 201/426] Add script to collect state from safekeepers (#3835) Add an ansible script to collect https://github.com/neondatabase/neon/pull/3710 state JSON from all safekeeper nodes and upload them to a postgres table. 
--- scripts/sk_collect_dumps/.gitignore | 2 ++ scripts/sk_collect_dumps/readme.md | 25 +++++++++++++ scripts/sk_collect_dumps/remote.yaml | 18 ++++++++++ scripts/sk_collect_dumps/upload.sh | 52 ++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+) create mode 100644 scripts/sk_collect_dumps/.gitignore create mode 100644 scripts/sk_collect_dumps/readme.md create mode 100644 scripts/sk_collect_dumps/remote.yaml create mode 100755 scripts/sk_collect_dumps/upload.sh diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore new file mode 100644 index 0000000000..d9d4d0296a --- /dev/null +++ b/scripts/sk_collect_dumps/.gitignore @@ -0,0 +1,2 @@ +result +*.json diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md new file mode 100644 index 0000000000..52b73e9495 --- /dev/null +++ b/scripts/sk_collect_dumps/readme.md @@ -0,0 +1,25 @@ +# Collect /v1/debug_dump from all safekeeper nodes + +1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. +2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. 
+ +## How to use ansible (staging) + +``` +AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +``` + +## How to use ansible (prod) + +``` +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +``` + diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml new file mode 100644 index 0000000000..29ce83efde --- /dev/null +++ b/scripts/sk_collect_dumps/remote.yaml @@ -0,0 +1,18 @@ +- name: Fetch state dumps from safekeepers + hosts: safekeepers + gather_facts: False + remote_user: "{{ remote_user }}" + + tasks: + - name: Download file + get_url: + url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false" + dest: "/tmp/{{ inventory_hostname }}.json" + + - name: Fetch file from remote hosts + fetch: + src: "/tmp/{{ inventory_hostname }}.json" + dest: "./result/{{ inventory_hostname }}.json" + flat: yes + fail_on_missing: no + diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh new file mode 100755 index 0000000000..2e54ecba1c --- /dev/null +++ b/scripts/sk_collect_dumps/upload.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +if [ -z "$DB_CONNSTR" ]; then + echo "DB_CONNSTR is not set" + exit 1 +fi + +# Create a temporary 
table for JSON data +psql $DB_CONNSTR -c 'DROP TABLE IF EXISTS tmp_json' +psql $DB_CONNSTR -c 'CREATE TABLE tmp_json (data jsonb)' + +for file in ./result/*.json; do + echo "$file" + SK_ID=$(jq '.config.id' $file) + echo "SK_ID: $SK_ID" + jq -c ".timelines[] | . + {\"sk_id\": $SK_ID}" $file | psql $DB_CONNSTR -c "\\COPY tmp_json (data) FROM STDIN" +done + +TABLE_NAME=$1 + +if [ -z "$TABLE_NAME" ]; then + echo "TABLE_NAME is not set, skipping conversion to table with typed columns" + echo "Usage: ./upload.sh TABLE_NAME" + exit 0 +fi + +psql $DB_CONNSTR <>'sk_id')::bigint AS sk_id, + (data->>'tenant_id') AS tenant_id, + (data->>'timeline_id') AS timeline_id, + (data->'memory'->>'active')::bool AS active, + (data->'memory'->>'flush_lsn')::bigint AS flush_lsn, + (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn, + (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn, + (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn, + (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn, + (data->'memory'->>'write_lsn')::bigint AS write_lsn, + (data->'memory'->>'num_computes')::bigint AS num_computes, + (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn, + (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno, + (data->'memory'->>'is_cancelled')::bool AS is_cancelled, + (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn, + (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn, + (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term, + (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn, + (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn, + (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn, + (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn +FROM tmp_json +EOF From 5a123b56e5b94f07ed944cacdc09d4eac12bafdd Mon 
Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 28 Mar 2023 13:46:13 +0300 Subject: [PATCH 202/426] Remove obsolete hack to rename neon-specific GUCs. I checked the console database, we don't have any of these left in production. --- compute_tools/src/pg_helpers.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 79f851ed13..01b192b2de 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -74,18 +74,9 @@ impl GenericOption { /// Represent `GenericOption` as configuration option. pub fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { - // TODO: check in the console DB that we don't have these settings - // set for any non-deleted project and drop this override. - let name = match self.name.as_str() { - "safekeepers" => "neon.safekeepers", - "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", - "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout", - it => it, - }; - match self.vartype.as_ref() { - "string" => format!("{} = '{}'", name, escape_conf_value(val)), - _ => format!("{} = {}", name, val), + "string" => format!("{} = '{}'", self.name, escape_conf_value(val)), + _ => format!("{} = {}", self.name, val), } } else { self.name.to_owned() From b52389f2280e082134643edf75f250ee19002c92 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 Mar 2023 22:29:52 +0400 Subject: [PATCH 203/426] Cleanly exit on any shutdown signal in storage_broker. neon_local sends SIGQUIT, which otherwise dumps core by default. Also, remove obsolete install_shutdown_handlers; in all binaries it was overridden by ShutdownSignals::handle later. 
ref https://github.com/neondatabase/neon/issues/3847 --- libs/utils/src/signals.rs | 23 +---------------------- pageserver/src/bin/pageserver.rs | 11 +++-------- safekeeper/src/bin/safekeeper.rs | 17 +++++++---------- storage_broker/src/bin/storage_broker.rs | 9 +++++++++ 4 files changed, 20 insertions(+), 40 deletions(-) diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index 6586da2339..c37e9aea58 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,25 +1,7 @@ -use signal_hook::flag; use signal_hook::iterator::Signals; -use std::sync::atomic::AtomicBool; -use std::sync::Arc; pub use signal_hook::consts::{signal::*, TERM_SIGNALS}; -pub fn install_shutdown_handlers() -> anyhow::Result { - let term_now = Arc::new(AtomicBool::new(false)); - for sig in TERM_SIGNALS { - // When terminated by a second term signal, exit with exit code 1. - // This will do nothing the first time (because term_now is false). - flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?; - // But this will "arm" the above for the second time, by setting it to true. - // The order of registering these is important, if you put this one first, it will - // first arm and then terminate ‒ all in the first round. 
- flag::register(*sig, Arc::clone(&term_now))?; - } - - Ok(ShutdownSignals) -} - pub enum Signal { Quit, Interrupt, @@ -39,10 +21,7 @@ impl Signal { pub struct ShutdownSignals; impl ShutdownSignals { - pub fn handle( - self, - mut handler: impl FnMut(Signal) -> anyhow::Result<()>, - ) -> anyhow::Result<()> { + pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> { for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() { let signal = match raw_signal { SIGINT => Signal::Interrupt, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 14e86ddcb6..cbfd3e1165 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -24,11 +24,9 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::signals::ShutdownSignals; use utils::{ - auth::JwtAuth, - logging, project_git_version, - sentry_init::init_sentry, - signals::{self, Signal}, + auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal, tcp_listener, }; @@ -263,9 +261,6 @@ fn start_pageserver( info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; - // Install signal handlers - let signals = signals::install_shutdown_handlers()?; - // Launch broker client WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; @@ -409,7 +404,7 @@ fn start_pageserver( } // All started up! Now just sit and wait for shutdown signal. - signals.handle(|signal| match signal { + ShutdownSignals::handle(|signal| match signal { Signal::Quit => { info!( "Got {}. 
Terminating in immediate shutdown mode", diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 8966e8c49b..ace921a26d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -5,6 +5,7 @@ use anyhow::{bail, Context, Result}; use clap::Parser; use remote_storage::RemoteStorageConfig; use toml_edit::Document; +use utils::signals::ShutdownSignals; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; @@ -39,7 +40,7 @@ use utils::{ logging::{self, LogFormat}, project_git_version, sentry_init::init_sentry, - signals, tcp_listener, + tcp_listener, }; const PID_FILE_NAME: &str = "safekeeper.pid"; @@ -216,7 +217,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); @@ -274,15 +274,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { set_build_info_metric(GIT_VERSION); // TODO: put more thoughts into handling of failed threads - // We probably should restart them. + // We should catch & die if they are in trouble. - // NOTE: we still have to handle signals like SIGQUIT to prevent coredumps - signals.handle(|signal| { - // TODO: implement graceful shutdown with joining threads etc - info!( - "received {}, terminating in immediate shutdown mode", - signal.name() - ); + // On any shutdown signal, log receival and exit. Additionally, handling + // SIGQUIT prevents coredump. 
ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); std::process::exit(0); }) } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1a0d261184..57f975b0df 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -33,6 +33,7 @@ use tonic::transport::server::Connected; use tonic::Code; use tonic::{Request, Response, Status}; use tracing::*; +use utils::signals::ShutdownSignals; use metrics::{Encoder, TextEncoder}; use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; @@ -437,6 +438,14 @@ async fn main() -> Result<(), Box> { info!("version: {GIT_VERSION}"); ::metrics::set_build_info_metric(GIT_VERSION); + // On any shutdown signal, log receival and exit. + std::thread::spawn(move || { + ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); + std::process::exit(0); + }) + }); + let registry = Registry { shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))), timeline_chan_size: args.timeline_chan_size, From 018c8b0e2ba5a6875bd1e3ebfcfd7e75d3f908f0 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 17 Mar 2023 18:19:05 +0200 Subject: [PATCH 204/426] Use proper tokens and delimiters when listing S3 --- libs/remote_storage/src/s3_bucket.rs | 5 ++- test_runner/regress/test_remote_storage.py | 46 +++++++++++++++++++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 93f5e0596e..a476ff32e0 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -291,6 +291,7 @@ impl RemoteStorage for S3Bucket { .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(self.prefix_in_bucket.clone()) + .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) .set_continuation_token(continuation_token) .send() .await @@ -306,7 +307,7 @@
impl RemoteStorage for S3Bucket { .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))), ); - match fetch_response.continuation_token { + match fetch_response.next_continuation_token { Some(new_token) => continuation_token = Some(new_token), None => break, } @@ -371,7 +372,7 @@ impl RemoteStorage for S3Bucket { .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), ); - match fetch_response.continuation_token { + match fetch_response.next_continuation_token { Some(new_token) => continuation_token = Some(new_token), None => break, } diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f6600e8974..b9709d9b83 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -6,7 +6,7 @@ import shutil import threading import time from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Set, Tuple import pytest from fixtures.log_helper import log @@ -717,6 +717,50 @@ def test_empty_branch_remote_storage_upload_on_restart( ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'" +# Test creates >1000 timelines and upload them to the remote storage. +# AWS S3 does not return more than 1000 items and starts paginating, ensure that pageserver paginates correctly. 
+@pytest.mark.skip("Too slow to run, requires too much disk space to run") +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) +def test_thousands_of_branches( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_compaction_downloads_on_demand_without_image_creation", + ) + + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + expected_timelines: Set[TimelineId] = set([]) + tenant_id = env.initial_tenant + pg = env.postgres.create_start("main", tenant_id=tenant_id) + + max_timelines = 1500 + for i in range(0, max_timelines): + new_timeline_id = TimelineId.generate() + log.info(f"Creating timeline {new_timeline_id}, {i + 1} out of {max_timelines}") + expected_timelines.add(new_timeline_id) + + client.timeline_create(tenant_id, new_timeline_id=new_timeline_id) + client.timeline_checkpoint(tenant_id, new_timeline_id) + wait_for_last_flush_lsn(env, pg, tenant_id, new_timeline_id) + with pg.cursor() as cur: + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + wait_for_upload(client, tenant_id, new_timeline_id, current_lsn) + + client.tenant_detach(tenant_id=tenant_id) + client.tenant_attach(tenant_id=tenant_id) + + timelines_after_reattach = set( + [timeline["timeline_id"] for timeline in client.timeline_list(tenant_id=tenant_id)] + ) + + assert ( + expected_timelines == timelines_after_reattach + ), f"Timelines after reattach do not match the ones created initially. 
\ + Missing timelines: {expected_timelines - timelines_after_reattach}, extra timelines: {timelines_after_reattach - expected_timelines}" + + def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 1300dc9239d3e844a1af74db136dabb7353c5776 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 20 Mar 2023 15:37:11 +0200 Subject: [PATCH 205/426] Replace Python IT test with the Rust one --- Cargo.lock | 22 ++ Cargo.toml | 1 + libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/lib.rs | 18 ++ libs/remote_storage/src/s3_bucket.rs | 4 + libs/remote_storage/tests/pagination_tests.rs | 275 ++++++++++++++++++ pageserver/src/config.rs | 1 + test_runner/regress/test_remote_storage.py | 46 +-- 8 files changed, 323 insertions(+), 45 deletions(-) create mode 100644 libs/remote_storage/tests/pagination_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 17aacd8ee7..a19a97a40d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3085,6 +3085,7 @@ dependencies = [ "serde", "serde_json", "tempfile", + "test-context", "tokio", "tokio-util", "toml_edit", @@ -3888,6 +3889,27 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "test-context" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +dependencies = [ + "async-trait", + "futures", + "test-context-macros", +] + +[[package]] +name = "test-context-macros" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "textwrap" version = "0.16.0" diff --git a/Cargo.toml b/Cargo.toml index e27a50a1cb..09cc150606 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,7 @@ strum_macros = "0.24" svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" +test-context = "0.1" thiserror = "1.0" tls-listener = 
{ version = "0.6", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 15812e8439..da15823b69 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -26,3 +26,4 @@ workspace_hack.workspace = true [dev-dependencies] tempfile.workspace = true +test-context.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 901f849801..1d50a777f4 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -39,6 +39,9 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +/// No limits on the client side, which currently means 1000 for AWS S3. +/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax +pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; @@ -64,6 +67,10 @@ impl RemotePath { pub fn object_name(&self) -> Option<&str> { self.0.file_name().and_then(|os_str| os_str.to_str()) } + + pub fn join(&self, segment: &Path) -> Self { + Self(self.0.join(segment)) + } } /// Storage (potentially remote) API to manage its state. @@ -266,6 +273,7 @@ pub struct S3Config { /// AWS S3 has various limits on its API calls, we need not to exceed those. /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. 
pub concurrency_limit: NonZeroUsize, + pub max_keys_per_list_response: Option, } impl Debug for S3Config { @@ -275,6 +283,10 @@ impl Debug for S3Config { .field("bucket_region", &self.bucket_region) .field("prefix_in_bucket", &self.prefix_in_bucket) .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) .finish() } } @@ -303,6 +315,11 @@ impl RemoteStorageConfig { ) .context("Failed to parse 'concurrency_limit' as a positive integer")?; + let max_keys_per_list_response = + parse_optional_integer::("max_keys_per_list_response", toml) + .context("Failed to parse 'max_keys_per_list_response' as a positive integer")? + .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE); + let storage = match (local_path, bucket_name, bucket_region) { // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled (None, None, None) => return Ok(None), @@ -324,6 +341,7 @@ impl RemoteStorageConfig { .map(|endpoint| parse_toml_string("endpoint", endpoint)) .transpose()?, concurrency_limit, + max_keys_per_list_response, }), (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( parse_toml_string("local_path", local_path)?, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index a476ff32e0..d4eb7d9244 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -102,6 +102,7 @@ pub struct S3Bucket { client: Client, bucket_name: String, prefix_in_bucket: Option, + max_keys_per_list_response: Option, // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. 
@@ -164,6 +165,7 @@ impl S3Bucket { Ok(Self { client, bucket_name: aws_config.bucket_name.clone(), + max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) @@ -293,6 +295,7 @@ impl RemoteStorage for S3Bucket { .set_prefix(self.prefix_in_bucket.clone()) .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) .set_continuation_token(continuation_token) + .set_max_keys(self.max_keys_per_list_response) .send() .await .map_err(|e| { @@ -355,6 +358,7 @@ impl RemoteStorage for S3Bucket { .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token) .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) + .set_max_keys(self.max_keys_per_list_response) .send() .await .map_err(|e| { diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs new file mode 100644 index 0000000000..eb52409c44 --- /dev/null +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -0,0 +1,275 @@ +use std::collections::HashSet; +use std::env; +use std::num::{NonZeroU32, NonZeroUsize}; +use std::ops::ControlFlow; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::UNIX_EPOCH; + +use anyhow::Context; +use remote_storage::{ + GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, +}; +use test_context::{test_context, AsyncTestContext}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; + +/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. +/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. +/// See the client creation in [`create_s3_client`] for details on the required env vars. 
+/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the +/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details. +/// +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`] +/// where +/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference +/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket +/// +/// Then, verifies that the client does return correct prefixes when queried: +/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only +/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` +/// +/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. +/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// since current default AWS S3 pagination limit is 1000. +/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// +/// Lastly, the test attempts to clean up and remove all uploaded S3 files. +/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. 
+#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledS3::Enabled(ctx) => ctx, + MaybeEnabledS3::Disabled => return Ok(()), + MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"), + }; + + let test_client = Arc::clone(&ctx.client_with_excessive_pagination); + let expected_remote_prefixes = ctx.remote_prefixes.clone(); + + let base_prefix = + RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?; + let root_remote_prefixes = test_client + .list_prefixes(None) + .await + .context("client list root prefixes failure")? + .into_iter() + .collect::>(); + assert_eq!( + root_remote_prefixes, HashSet::from([base_prefix.clone()]), + "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" + ); + + let nested_remote_prefixes = test_client + .list_prefixes(Some(&base_prefix)) + .await + .context("client list nested prefixes failure")? + .into_iter() + .collect::>(); + let remote_only_prefixes = nested_remote_prefixes + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. 
Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + + Ok(()) +} + +enum MaybeEnabledS3 { + Enabled(S3WithTestBlobs), + Disabled, + UploadsFailed(anyhow::Error, S3WithTestBlobs), +} + +struct S3WithTestBlobs { + client_with_excessive_pagination: Arc, + base_prefix_str: &'static str, + remote_prefixes: HashSet, + remote_blobs: HashSet, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledS3 { + async fn setup() -> Self { + utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed"); + if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, skipping the test", + ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + let max_keys_in_list_response = 10; + let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); + + let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response) + .context("S3 client creation") + .expect("S3 client creation failed"); + + let base_prefix_str = "test/"; + match upload_s3_data( + &client_with_excessive_pagination, + base_prefix_str, + upload_tasks_count, + ) + .await + { + ControlFlow::Continue(uploads) => { + info!("Remote objects created successfully"); + Self::Enabled(S3WithTestBlobs { + client_with_excessive_pagination, + base_prefix_str, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }) + } + ControlFlow::Break(uploads) => Self::UploadsFailed( + anyhow::anyhow!("One or multiple blobs failed to upload to S3"), + S3WithTestBlobs { + client_with_excessive_pagination, + base_prefix_str, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }, + ), + } + } + + async fn teardown(self) { + match self { + Self::Disabled => {} + Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { + cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await; + } + } + } +} + +fn 
create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result> { + let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET") + .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?; + let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION") + .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?; + let random_prefix_part = std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .context("random s3 test prefix part calculation")? + .as_millis(); + let remote_storage_config = RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), + max_sync_errors: NonZeroU32::new(5).unwrap(), + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: remote_storage_s3_bucket, + bucket_region: remote_storage_s3_region, + prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: Some(max_keys_per_list_response), + }), + }; + Ok(Arc::new( + GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + )) +} + +struct Uploads { + prefixes: HashSet, + blobs: HashSet, +} + +async fn upload_s3_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} S3 files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/")); + let blob_prefix = RemotePath::new(&prefix) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + + let data = format!("remote blob data {i}").into_bytes(); + let data_len = data.len(); + 
task_client + .upload( + Box::new(std::io::Cursor::new(data)), + data_len, + &blob_path, + None, + ) + .await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +async fn cleanup(client: &Arc, objects_to_delete: HashSet) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 58a6056385..7293e69f69 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1238,6 +1238,7 @@ broker_endpoint = 
'{broker_endpoint}' prefix_in_bucket: Some(prefix_in_bucket.clone()), endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, + max_keys_per_list_response: None, }), }, "Remote storage config should correctly parse the S3 config" diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index b9709d9b83..f6600e8974 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -6,7 +6,7 @@ import shutil import threading import time from pathlib import Path -from typing import Dict, List, Set, Tuple +from typing import Dict, List, Tuple import pytest from fixtures.log_helper import log @@ -717,50 +717,6 @@ def test_empty_branch_remote_storage_upload_on_restart( ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'" -# Test creates >1000 timelines and upload them to the remote storage. -# AWS S3 does not return more than 1000 items and starts paginating, ensure that pageserver paginates correctly. 
-@pytest.mark.skip("Too slow to run, requires too much disk space to run") -@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) -def test_thousands_of_branches( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind -): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_compaction_downloads_on_demand_without_image_creation", - ) - - env = neon_env_builder.init_start() - client = env.pageserver.http_client() - expected_timelines: Set[TimelineId] = set([]) - tenant_id = env.initial_tenant - pg = env.postgres.create_start("main", tenant_id=tenant_id) - - max_timelines = 1500 - for i in range(0, max_timelines): - new_timeline_id = TimelineId.generate() - log.info(f"Creating timeline {new_timeline_id}, {i + 1} out of {max_timelines}") - expected_timelines.add(new_timeline_id) - - client.timeline_create(tenant_id, new_timeline_id=new_timeline_id) - client.timeline_checkpoint(tenant_id, new_timeline_id) - wait_for_last_flush_lsn(env, pg, tenant_id, new_timeline_id) - with pg.cursor() as cur: - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - wait_for_upload(client, tenant_id, new_timeline_id, current_lsn) - - client.tenant_detach(tenant_id=tenant_id) - client.tenant_attach(tenant_id=tenant_id) - - timelines_after_reattach = set( - [timeline["timeline_id"] for timeline in client.timeline_list(tenant_id=tenant_id)] - ) - - assert ( - expected_timelines == timelines_after_reattach - ), f"Timelines after reattach do not match the ones created initially. 
\ - Missing timelines: {expected_timelines - timelines_after_reattach}, extra timelines: {timelines_after_reattach - expected_timelines}" - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 6c84cbbb5877e311c3d7e2959f2bb7214940750a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 28 Mar 2023 19:02:09 +0300 Subject: [PATCH 206/426] Run new Rust IT test in CI --- .github/workflows/build_and_test.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d50a42d83c..f2d436c864 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -246,6 +246,13 @@ jobs: run: | ${cov_prefix} cargo test $CARGO_FLAGS + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_REGION=eu-central-1 + # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now + ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact + - name: Install rust binaries run: | # Install target binaries From 9d714a8413771ee35ffe237a097411dd497fed27 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 28 Mar 2023 15:44:15 +0300 Subject: [PATCH 207/426] Split $CARGO_FLAGS and $CARGO_FEATURES to make e2e tests work --- .github/workflows/build_and_test.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f2d436c864..52e1d94e9b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -184,10 +184,10 @@ jobs: CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - 
CARGO_FLAGS="--locked $CARGO_FEATURES" + CARGO_FLAGS="--locked" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FLAGS="--locked --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --release" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV @@ -240,11 +240,11 @@ jobs: - name: Run cargo build run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run cargo test run: | - ${cov_prefix} cargo test $CARGO_FLAGS + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -275,7 +275,7 @@ jobs: mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run | + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | jq -r '.executable | select(. != null)' ) for bin in $test_exe_paths; do From f1b174dc6a86149d92a7d6a84704d70df27805fb Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 29 Mar 2023 09:49:23 +0200 Subject: [PATCH 208/426] Update rust version to 1.68.2 --- rust-toolchain.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 0692340147..c39ba4f417 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.66.1" +channel = "1.68.2" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From ac9c7e8c4a1a1c4f2984de2ccabcc860dd5905b7 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 29 Mar 2023 14:14:56 +0300 Subject: [PATCH 209/426] Replace pin! 
from tokio to the std one (#3903) With fresh rustc brought by https://github.com/neondatabase/neon/pull/3902, we can use `std::pin::pin!` macro instead of the tokio one. One place did not need the macro at all, other places were adjusted. --- pageserver/src/page_service.rs | 7 +++---- pageserver/src/tenant/timeline.rs | 9 ++++----- pageserver/src/tenant/timeline/eviction_task.rs | 1 - .../timeline/walreceiver/walreceiver_connection.rs | 6 +++--- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b63ee31d5e..c0e4a2a9cf 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -27,6 +27,7 @@ use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; use std::net::TcpListener; +use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -466,8 +467,7 @@ impl PageServerHandler { pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let copyin_reader = StreamReader::new(copyin_stream(pgb)); - tokio::pin!(copyin_reader); + let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); timeline .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx) .await?; @@ -512,8 +512,7 @@ impl PageServerHandler { info!("importing wal"); pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let copyin_reader = StreamReader::new(copyin_stream(pgb)); - tokio::pin!(copyin_reader); + let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 611c2c27d3..e1db34ec1b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,6 +24,7 @@ use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; use 
std::path::{Path, PathBuf}; +use std::pin::pin; use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; @@ -677,8 +678,7 @@ impl Timeline { let mut failed = 0; - let cancelled = task_mgr::shutdown_watcher(); - tokio::pin!(cancelled); + let mut cancelled = pin!(task_mgr::shutdown_watcher()); loop { tokio::select! { @@ -1837,13 +1837,13 @@ impl Timeline { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); - let calculation = async { + let mut calculation = pin!(async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); self_calculation .calculate_logical_size(lsn, cancel, &ctx) .await - }; + }); let timeline_state_cancellation = async { loop { match timeline_state_updates.changed().await { @@ -1872,7 +1872,6 @@ impl Timeline { "aborted because task_mgr shutdown requested".to_string() }; - tokio::pin!(calculation); loop { tokio::select! { res = &mut calculation => { return res } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3ec8c30d70..107cd89b90 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -399,7 +399,6 @@ impl Timeline { let mut throwaway_cache = HashMap::new(); let gather = crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx); - tokio::pin!(gather); tokio::select! 
{ _ = cancel.cancelled() => {} diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 7194a4f3ed..9398a7bee9 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -2,6 +2,7 @@ use std::{ error::Error, + pin::pin, str::FromStr, sync::Arc, time::{Duration, SystemTime}, @@ -17,7 +18,7 @@ use postgres_ffi::v14::xlog_utils::normalize_lsn; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; -use tokio::{pin, select, sync::watch, time}; +use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; @@ -187,8 +188,7 @@ pub async fn handle_walreceiver_connection( let query = format!("START_REPLICATION PHYSICAL {startpoint}"); let copy_stream = replication_client.copy_both_simple(&query).await?; - let physical_stream = ReplicationStream::new(copy_stream); - pin!(physical_stream); + let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); From b26c837ed685641b4c324dc47e6eb8da423ad103 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Wed, 29 Mar 2023 19:18:44 +0400 Subject: [PATCH 210/426] Fixed pageserver openapi spec properties reference (#3904) ## Describe your changes In [this linter run](https://github.com/neondatabase/cloud/actions/runs/4553032319/jobs/8029101300?pr=4391) accidentally found out that spec is invalid. Reference other schemas in properties should be done the way I changed. 
Could not find documentation specifically for schemas embedding in `components.schemas`, but it seems like the approach is inherited from json schema: https://json-schema.org/understanding-json-schema/structuring.html#ref ## Issue ticket number and link - ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] ~If it is a core feature, I have added thorough tests.~ - [ ] ~Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?~ - [ ] ~If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.~ --- pageserver/src/http/openapi_spec.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 795c0cd3c4..eda4a60e95 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -873,13 +873,9 @@ components: type: object properties: tenant_specific_overrides: - type: object - schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfigInfo" effective_config: - type: object - schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfigInfo" TimelineInfo: type: object required: From 1c1bb904edc5f7fbd13bc5523d53591759ce93f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Thu, 30 Mar 2023 16:24:47 +0300 Subject: [PATCH 211/426] Rename zenith_* labels to neon_* (#3911) ## Describe your changes Get rid of the legacy labeling. Also `neon_region_slug` with the same value as `neon_region` doesn't make much sense, so just drop it. This allows us to drop the relabeling from zenith to neon in the log collector. 
--- .../helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml | 7 +++---- .../helm-values/dev-us-east-2-beta.neon-proxy-link.yaml | 7 +++---- .../dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 7 +++---- .../helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 7 +++---- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 7 +++---- .../prod-eu-central-1-gamma.neon-proxy-scram.yaml | 7 +++---- .../helm-values/prod-us-east-2-delta.neon-proxy-link.yaml | 7 +++---- .../helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 7 +++---- .../prod-us-west-2-eta.neon-proxy-scram-legacy.yaml | 7 +++---- .../helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 7 +++---- 10 files changed, 30 insertions(+), 40 deletions(-) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index ad712c4745..2307856464 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -30,10 +30,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: dev - zenith_region: eu-west-1 - zenith_region_slug: eu-west-1 + neon_service: proxy-scram + neon_env: dev + neon_region: eu-west-1 exposedService: annotations: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 91ddd07eae..feca05aff6 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -15,10 +15,9 @@ settings: # -- Additional labels for neon-proxy-link pods podLabels: - zenith_service: proxy - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy + neon_env: dev + neon_region: us-east-2 service: type: LoadBalancer diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml 
b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 6ec18ff388..feee1b369a 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -15,10 +15,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram-legacy - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram-legacy + neon_env: dev + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index a091be1016..40814e55c9 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -30,10 +30,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram + neon_env: dev + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 8d65e94d00..aa5be89101 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: ap-southeast-1 - zenith_region_slug: ap-southeast-1 + neon_service: proxy-scram + neon_env: prod + neon_region: ap-southeast-1 exposedService: annotations: diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index f806b37482..083af6aa2d 100644 --- 
a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: eu-central-1 - zenith_region_slug: eu-central-1 + neon_service: proxy-scram + neon_env: prod + neon_region: eu-central-1 exposedService: annotations: diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml index eff24302bb..30dcefc151 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml @@ -13,10 +13,9 @@ settings: # -- Additional labels for zenith-proxy pods podLabels: - zenith_service: proxy - zenith_env: production - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy + neon_env: production + neon_region: us-east-2 service: type: LoadBalancer diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 38719f64e7..40fbc52b39 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index d23ea41bd7..a186fb833f 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -31,10 +31,9 @@ 
settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-west-2 - zenith_region_slug: us-west-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-west-2 exposedService: annotations: diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index d5a7d6d575..810a6a5f78 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-west-2 - zenith_region_slug: us-west-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-west-2 exposedService: annotations: From fa54a57ca29f5d04af06b64f5532b38c5430675b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 30 Mar 2023 18:38:45 +0200 Subject: [PATCH 212/426] random_init_delay: remove the minimum of 10 seconds (#3914) Before this patch, the range from which the random delay is picked is at minimum 10 seconds. With this patch, the delay is bounded to whatever the given `period` is, and zero, if period is Duration::ZERO. Motivation for this: the disk usage eviction tests that we'll add in https://github.com/neondatabase/neon/pull/3905 need to wait for the disk usage eviction background loop to do its job. They set a period of 1s. It seems wasteful to wait 10 seconds in the tests.
Co-authored-by: Joonas Koivunen --- pageserver/src/tenant/tasks.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 20d1d2bfb6..8aeacc12f5 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -244,14 +244,12 @@ pub(crate) async fn random_init_delay( ) -> Result<(), Cancelled> { use rand::Rng; + if period == Duration::ZERO { + return Ok(()); + } + let d = { let mut rng = rand::thread_rng(); - - // gen_range asserts that the range cannot be empty, which it could be because period can - // be set to zero to disable gc or compaction, so lets set it to be at least 10s. - let period = std::cmp::max(period, Duration::from_secs(10)); - - // semi-ok default as the source of jitter rng.gen_range(Duration::ZERO..=period) }; From 41d364a8f19aa2b7a9b529fe54c8c75d0a73d38e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Thu, 30 Mar 2023 22:02:39 +0300 Subject: [PATCH 213/426] Add more detailed logging to compute_ctl's shutdown (#3915) Currently we don't see from the logs, if shutting down tracing takes long time or not. We do see that shutting down computes gets delayed for some reason and hits the grace period limit. Moving the shutdown message to slightly later, when we don't have anything else than just exit left. ## Issue ticket number and link ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
--- compute_tools/src/bin/compute_ctl.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b96842e416..f29a576413 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -203,13 +203,14 @@ fn main() -> Result<()> { if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); thread::sleep(Duration::from_secs(30)); - info!("shutting down"); } + info!("shutting down tracing"); // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. tracing_utils::shutdown_tracing(); + info!("shutting down"); exit(exit_code.unwrap_or(1)) } From bf46237fc22800625cf86578ca9027bdfa047d19 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 30 Mar 2023 22:07:19 +0300 Subject: [PATCH 214/426] Fix prefetch for parallel bitmap scan (#3875) ## Describe your changes Fix prefetch for parallel bitmap scan ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 9fd9794436..757df1dab8 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 9fd9794436d02fbfe68f8fca5beab218907cec41 +Subproject commit 757df1dab82f69bdf69469119420a0bbb307f992 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 257aaefb25..f8a650e49b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 257aaefb251c5c85c44652c01bf68c43db62748a +Subproject commit f8a650e49b06d39ad131b860117504044b01f312 From a64dd3ecb58f02826d159dfe07a24b7ee52c9b82 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 31 Mar 2023 13:47:57 +0200 Subject: [PATCH 215/426] disk-usage-based layer eviction (#3809) This patch adds a pageserver-global background loop that evicts layers in response to a shortage of available bytes in the $repo/tenants directory's filesystem. The loop runs periodically at a configurable `period`. Each loop iteration uses `statvfs` to determine filesystem-level space usage. It compares the returned usage data against two different types of thresholds. The iteration tries to evict layers until app-internal accounting says we should be below the thresholds. We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration. We're good if that second statvfs shows that we're _actually_ below the configured thresholds. If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. There are two thresholds: - `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space. If the actual usage is higher, the threshold is exceeded. - `min_avail_bytes` is the absolute available space in bytes. If the actual usage is lower, the threshold is exceeded. 
The iteration evicts layers in LRU fashion with a reservation of up to `tenant_min_resident_size` bytes of the most recent layers per tenant. The layers not part of the per-tenant reservation are evicted least-recently-used first until we're below all thresholds. The `tenant_min_resident_size` can be overridden per tenant as `min_resident_size_override` (bytes). In addition to the loop, there is also an HTTP endpoint to perform one loop iteration synchronous to the request. The endpoint takes an absolute number of bytes that the iteration needs to evict before pressure is relieved. The tests use this endpoint, which is a great simplification over setting up loopback-mounts in the tests, which would be required to test the statvfs part of the implementation. We will rely on manual testing in staging to test the statvfs parts. The HTTP endpoint is also handy in emergencies where an operator wants the pageserver to evict a given amount of space _now_. Hence, its arguments are documented in openapi_spec.yml. The response type isn't documented though because we don't consider it stable. The endpoint should _not_ be used by Console but it could be used by on-call.
Co-authored-by: Joonas Koivunen Co-authored-by: Dmitry Rodionov Co-authored-by: Heikki Linnakangas --- .github/ansible/staging.eu-west-1.hosts.yaml | 8 + .github/ansible/staging.us-east-2.hosts.yaml | 8 + Cargo.lock | 2 + control_plane/src/pageserver.rs | 10 + libs/pageserver_api/src/models.rs | 3 + libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 3 + libs/utils/src/serde_percent.rs | 83 +++ libs/utils/src/serde_regex.rs | 60 ++ pageserver/Cargo.toml | 1 + pageserver/src/bin/pageserver.rs | 27 +- pageserver/src/config.rs | 34 + pageserver/src/disk_usage_eviction_task.rs | 689 ++++++++++++++++++ pageserver/src/http/openapi_spec.yml | 25 + pageserver/src/http/routes.rs | 116 ++- pageserver/src/lib.rs | 2 + pageserver/src/statvfs.rs | 150 ++++ pageserver/src/task_mgr.rs | 3 + pageserver/src/tenant.rs | 10 +- pageserver/src/tenant/config.rs | 9 + pageserver/src/tenant/storage_layer.rs | 17 +- pageserver/src/tenant/timeline.rs | 92 +++ .../src/tenant/timeline/eviction_task.rs | 9 +- test_runner/fixtures/neon_fixtures.py | 34 + .../regress/test_disk_usage_eviction.py | 541 ++++++++++++++ 25 files changed, 1919 insertions(+), 18 deletions(-) create mode 100644 libs/utils/src/serde_percent.rs create mode 100644 libs/utils/src/serde_regex.rs create mode 100644 pageserver/src/disk_usage_eviction_task.rs create mode 100644 pageserver/src/statvfs.rs create mode 100644 test_runner/regress/test_disk_usage_eviction.py diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index b537795704..e8d0bb1dc7 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,6 +8,14 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + # TODO: learn typical resident-size growth rate [GiB/minute] and configure + # 
min_avail_bytes such that we have X minutes of headroom. + min_avail_bytes: 0 + # We assume that the worst-case growth rate is small enough that we can + # catch above-threshold conditions by checking every 10s. + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index cd8f832af0..4ef51651fc 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,6 +8,14 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + # TODO: learn typical resident-size growth rate [GiB/minute] and configure + # min_avail_bytes such that we have X minutes of headroom. + min_avail_bytes: 0 + # We assume that the worst-case growth rate is small enough that we can + # catch above-threshold conditions by checking every 10s. 
+ period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" diff --git a/Cargo.lock b/Cargo.lock index a19a97a40d..4590e76014 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2474,6 +2474,7 @@ dependencies = [ "strum", "strum_macros", "svg_fmt", + "sync_wrapper", "tempfile", "tenant_size_model", "thiserror", @@ -4556,6 +4557,7 @@ dependencies = [ "once_cell", "pin-project-lite", "rand", + "regex", "routerify", "sentry", "serde", diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3c66400a05..094069e4c0 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -363,6 +363,11 @@ impl PageServerNode { .map(|x| serde_json::from_str(x)) .transpose() .context("Failed to parse 'eviction_policy' json")?, + min_resident_size_override: settings + .remove("min_resident_size_override") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'min_resident_size_override' as integer")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -435,6 +440,11 @@ impl PageServerNode { .map(|x| serde_json::from_str(x)) .transpose() .context("Failed to parse 'eviction_policy' json")?, + min_resident_size_override: settings + .get("min_resident_size_override") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'min_resident_size_override' as an integer")?, }) .send()? .error_from_body()?; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0f860d0a6d..98a4b56858 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -120,6 +120,7 @@ pub struct TenantCreateRequest { // We might do that once the eviction feature has stabilizied. // For now, this field is not even documented in the openapi_spec.yml. 
pub eviction_policy: Option, + pub min_resident_size_override: Option, } #[serde_as] @@ -165,6 +166,7 @@ pub struct TenantConfigRequest { // We might do that once the eviction feature has stabilizied. // For now, this field is not even documented in the openapi_spec.yml. pub eviction_policy: Option, + pub min_resident_size_override: Option, } impl TenantConfigRequest { @@ -185,6 +187,7 @@ impl TenantConfigRequest { max_lsn_wal_lag: None, trace_read_requests: None, eviction_policy: None, + min_resident_size_override: None, } } } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index b9f67e82f8..391bc52a80 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,6 +19,7 @@ jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true +regex.workspace = true routerify.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 766d759ab4..d4176911ac 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -51,6 +51,9 @@ pub mod history_buffer; pub mod measured_stream; +pub mod serde_percent; +pub mod serde_regex; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs new file mode 100644 index 0000000000..63b62b5f1e --- /dev/null +++ b/libs/utils/src/serde_percent.rs @@ -0,0 +1,83 @@ +//! A serde::Deserialize type for percentages. +//! +//! See [`Percent`] for details. + +use serde::{Deserialize, Serialize}; + +/// If the value is not an integer between 0 and 100, +/// deserialization fails with a descriptive error. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); + +impl Percent { + pub fn get(&self) -> u8 { + self.0 + } +} + +fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let v: u8 = serde::de::Deserialize::deserialize(deserializer)?; + if v > 100 { + return Err(serde::de::Error::custom( + "must be an integer between 0 and 100", + )); + } + Ok(v) +} + +#[cfg(test)] +mod tests { + use super::Percent; + + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)] + struct Foo { + bar: Percent, + } + + #[test] + fn basics() { + let input = r#"{ "bar": 50 }"#; + let foo: Foo = serde_json::from_str(input).unwrap(); + assert_eq!(foo.bar.get(), 50); + } + #[test] + fn null_handling() { + let input = r#"{ "bar": null }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn zero() { + let input = r#"{ "bar": 0 }"#; + let foo: Foo = serde_json::from_str(input).unwrap(); + assert_eq!(foo.bar.get(), 0); + } + #[test] + fn out_of_range_above() { + let input = r#"{ "bar": 101 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn out_of_range_below() { + let input = r#"{ "bar": -1 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn float() { + let input = r#"{ "bar": 50.5 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn string() { + let input = r#"{ "bar": "50 %" }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } +} diff --git a/libs/utils/src/serde_regex.rs b/libs/utils/src/serde_regex.rs new file mode 100644 index 0000000000..95ea4f8e44 --- /dev/null +++ b/libs/utils/src/serde_regex.rs @@ -0,0 +1,60 @@ +//! A `serde::{Deserialize,Serialize}` type for regexes. 
+ +use std::ops::Deref; + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct Regex( + #[serde( + deserialize_with = "deserialize_regex", + serialize_with = "serialize_regex" + )] + regex::Regex, +); + +fn deserialize_regex<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?; + Ok(re) +} + +fn serialize_regex(re: &regex::Regex, serializer: S) -> Result +where + S: serde::ser::Serializer, +{ + serializer.collect_str(re.as_str()) +} + +impl Deref for Regex { + type Target = regex::Regex; + + fn deref(&self) -> &regex::Regex { + &self.0 + } +} + +impl PartialEq for Regex { + fn eq(&self, other: &Regex) -> bool { + // comparing the automatons would be quite complicated + self.as_str() == other.as_str() + } +} + +impl Eq for Regex {} + +#[cfg(test)] +mod tests { + + #[test] + fn roundtrip() { + let input = r#""foo.*bar""#; + let re: super::Regex = serde_json::from_str(input).unwrap(); + assert!(re.is_match("foo123bar")); + assert!(!re.is_match("foo")); + let output = serde_json::to_string(&re).unwrap(); + assert_eq!(output, input); + } +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8d6641a387..0bc7eba95e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,6 +48,7 @@ serde_json = { workspace = true, features = ["raw_value"] } serde_with.workspace = true signal-hook.workspace = true svg_fmt.workspace = true +sync_wrapper.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index cbfd3e1165..ed23a18ee0 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, Context}; use
clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -314,14 +315,34 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + // shared state between the disk-usage backed eviction background task and the http endpoint + // that allows triggering disk-usage based eviction manually. note that the http endpoint + // is still accessible even if background task is not configured as long as remote storage has + // been configured. + let disk_usage_eviction_state: Arc = Arc::default(); + + if let Some(remote_storage) = &remote_storage { + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + )?; + } + // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, launch_ts, http_auth, remote_storage)? - .build() - .map_err(|err| anyhow!(err))?; + let router = http::make_router( + conf, + launch_ts, + http_auth, + remote_storage, + disk_usage_eviction_state, + )? + .build() + .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? 
.serve(service) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 7293e69f69..19f0f22815 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -27,6 +27,7 @@ use utils::{ logging::LogFormat, }; +use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; @@ -92,6 +93,8 @@ pub mod defaults { #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' +#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -104,6 +107,8 @@ pub mod defaults { #image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} #pitr_interval = '{DEFAULT_PITR_INTERVAL}' +#min_resident_size_override = .. # in bytes + # [remote_storage] "### @@ -180,6 +185,8 @@ pub struct PageServerConf { // See the corresponding metric's help string. 
pub evictions_low_residence_duration_metric_threshold: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, @@ -252,6 +259,8 @@ struct PageServerConfigBuilder { evictions_low_residence_duration_metric_threshold: BuilderValue, + disk_usage_based_eviction: BuilderValue>, + test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, @@ -312,6 +321,8 @@ impl Default for PageServerConfigBuilder { ) .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")), + disk_usage_based_eviction: Set(None), + test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), @@ -431,6 +442,10 @@ impl PageServerConfigBuilder { self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value); } + pub fn disk_usage_based_eviction(&mut self, value: Option) { + self.disk_usage_based_eviction = BuilderValue::Set(value); + } + pub fn ondemand_download_behavior_treat_error_as_warn( &mut self, ondemand_download_behavior_treat_error_as_warn: bool, @@ -515,6 +530,9 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing evictions_low_residence_duration_metric_threshold" ))?, + disk_usage_based_eviction: self + .disk_usage_based_eviction + .ok_or(anyhow!("missing disk_usage_based_eviction"))?, test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, @@ -704,6 +722,12 @@ impl PageServerConf { builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?), + "disk_usage_based_eviction" => { + tracing::info!("disk_usage_based_eviction: {:#?}", &item); + builder.disk_usage_based_eviction( + 
toml_edit::de::from_item(item.clone()) + .context("parse disk_usage_based_eviction")?) + }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } @@ -808,6 +832,13 @@ impl PageServerConf { ); } + if let Some(item) = item.get("min_resident_size_override") { + t_conf.min_resident_size_override = Some( + toml_edit::de::from_item(item.clone()) + .context("parse min_resident_size_override")?, + ); + } + Ok(t_conf) } @@ -850,6 +881,7 @@ impl PageServerConf { defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .unwrap(), + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, } @@ -1058,6 +1090,7 @@ log_format = 'json' evictions_low_residence_duration_metric_threshold: humantime::parse_duration( defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD )?, + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, @@ -1112,6 +1145,7 @@ log_format = 'json' metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), evictions_low_residence_duration_metric_threshold: Duration::from_secs(444), + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs new file mode 100644 index 0000000000..eeeb6fda89 --- /dev/null +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -0,0 +1,689 @@ +//! This module implements the pageserver-global disk-usage-based layer eviction task. +//! +//! # Mechanics +//! +//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background +//! 
loop that evicts layers in response to a shortage of available bytes +//! in the $repo/tenants directory's filesystem. +//! +//! The loop runs periodically at a configurable `period`. +//! +//! Each loop iteration uses `statvfs` to determine filesystem-level space usage. +//! It compares the returned usage data against two different types of thresholds. +//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds. +//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration. +//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds. +//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. +//! +//! # Eviction Policy +//! +//! There are two thresholds: +//! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space. +//! If the actual usage is higher, the threshold is exceeded. +//! `min_avail_bytes` is the absolute available space in bytes. +//! If the actual usage is lower, the threshold is exceeded. +//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction +//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again. +//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant. +//! The reservation is to keep the most recently accessed X bytes per tenant resident. +//! If we cannot relieve pressure by evicting layers outside of the reservation, we +//! start evicting layers that are part of the reservation, LRU first. +//! +//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size` +//! throughout the code, but, no actual variable carries that name. +//! 
The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`. +//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress +//! during page reconstruction. +//! An alternative default for all tenants can be specified in the `tenant_config` section of the config. +//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`). + +// Implementation notes: +// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl +// reading these fields. We use the Debug impl for semi-structured logging, though. + +use std::{ + collections::HashMap, + path::Path, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use anyhow::Context; +use remote_storage::GenericRemoteStorage; +use serde::{Deserialize, Serialize}; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, instrument, warn, Instrument}; +use utils::serde_percent::Percent; + +use crate::{ + config::PageServerConf, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + tenant::{self, storage_layer::PersistentLayer, Timeline}, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, +} + +#[derive(Default)] +pub struct State { + /// Exclude http requests and background task from running at the same time. 
+ mutex: tokio::sync::Mutex<()>, +} + +pub fn launch_disk_usage_global_eviction_task( + conf: &'static PageServerConf, + storage: GenericRemoteStorage, + state: Arc, +) -> anyhow::Result<()> { + let Some(task_config) = &conf.disk_usage_based_eviction else { + info!("disk usage based eviction task not configured"); + return Ok(()); + }; + + info!("launching disk usage based eviction task"); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::DiskUsageEviction, + None, + None, + "disk usage based eviction", + false, + async move { + disk_usage_eviction_task( + &state, + task_config, + storage, + &conf.tenants_path(), + task_mgr::shutdown_token(), + ) + .await; + info!("disk usage based eviction task finishing"); + Ok(()) + }, + ); + + Ok(()) +} + +#[instrument(skip_all)] +async fn disk_usage_eviction_task( + state: &State, + task_config: &DiskUsageEvictionTaskConfig, + storage: GenericRemoteStorage, + tenants_dir: &Path, + cancel: CancellationToken, +) { + use crate::tenant::tasks::random_init_delay; + { + if random_init_delay(task_config.period, &cancel) + .await + .is_err() + { + info!("shutting down"); + return; + } + } + + let mut iteration_no = 0; + loop { + iteration_no += 1; + let start = Instant::now(); + + async { + let res = disk_usage_eviction_task_iteration( + state, + task_config, + &storage, + tenants_dir, + &cancel, + ) + .await; + + match res { + Ok(()) => {} + Err(e) => { + // these stat failures are expected to be very rare + warn!("iteration failed, unexpected error: {e:#}"); + } + } + } + .instrument(tracing::info_span!("iteration", iteration_no)) + .await; + + let sleep_until = start + task_config.period; + tokio::select! 
{ + _ = tokio::time::sleep_until(sleep_until) => {}, + _ = cancel.cancelled() => { + info!("shutting down"); + break + } + } + } +} + +pub trait Usage: Clone + Copy + std::fmt::Debug { + fn has_pressure(&self) -> bool; + fn add_available_bytes(&mut self, bytes: u64); +} + +async fn disk_usage_eviction_task_iteration( + state: &State, + task_config: &DiskUsageEvictionTaskConfig, + storage: &GenericRemoteStorage, + tenants_dir: &Path, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) + .context("get filesystem-level disk usage before evictions")?; + let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; + match res { + Ok(outcome) => { + debug!(?outcome, "disk_usage_eviction_iteration finished"); + match outcome { + IterationOutcome::NoPressure | IterationOutcome::Cancelled => { + // nothing to do, select statement below will handle things + } + IterationOutcome::Finished(outcome) => { + // Verify with statvfs whether we made any real progress + let after = filesystem_level_usage::get(tenants_dir, task_config) + // It's quite unlikely to hit the error here. Keep the code simple and bail out. + .context("get filesystem-level disk usage after evictions")?; + + debug!(?after, "disk usage"); + + if after.has_pressure() { + // Don't bother doing an out-of-order iteration here now. + // In practice, the task period is set to a value in the tens-of-seconds range, + // which will cause another iteration to happen soon enough. 
+ // TODO: deltas between the three different usages would be helpful, + // consider MiB, GiB, TiB + warn!(?outcome, ?after, "disk usage still high"); + } else { + info!(?outcome, ?after, "disk usage pressure relieved"); + } + } + } + } + Err(e) => { + error!("disk_usage_eviction_iteration failed: {:#}", e); + } + } + + Ok(()) +} + +#[derive(Debug, Serialize)] +#[allow(clippy::large_enum_variant)] +pub enum IterationOutcome { + NoPressure, + Cancelled, + Finished(IterationOutcomeFinished), +} + +#[allow(dead_code)] +#[derive(Debug, Serialize)] +pub struct IterationOutcomeFinished { + /// The actual usage observed before we started the iteration. + before: U, + /// The expected value for `after`, according to internal accounting, after phase 1. + planned: PlannedUsage, + /// The outcome of phase 2, where we actually do the evictions. + /// + /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will + /// be the same as `planned`. + assumed: AssumedUsage, +} + +#[derive(Debug, Serialize)] +#[allow(dead_code)] +struct AssumedUsage { + /// The expected value for `after`, after phase 2. + projected_after: U, + /// The layers we failed to evict during phase 2. 
+ failed: LayerCount, +} + +#[allow(dead_code)] +#[derive(Debug, Serialize)] +struct PlannedUsage { + respecting_tenant_min_resident_size: U, + fallback_to_global_lru: Option, +} + +#[allow(dead_code)] +#[derive(Debug, Default, Serialize)] +struct LayerCount { + file_sizes: u64, + count: usize, +} + +pub async fn disk_usage_eviction_task_iteration_impl( + state: &State, + storage: &GenericRemoteStorage, + usage_pre: U, + cancel: &CancellationToken, +) -> anyhow::Result> { + // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) + let _g = state + .mutex + .try_lock() + .map_err(|_| anyhow::anyhow!("iteration is already executing"))?; + + debug!(?usage_pre, "disk usage"); + + if !usage_pre.has_pressure() { + return Ok(IterationOutcome::NoPressure); + } + + warn!( + ?usage_pre, + "running disk usage based eviction due to pressure" + ); + + let candidates = match collect_eviction_candidates(cancel).await? { + EvictionCandidates::Cancelled => { + return Ok(IterationOutcome::Cancelled); + } + EvictionCandidates::Finished(partitioned) => partitioned, + }; + + // Debug-log the list of candidates + let now = SystemTime::now(); + for (i, (partition, candidate)) in candidates.iter().enumerate() { + debug!( + "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}", + i + 1, + candidates.len(), + candidate.layer.file_size(), + now.duration_since(candidate.last_activity_ts) + .unwrap() + .as_micros(), + partition, + candidate.layer.get_tenant_id(), + candidate.layer.get_timeline_id(), + candidate.layer.filename().file_name(), + ); + } + + // phase1: select victims to relieve pressure + // + // Walk through the list of candidates, until we have accumulated enough layers to get + // us back under the pressure threshold. 'usage_planned' is updated so that it tracks + // how much disk space would be used after evicting all the layers up to the current + // point in the list. 
The layers are collected in 'batched', grouped per timeline. + // + // If we get far enough in the list that we start to evict layers that are below + // the tenant's min-resident-size threshold, print a warning, and memorize the disk + // usage at that point, in 'usage_planned_min_resident_size_respecting'. + let mut batched: HashMap<_, Vec>> = HashMap::new(); + let mut warned = None; + let mut usage_planned = usage_pre; + for (i, (partition, candidate)) in candidates.into_iter().enumerate() { + if !usage_planned.has_pressure() { + debug!( + no_candidates_evicted = i, + "took enough candidates for pressure to be relieved" + ); + break; + } + + if partition == MinResidentSizePartition::Below && warned.is_none() { + warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); + warned = Some(usage_planned); + } + + usage_planned.add_available_bytes(candidate.layer.file_size()); + + batched + .entry(TimelineKey(candidate.timeline)) + .or_default() + .push(candidate.layer); + } + + let usage_planned = match warned { + Some(respecting_tenant_min_resident_size) => PlannedUsage { + respecting_tenant_min_resident_size, + fallback_to_global_lru: Some(usage_planned), + }, + None => PlannedUsage { + respecting_tenant_min_resident_size: usage_planned, + fallback_to_global_lru: None, + }, + }; + debug!(?usage_planned, "usage planned"); + + // phase2: evict victims batched by timeline + + // After the loop, `usage_assumed` is the post-eviction usage, + // according to internal accounting. 
+ let mut usage_assumed = usage_pre; + let mut evictions_failed = LayerCount::default(); + for (timeline, batch) in batched { + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let batch_size = batch.len(); + + debug!(%timeline_id, "evicting batch for timeline"); + + async { + let results = timeline.evict_layers(storage, &batch, cancel.clone()).await; + + match results { + Err(e) => { + warn!("failed to evict batch: {:#}", e); + } + Ok(results) => { + assert_eq!(results.len(), batch.len()); + for (result, layer) in results.into_iter().zip(batch.iter()) { + match result { + Some(Ok(true)) => { + usage_assumed.add_available_bytes(layer.file_size()); + } + Some(Ok(false)) => { + // this is: + // - Replacement::{NotFound, Unexpected} + // - it cannot be is_remote_layer, filtered already + evictions_failed.file_sizes += layer.file_size(); + evictions_failed.count += 1; + } + None => { + assert!(cancel.is_cancelled()); + return; + } + Some(Err(e)) => { + // we really shouldn't be getting this, precondition failure + error!("failed to evict layer: {:#}", e); + } + } + } + } + } + } + .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size)) + .await; + + if cancel.is_cancelled() { + return Ok(IterationOutcome::Cancelled); + } + } + + Ok(IterationOutcome::Finished(IterationOutcomeFinished { + before: usage_pre, + planned: usage_planned, + assumed: AssumedUsage { + projected_after: usage_assumed, + failed: evictions_failed, + }, + })) +} + +#[derive(Clone)] +struct EvictionCandidate { + timeline: Arc, + layer: Arc, + last_activity_ts: SystemTime, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +enum MinResidentSizePartition { + Above, + Below, +} + +enum EvictionCandidates { + Cancelled, + Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), +} + +/// Gather the eviction candidates. +/// +/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction +/// order. 
A caller that evicts in that order, until pressure is relieved, implements +/// the eviction policy outlined in the module comment. +/// +/// # Example +/// +/// Imagine that there are two tenants, A and B, with five layers each, a-e. +/// Each layer has size 100, and both tenant's min_resident_size is 150. +/// The eviction order would be +/// +/// ```text +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e +/// ``` +/// +/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. +/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size. +/// +/// But, if we need to evict 900 bytes to relieve pressure, we'd evict +/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition +/// after exhauting the `Above` partition. +/// So, we did not respect each tenant's min_resident_size. +async fn collect_eviction_candidates( + cancel: &CancellationToken, +) -> anyhow::Result { + // get a snapshot of the list of tenants + let tenants = tenant::mgr::list_tenants() + .await + .context("get list of tenants")?; + + let mut candidates = Vec::new(); + + for (tenant_id, _state) in &tenants { + if cancel.is_cancelled() { + return Ok(EvictionCandidates::Cancelled); + } + let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await { + Ok(tenant) => tenant, + Err(e) => { + // this can happen if tenant has lifecycle transition after we fetched it + debug!("failed to get tenant: {e:#}"); + continue; + } + }; + + // collect layers from all timelines in this tenant + // + // If one of the timelines becomes `!is_active()` during the iteration, + // for example because we're shutting down, then `max_layer_size` can be too small. + // That's OK. 
This code only runs under a disk pressure situation, and being + // a little unfair to tenants during shutdown in such a situation is tolerable. + let mut tenant_candidates = Vec::new(); + let mut max_layer_size = 0; + for tl in tenant.list_timelines() { + if !tl.is_active() { + continue; + } + let info = tl.get_local_layers_for_disk_usage_eviction(); + debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend( + info.resident_layers + .into_iter() + .map(|layer_infos| (tl.clone(), layer_infos)), + ); + max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); + + if cancel.is_cancelled() { + return Ok(EvictionCandidates::Cancelled); + } + } + + // `min_resident_size` defaults to maximum layer file size of the tenant. + // This ensures that each tenant can have at least one layer resident at a given time, + // ensuring forward progress for a single Timeline::get in that tenant. + // It's a questionable heuristic since, usually, there are many Timeline::get + // requests going on for a tenant, and, at least in Neon prod, the median + // layer file size is much smaller than the compaction target size. + // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. + // That's what's typically used by the various background loops. + // + // The default can be overriden with a fixed value in the tenant conf. + // A default override can be put in the default tenant conf in the pageserver.toml. 
+ let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { + debug!( + tenant_id=%tenant.tenant_id(), + overriden_size=s, + "using overridden min resident size for tenant" + ); + s + } else { + debug!( + tenant_id=%tenant.tenant_id(), + max_layer_size, + "using max layer size as min_resident_size for tenant", + ); + max_layer_size + }; + + // Sort layers most-recently-used first, then partition by + // cumsum above/below min_resident_size. + tenant_candidates + .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); + let mut cumsum: i128 = 0; + for (timeline, layer_info) in tenant_candidates.into_iter() { + let file_size = layer_info.file_size(); + let candidate = EvictionCandidate { + timeline, + last_activity_ts: layer_info.last_activity_ts, + layer: layer_info.layer, + }; + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above + } else { + MinResidentSizePartition::Below + }; + candidates.push((partition, candidate)); + cumsum += i128::from(file_size); + } + } + + debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + candidates + .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + Ok(EvictionCandidates::Finished(candidates)) +} + +struct TimelineKey(Arc); + +impl PartialEq for TimelineKey { + fn eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.0, &other.0) + } +} + +impl Eq for TimelineKey {} + +impl std::hash::Hash for TimelineKey { + fn hash(&self, state: &mut H) { + Arc::as_ptr(&self.0).hash(state); + } +} + +impl std::ops::Deref for TimelineKey { + type Target = Timeline; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +mod filesystem_level_usage { + use std::path::Path; + + use anyhow::Context; + + use crate::statvfs::Statvfs; + + use 
super::DiskUsageEvictionTaskConfig; + + #[derive(Debug, Clone, Copy)] + #[allow(dead_code)] + pub struct Usage<'a> { + config: &'a DiskUsageEvictionTaskConfig, + + /// Filesystem capacity + total_bytes: u64, + /// Free filesystem space + avail_bytes: u64, + } + + impl super::Usage for Usage<'_> { + fn has_pressure(&self) -> bool { + let usage_pct = + (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64; + + let pressures = [ + ( + "min_avail_bytes", + self.avail_bytes < self.config.min_avail_bytes, + ), + ( + "max_usage_pct", + usage_pct > self.config.max_usage_pct.get() as u64, + ), + ]; + + pressures.into_iter().any(|(_, has_pressure)| has_pressure) + } + + fn add_available_bytes(&mut self, bytes: u64) { + self.avail_bytes += bytes; + } + } + + pub fn get<'a>( + tenants_dir: &Path, + config: &'a DiskUsageEvictionTaskConfig, + ) -> anyhow::Result> { + let mock_config = { + #[cfg(feature = "testing")] + { + config.mock_statvfs.as_ref() + } + #[cfg(not(feature = "testing"))] + { + None + } + }; + + let stat = Statvfs::get(tenants_dir, mock_config) + .context("statvfs failed, presumably directory got unlinked")?; + + // https://unix.stackexchange.com/a/703650 + let blocksize = if stat.fragment_size() > 0 { + stat.fragment_size() + } else { + stat.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = stat.blocks_available() * blocksize; + let total_bytes = stat.blocks() * blocksize; + + Ok(Usage { + config, + total_bytes, + avail_bytes, + }) + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index eda4a60e95..478e9d228a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -27,6 +27,31 @@ paths: id: type: integer + /v1/disk_usage_eviction/run: + put: + description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space. 
+ security: [] + requestBody: + content: + application/json: + schema: + type: object + required: + - evict_bytes + properties: + evict_bytes: + type: integer + responses: + "200": + description: | + The run completed. + This does not necessarily mean that we actually evicted `evict_bytes`. + Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`. + content: + application/json: + schema: + type: object + /v1/tenant/{tenant_id}: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b0addc82f1..2db60f557d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,6 +18,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; @@ -48,6 +49,7 @@ struct State { auth: Option>, allowlist_routes: Vec, remote_storage: Option, + disk_usage_eviction_state: Arc, } impl State { @@ -55,6 +57,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -65,6 +68,7 @@ impl State { auth, allowlist_routes, remote_storage, + disk_usage_eviction_state, }) } } @@ -775,6 +779,8 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; + + let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) + .await + .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; + + tenant.set_broken("broken from test"); + + json_response(StatusCode::OK, ()) +} + #[cfg(feature = "testing")] async fn failpoints_handler(mut request: Request) -> Result, ApiError> { 
if !fail::has_failpoints() { @@ -1063,6 +1085,89 @@ async fn always_panic_handler(req: Request) -> Result, ApiE json_response(StatusCode::NO_CONTENT, ()) } +async fn disk_usage_eviction_run(mut r: Request) -> Result, ApiError> { + check_permission(&r, None)?; + + #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] + struct Config { + /// How many bytes to evict before reporting that pressure is relieved. + evict_bytes: u64, + } + + #[derive(Debug, Clone, Copy, serde::Serialize)] + struct Usage { + // remains unchanged after instantiation of the struct + config: Config, + // updated by `add_available_bytes` + freed_bytes: u64, + } + + impl crate::disk_usage_eviction_task::Usage for Usage { + fn has_pressure(&self) -> bool { + self.config.evict_bytes > self.freed_bytes + } + + fn add_available_bytes(&mut self, bytes: u64) { + self.freed_bytes += bytes; + } + } + + let config = json_request::(&mut r) + .await + .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + + let usage = Usage { + config, + freed_bytes: 0, + }; + + use crate::task_mgr::MGMT_REQUEST_RUNTIME; + + let (tx, rx) = tokio::sync::oneshot::channel(); + + let state = get_state(&r); + + let Some(storage) = state.remote_storage.clone() else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "remote storage not configured, cannot run eviction iteration" + ))) + }; + + let state = state.disk_usage_eviction_state.clone(); + + let cancel = CancellationToken::new(); + let child_cancel = cancel.clone(); + let _g = cancel.drop_guard(); + + crate::task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::DiskUsageEviction, + None, + None, + "ondemand disk usage eviction", + false, + async move { + let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( + &state, + &storage, + usage, + &child_cancel, + ) + .await; + + info!(?res, "disk_usage_eviction_task_iteration_impl finished"); + + let _ = tx.send(res); + Ok(()) + } + 
.in_current_span(), + ); + + let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1075,6 +1180,7 @@ pub fn make_router( launch_ts: &'static LaunchTimestamp, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -1119,7 +1225,8 @@ pub fn make_router( Ok(router .data(Arc::new( - State::new(conf, auth, remote_storage).context("Failed to initialize router state")?, + State::new(conf, auth, remote_storage, disk_usage_eviction_state) + .context("Failed to initialize router state")?, )) .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) .put( @@ -1200,6 +1307,13 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", |r| RequestSpan(evict_timeline_layer_handler).handle(r), ) + .put("/v1/disk_usage_eviction/run", |r| { + RequestSpan(disk_usage_eviction_run).handle(r) + }) + .put( + "/v1/tenant/:tenant_id/break", + testing_api!("set tenant state to broken", handle_tenant_break), + ) .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 09e21ae755..278658eba3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -4,6 +4,7 @@ pub mod broker_client; pub mod config; pub mod consumption_metrics; pub mod context; +pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -12,6 +13,7 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod repository; +pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod trace; diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs new file mode 100644 index 
0000000000..28d950b5e6 --- /dev/null +++ b/pageserver/src/statvfs.rs @@ -0,0 +1,150 @@ +//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking. + +use std::path::Path; + +pub enum Statvfs { + Real(nix::sys::statvfs::Statvfs), + Mock(mock::Statvfs), +} + +// NB: on macOS, the block count type of struct statvfs is u32. +// The workaround seems to be to use the non-standard statfs64 call. +// Sincce it should only be a problem on > 2TiB disks, let's ignore +// the problem for now and upcast to u64. +impl Statvfs { + pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result { + if let Some(mocked) = mocked { + Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?)) + } else { + Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?)) + } + } + + // NB: allow() because the block count type is u32 on macOS. + #[allow(clippy::useless_conversion)] + pub fn blocks(&self) -> u64 { + match self { + Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(), + Statvfs::Mock(stat) => stat.blocks, + } + } + + // NB: allow() because the block count type is u32 on macOS. 
+ #[allow(clippy::useless_conversion)] + pub fn blocks_available(&self) -> u64 { + match self { + Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(), + Statvfs::Mock(stat) => stat.blocks_available, + } + } + + pub fn fragment_size(&self) -> u64 { + match self { + Statvfs::Real(stat) => stat.fragment_size(), + Statvfs::Mock(stat) => stat.fragment_size, + } + } + + pub fn block_size(&self) -> u64 { + match self { + Statvfs::Real(stat) => stat.block_size(), + Statvfs::Mock(stat) => stat.block_size, + } + } +} + +pub mod mock { + use anyhow::Context; + use regex::Regex; + use std::path::Path; + use tracing::log::info; + + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + total_blocks: u64, + name_filter: Option, + }, + Failure { + mocked_error: MockedError, + }, + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[allow(clippy::upper_case_acronyms)] + pub enum MockedError { + EIO, + } + + impl From for nix::Error { + fn from(e: MockedError) -> Self { + match e { + MockedError::EIO => nix::Error::EIO, + } + } + } + + pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result { + info!("running mocked statvfs"); + + match behavior { + Behavior::Success { + blocksize, + total_blocks, + ref name_filter, + } => { + let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); + + // round it up to the nearest block multiple + let used_blocks = (used_bytes + (blocksize - 1)) / blocksize; + + if used_blocks > *total_blocks { + panic!( + "mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}" + ); + } + + let avail_blocks = total_blocks - used_blocks; + + Ok(Statvfs { + blocks: *total_blocks, + blocks_available: avail_blocks, + fragment_size: *blocksize, + block_size: *blocksize, + }) + } + Behavior::Failure { mocked_error } => Err((*mocked_error).into()), + } + 
} + + fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result { + let mut total = 0; + for entry in walkdir::WalkDir::new(path) { + let entry = entry?; + if !entry.file_type().is_file() { + continue; + } + if !name_filter + .as_ref() + .map(|filter| filter.is_match(entry.file_name().to_str().unwrap())) + .unwrap_or(true) + { + continue; + } + total += entry + .metadata() + .with_context(|| format!("get metadata of {:?}", entry.path()))? + .len(); + } + Ok(total) + } + + pub struct Statvfs { + pub blocks: u64, + pub blocks_available: u64, + pub fragment_size: u64, + pub block_size: u64, + } +} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 44b1bbb06d..82aebc6c07 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -234,6 +234,9 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, + /// See [`crate::disk_usage_eviction_task`]. + DiskUsageEviction, + // Initial logical size calculation InitialLogicalSizeCalculation, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2c5226e5bc..7fac7d2ac0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -95,7 +95,7 @@ mod timeline; pub mod size; -pub use timeline::{PageReconstructError, Timeline}; +pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline}; // re-export this function so that page_cache.rs can use it. 
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -1706,6 +1706,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } + pub fn get_min_resident_size_override(&self) -> Option { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .min_resident_size_override + .or(self.conf.default_tenant_conf.min_resident_size_override) + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { *self.tenant_conf.write().unwrap() = new_tenant_conf; } @@ -2783,6 +2790,7 @@ pub mod harness { max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), + min_resident_size_override: tenant_conf.min_resident_size_override, } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 48cb6be121..cdabb23a7b 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -92,6 +92,7 @@ pub struct TenantConf { pub max_lsn_wal_lag: NonZeroU64, pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, + pub min_resident_size_override: Option, } /// Same as TenantConf, but this struct preserves the information about @@ -159,6 +160,10 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub eviction_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub min_resident_size_override: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -220,6 +225,9 @@ impl TenantConfOpt { .trace_read_requests .unwrap_or(global_conf.trace_read_requests), eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), + min_resident_size_override: self + .min_resident_size_override + .or(global_conf.min_resident_size_override), } } } @@ -251,6 +259,7 @@ impl Default for TenantConf { .expect("cannot parse default max walreceiver Lsn 
wal lag"), trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, + min_resident_size_override: None, } } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c36b6121c0..2ee723e7c3 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -121,10 +121,10 @@ struct LayerAccessStatsInner { } #[derive(Debug, Clone, Copy)] -pub(super) struct LayerAccessStatFullDetails { - pub(super) when: SystemTime, - pub(super) task_kind: TaskKind, - pub(super) access_kind: LayerAccessKind, +pub(crate) struct LayerAccessStatFullDetails { + pub(crate) when: SystemTime, + pub(crate) task_kind: TaskKind, + pub(crate) access_kind: LayerAccessKind, } #[derive(Clone, Copy, strum_macros::EnumString)] @@ -255,7 +255,7 @@ impl LayerAccessStats { ret } - pub(super) fn most_recent_access_or_residence_event( + fn most_recent_access_or_residence_event( &self, ) -> Either { let locked = self.0.lock().unwrap(); @@ -268,6 +268,13 @@ impl LayerAccessStats { } } } + + pub(crate) fn latest_activity(&self) -> SystemTime { + match self.most_recent_access_or_residence_event() { + Either::Left(mra) => mra.when, + Either::Right(re) => re.timestamp, + } + } } /// Supertrait of the [`Layer`] trait that captures the bare minimum interface diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e1db34ec1b..b40cb05411 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -13,6 +13,7 @@ use pageserver_api::models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState, }; +use remote_storage::GenericRemoteStorage; use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -957,6 +958,25 @@ impl Timeline { } } + /// Evict a batch of layers. 
+ /// + /// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured." + /// + /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html + pub async fn evict_layers( + &self, + _: &GenericRemoteStorage, + layers_to_evict: &[Arc], + cancel: CancellationToken, + ) -> anyhow::Result>>> { + let remote_client = self.remote_client.clone().expect( + "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient", + ); + + self.evict_layer_batch(&remote_client, layers_to_evict, cancel) + .await + } + /// Evict multiple layers at once, continuing through errors. /// /// Try to evict the given `layers_to_evict` by @@ -994,6 +1014,15 @@ impl Timeline { // now lock out layer removal (compaction, gc, timeline deletion) let layer_removal_guard = self.layer_removal_cs.lock().await; + { + // to avoid racing with detach and delete_timeline + let state = self.current_state(); + anyhow::ensure!( + state == TimelineState::Active, + "timeline is not active but {state:?}" + ); + } + // start the batch update let mut layer_map = self.layers.write().unwrap(); let mut batch_updates = layer_map.batch_update(); @@ -1027,6 +1056,8 @@ impl Timeline { use super::layer_map::Replacement; if local_layer.is_remote_layer() { + // TODO(issue #3851): consider returning an err here instead of false, + // which is the same out the match later return Ok(false); } @@ -4012,6 +4043,67 @@ impl Timeline { } } +pub struct DiskUsageEvictionInfo { + /// Timeline's largest layer (remote or resident) + pub max_layer_size: Option, + /// Timeline's resident layers + pub resident_layers: Vec, +} + +pub struct LocalLayerInfoForDiskUsageEviction { + pub layer: Arc, + pub last_activity_ts: SystemTime, +} + +impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it + // 
having to allocate a string to this is bad, but it will rarely be formatted + let ts = chrono::DateTime::::from(self.last_activity_ts); + let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); + f.debug_struct("LocalLayerInfoForDiskUsageEviction") + .field("layer", &self.layer) + .field("last_activity", &ts) + .finish() + } +} + +impl LocalLayerInfoForDiskUsageEviction { + pub fn file_size(&self) -> u64 { + self.layer.file_size() + } +} + +impl Timeline { + pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { + let layers = self.layers.read().unwrap(); + + let mut max_layer_size: Option = None; + let mut resident_layers = Vec::new(); + + for l in layers.iter_historic_layers() { + let file_size = l.file_size(); + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + + if l.is_remote_layer() { + continue; + } + + let last_activity_ts = l.access_stats().latest_activity(); + + resident_layers.push(LocalLayerInfoForDiskUsageEviction { + layer: l, + last_activity_ts, + }); + } + + DiskUsageEvictionInfo { + max_layer_size, + resident_layers, + } + } +} + type TraversalPathItem = ( ValueReconstructResult, Lsn, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 107cd89b90..cf799a8808 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -20,7 +20,6 @@ use std::{ time::{Duration, SystemTime}, }; -use either::Either; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; @@ -185,13 +184,7 @@ impl Timeline { if hist_layer.is_remote_layer() { continue; } - let last_activity_ts = match hist_layer - .access_stats() - .most_recent_access_or_residence_event() - { - Either::Left(mra) => mra.when, - Either::Right(re) => re.timestamp, - }; + let last_activity_ts = hist_layer.access_stats().latest_activity(); let 
no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9929d3e66b..a232bf8b6d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1220,6 +1220,28 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) return TenantConfig.from_json(res.json()) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): + assert "tenant_id" not in config.keys() + res = self.put( + f"http://localhost:{self.port}/v1/tenant/config", + json={**config, "tenant_id": str(tenant_id)}, + ) + self.verbose_error(res) + + def patch_tenant_config_client_side( + self, + tenant_id: TenantId, + inserts: Optional[Dict[str, Any]] = None, + removes: Optional[List[str]] = None, + ): + current = self.tenant_config(tenant_id).tenant_specific_overrides + if inserts is not None: + current.update(inserts) + if removes is not None: + for key in removes: + del current[key] + self.set_tenant_config(tenant_id, current) + def tenant_size(self, tenant_id: TenantId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] @@ -1536,6 +1558,18 @@ class PageserverHttpClient(requests.Session): for layer in info.historic_layers: self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + def disk_usage_eviction_run(self, request: dict[str, Any]): + res = self.put( + f"http://localhost:{self.port}/v1/disk_usage_eviction/run", + json=request, + ) + self.verbose_error(res) + return res.json() + + def tenant_break(self, tenant_id: TenantId): + res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") + self.verbose_error(res) + @dataclass class TenantConfig: diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py new file mode 100644 index 0000000000..6ed09734fe --- /dev/null +++ b/test_runner/regress/test_disk_usage_eviction.py @@ 
-0,0 +1,541 @@ +import shutil +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Tuple + +import pytest +import toml +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + PgBin, + RemoteStorageKind, + wait_for_last_flush_lsn, + wait_for_upload_queue_empty, + wait_until, +) +from fixtures.types import Lsn, TenantId, TimelineId + +GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" + + +@pytest.mark.parametrize("config_level_override", [None, 400]) +def test_min_resident_size_override_handling( + neon_env_builder: NeonEnvBuilder, config_level_override: int +): + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + def assert_config(tenant_id, expect_override, expect_effective): + config = ps_http.tenant_config(tenant_id) + assert config.tenant_specific_overrides.get("min_resident_size_override") == expect_override + assert config.effective_config.get("min_resident_size_override") == expect_effective + + def assert_overrides(tenant_id, default_tenant_conf_value): + ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200}) + assert_config(tenant_id, 200, 200) + + ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0}) + assert_config(tenant_id, 0, 0) + + ps_http.set_tenant_config(tenant_id, {}) + assert_config(tenant_id, None, default_tenant_conf_value) + + env.pageserver.stop() + if config_level_override is not None: + env.pageserver.start( + overrides=( + "--pageserver-config-override=tenant_config={ min_resident_size_override = " + + str(config_level_override) + + " }", + ) + ) + else: + env.pageserver.start() + + tenant_id, _ = env.neon_cli.create_tenant() + assert_overrides(tenant_id, config_level_override) + + # Also ensure that specifying the parameter to create_tenant works, in
addition to http-level reconfig. + tenant_id, _ = env.neon_cli.create_tenant(conf={"min_resident_size_override": "100"}) + assert_config(tenant_id, 100, 100) + ps_http.set_tenant_config(tenant_id, {}) + assert_config(tenant_id, None, config_level_override) + + +@dataclass +class EvictionEnv: + timelines: list[Tuple[TenantId, TimelineId]] + neon_env: NeonEnv + pg_bin: PgBin + pageserver_http: PageserverHttpClient + layer_size: int + pgbench_init_lsns: Dict[TenantId, Lsn] + + def timelines_du(self) -> Tuple[int, int, int]: + return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines]) + + def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]: + return { + (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0] + for tid, tlid in self.timelines + } + + def warm_up_tenant(self, tenant_id: TenantId): + """ + Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it. + This assumes that the tenant is still at the state after pgbench -i. + """ + lsn = self.pgbench_init_lsns[tenant_id] + with self.neon_env.postgres.create_start("main", tenant_id=tenant_id, lsn=lsn) as pg: + self.pg_bin.run(["pgbench", "-S", pg.connstr()]) + + def pageserver_start_with_disk_usage_eviction( + self, period, max_usage_pct, min_avail_bytes, mock_behavior + ): + disk_usage_config = { + "period": period, + "max_usage_pct": max_usage_pct, + "min_avail_bytes": min_avail_bytes, + "mock_statvfs": mock_behavior, + } + + enc = toml.TomlEncoder() + + self.neon_env.pageserver.start( + overrides=( + "--pageserver-config-override=disk_usage_based_eviction=" + + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + ), + ) + + def statvfs_called(): + assert self.neon_env.pageserver.log_contains(".*running mocked statvfs.*") + + wait_until(10, 1, statvfs_called) + + +@pytest.fixture +def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: + """ + Creates two tenants, one somewhat larger than the other.
+ """ + + log.info(f"setting up eviction_env for test {request.node.name}") + + neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # allow because we are invoking this manually; we always warn on executing disk based eviction + env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*") + + # remove the initial tenant + ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865 + assert env.initial_timeline + wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, env.initial_timeline) + pageserver_http.tenant_detach(env.initial_tenant) + assert isinstance(env.remote_storage, LocalFsStorage) + tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant) + assert tenant_remote_storage.is_dir() + shutil.rmtree(tenant_remote_storage) + env.initial_tenant = TenantId("0" * 32) + env.initial_timeline = None + + # Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers. + # Large count of layers and small layer size is good for testing because it makes evictions predictable. + # Predictable in the sense that many layer evictions will be required to reach the eviction target, because + # each eviction only makes small progress. That means little overshoot, and thereby stable asserts. 
+ pgbench_scales = [4, 6] + layer_size = 5 * 1024**2 + + pgbench_init_lsns = {} + + timelines = [] + for scale in pgbench_scales: + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{layer_size}", + "image_creation_threshold": "100", + "compaction_target_size": f"{layer_size}", + } + ) + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + pg_bin.run(["pgbench", "-i", f"-s{scale}", pg.connstr()]) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + timelines.append((tenant_id, timeline_id)) + + # stop the safekeepers to avoid on-demand downloads caused by + # initial logical size calculation triggered by walreceiver connection status + # when we restart the pageserver process in any of the tests + env.neon_cli.safekeeper_stop() + + # after stopping the safekeepers, we know that no new WAL will be coming in + for tenant_id, timeline_id in timelines: + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id) + tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) + assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] + assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] + pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"]) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + log.info(f"{layers}") + assert ( + len(layers.historic_layers) >= 10 + ), "evictions happen at layer granularity, but we often assert at byte-granularity" + + eviction_env = EvictionEnv( + timelines=timelines, + neon_env=env, + pageserver_http=pageserver_http, + layer_size=layer_size, + pg_bin=pg_bin, + pgbench_init_lsns=pgbench_init_lsns, + ) + + return eviction_env + + +def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): + env = eviction_env + + env.neon_env.pageserver.allowed_errors.append( + r".* Changing Active tenant 
to Broken state, reason: broken from test" + ) + broken_tenant_id, broken_timeline_id = env.timelines[0] + env.pageserver_http.tenant_break(broken_tenant_id) + + healthy_tenant_id, healthy_timeline_id = env.timelines[1] + + broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)]) + healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)]) + + # try to evict everything, then validate that broken tenant wasn't touched + target = broken_size_pre + healthy_size_pre + + response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)]) + healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)]) + + assert broken_size_pre == broken_size_post, "broken tenant should not be touched" + assert healthy_size_post < healthy_size_pre + assert healthy_size_post == 0 + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + + +def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv): + """ + Basic test to ensure that we evict enough to relieve pressure. 
+ """ + env = eviction_env + pageserver_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + + target = total_on_disk // 2 + + response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + + actual_change = total_on_disk - later_total_on_disk + + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "must evict more than half" + assert ( + response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change + ), "report accurately evicted bytes" + assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" + + +def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): + """ + Override tenant min resident and ensure that it will be respected by eviction. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + du_by_timeline = env.du_by_timeline() + log.info("du_by_timeline: %s", du_by_timeline) + + assert len(du_by_timeline) == 2, "this test assumes two tenants" + large_tenant = max(du_by_timeline, key=du_by_timeline.__getitem__) + small_tenant = min(du_by_timeline, key=du_by_timeline.__getitem__) + assert du_by_timeline[large_tenant] > du_by_timeline[small_tenant] + assert ( + du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size + ), "ensure this test will do more than 1 eviction" + + # Give the larger tenant a haircut while preventing the smaller tenant from getting one. + # To prevent the smaller from getting a haircut, we set min_resident_size to its current size. + # To ensure the larger tenant is getting a haircut, any non-zero `target` will do. 
+ min_resident_size = du_by_timeline[small_tenant] + target = 1 + assert ( + du_by_timeline[large_tenant] > min_resident_size + ), "ensure the larger tenant will get a haircut" + ps_http.patch_tenant_config_client_side( + small_tenant[0], {"min_resident_size_override": min_resident_size} + ) + ps_http.patch_tenant_config_client_side( + large_tenant[0], {"min_resident_size_override": min_resident_size} + ) + + # Make the large tenant more-recently used. An incorrect implementation would try to evict + # the smaller tenant completely first, before turning to the larger tenant, + # since the smaller tenant's layers are least-recently-used. + env.warm_up_tenant(large_tenant[0]) + + # do one run + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + time.sleep(1) # give log time to flush + assert not env.neon_env.pageserver.log_contains( + GLOBAL_LRU_LOG_LINE, + ), "this test is pointless if it fell back to global LRU" + + (later_total_on_disk, _, _) = env.timelines_du() + later_du_by_timeline = env.du_by_timeline() + log.info("later_du_by_timeline: %s", later_du_by_timeline) + + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + assert ( + response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change + ), "report accurately evicted bytes" + assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" + + assert ( + later_du_by_timeline[small_tenant] == du_by_timeline[small_tenant] + ), "small tenant sees no haircut" + assert ( + later_du_by_timeline[large_tenant] < du_by_timeline[large_tenant] + ), "large tenant gets a haircut" + assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target + + +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): + """ + If we can't relieve pressure
using tenant_min_resident_size-respecting eviction, + we should continue to evict layers following global LRU. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + target = total_on_disk + + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + + time.sleep(1) # give log time to flush + assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + + +def test_partial_evict_tenant(eviction_env: EvictionEnv): + """ + Warm up a tenant, then build up pressure to cause evictions in both. + We expect + * the default min resident size to be respected (largest layer file size) + * the warmed-up tenant's layers above min resident size to be evicted after the cold tenant's. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + du_by_timeline = env.du_by_timeline() + + # pick any tenant + [our_tenant, other_tenant] = list(du_by_timeline.keys()) + (tenant_id, timeline_id) = our_tenant + + # make our tenant more recently used than the other one + env.warm_up_tenant(tenant_id) + + # Build up enough pressure to require evictions from both tenants, + # but not enough to fall into global LRU.
+ # So, set target to all occupied space, except 2*env.layer_size per tenant + target = ( + du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size + ) + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + + later_du_by_timeline = env.du_by_timeline() + for tenant, later_tenant_usage in later_du_by_timeline.items(): + assert ( + later_tenant_usage < du_by_timeline[tenant] + ), "all tenants should have lost some layers" + + assert ( + later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant] + ), "our warmed up tenant should be at about half capacity, part 1" + assert ( + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. + # So, check for up to 3 here. + later_du_by_timeline[our_tenant] + < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size + ), "our warmed up tenant should be at about half capacity, part 2" + assert ( + later_du_by_timeline[other_tenant] < 2 * env.layer_size + ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size" + + +def poor_mans_du( + env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]] +) -> Tuple[int, int, int]: + """ + Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples; + this could be done over layers endpoint just as well.
+ """ + total_on_disk = 0 + largest_layer = 0 + smallest_layer = None + for tenant_id, timeline_id in timelines: + dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + assert dir.exists(), f"timeline dir does not exist: {dir}" + sum = 0 + for file in dir.iterdir(): + if "__" not in file.name: + continue + size = file.stat().st_size + sum += size + largest_layer = max(largest_layer, size) + if smallest_layer: + smallest_layer = min(smallest_layer, size) + else: + smallest_layer = size + log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}") + + log.info(f"{tenant_id}/{timeline_id}: sum {sum}") + total_on_disk += sum + + assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0 + return (total_on_disk, largest_layer, smallest_layer or 0) + + +def test_statvfs_error_handling(eviction_env: EvictionEnv): + """ + We should log an error that statvfs fails. + """ + env = eviction_env + env.neon_env.pageserver.stop() + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=90, + min_avail_bytes=0, + mock_behavior={ + "type": "Failure", + "mocked_error": "EIO", + }, + ) + + assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") + env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO") + + +def test_statvfs_pressure_usage(eviction_env: EvictionEnv): + """ + If statvfs data shows 100% usage, the eviction task will drive it down to + the configured max_usage_pct. 
+ """ + env = eviction_env + + env.neon_env.pageserver.stop() + + # make it seem like we're at 100% utilization by setting total bytes to the used bytes + total_size, _, _ = env.timelines_du() + blocksize = 512 + total_blocks = (total_size + (blocksize - 1)) // blocksize + + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=33, + min_avail_bytes=0, + mock_behavior={ + "type": "Success", + "blocksize": blocksize, + "total_blocks": total_blocks, + # Only count layer files towards used bytes in the mock_statvfs. + # This avoids accounting for metadata files & tenant conf in the tests. + "name_filter": ".*__.*", + }, + ) + + def relieved_log_message(): + assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + + wait_until(10, 1, relieved_log_message) + + post_eviction_total_size, _, _ = env.timelines_du() + + assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage" + + +def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): + """ + If statvfs data shows 100% usage, the eviction task will drive it down to + at least the configured min_avail_bytes. + """ + env = eviction_env + + env.neon_env.pageserver.stop() + + # make it seem like we're at 100% utilization by setting total bytes to the used bytes + total_size, _, _ = env.timelines_du() + blocksize = 512 + total_blocks = (total_size + (blocksize - 1)) // blocksize + + min_avail_bytes = total_size // 3 + + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=100, + min_avail_bytes=min_avail_bytes, + mock_behavior={ + "type": "Success", + "blocksize": blocksize, + "total_blocks": total_blocks, + # Only count layer files towards used bytes in the mock_statvfs. + # This avoids accounting for metadata files & tenant conf in the tests. 
+ "name_filter": ".*__.*", + }, + ) + + def relieved_log_message(): + assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + + wait_until(10, 1, relieved_log_message) + + post_eviction_total_size, _, _ = env.timelines_du() + + assert ( + total_size - post_eviction_total_size >= min_avail_bytes + ), "we requested at least min_avail_bytes worth of free space" From 271f6a6e99a46667486d7a4450abf75a6b8120c2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 Mar 2023 11:58:51 +0400 Subject: [PATCH 216/426] Always sync-safekeepers in neon_local on compute start. Instead of checking neon.safekeepers GUC value in existing pg node data dir, just always run sync-safekeepers when safekeepers are configured. Without this change, creation of new compute didn't run it. That's ok for new timeline/branch (it doesn't return anything useful anyway, and LSN is known by pageserver), but restart of compute for existing timeline bore the risk of getting basebackup not on the latest LSN, i.e. basically broken -- it might not have prev_lsn, and even if it had, walproposer would complain anyway. fixes https://github.com/neondatabase/neon/issues/2963 --- control_plane/src/compute.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index ee504bfaa6..bc81107706 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -90,7 +90,6 @@ impl ComputeControlPlane { timeline_id, lsn, tenant_id, - uses_wal_proposer: false, pg_version, }); @@ -115,7 +114,6 @@ pub struct PostgresNode { pub timeline_id: TimelineId, pub lsn: Option, // if it's a read-only node. 
None for primary pub tenant_id: TenantId, - uses_wal_proposer: bool, pg_version: u32, } @@ -149,7 +147,6 @@ impl PostgresNode { let port: u16 = conf.parse_field("port", &context)?; let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; - let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); // Read postgres version from PG_VERSION file to determine which postgres version binary to use. // If it doesn't exist, assume broken data directory and use default pg version. @@ -172,7 +169,6 @@ impl PostgresNode { timeline_id, lsn: recovery_target_lsn, tenant_id, - uses_wal_proposer, pg_version, }) } @@ -364,7 +360,7 @@ impl PostgresNode { fn load_basebackup(&self, auth_token: &Option) -> Result<()> { let backup_lsn = if let Some(lsn) = self.lsn { Some(lsn) - } else if self.uses_wal_proposer { + } else if !self.env.safekeepers.is_empty() { // LSN 0 means that it is bootstrap and we need to download just // latest data from the pageserver. That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again From d0711d08966542147596719ecec5eaf53c1f1b44 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 31 Mar 2023 16:05:15 +0300 Subject: [PATCH 217/426] build: fix git perms for deploy job (#3921) copy pasted from `build-neon` job. it is interesting that this is only needed by `build-neon` and `deploy`. Fixes: https://github.com/neondatabase/neon/actions/runs/4568077915/jobs/8070960178 which seems to have been going for a while. 
--- .github/workflows/build_and_test.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 52e1d94e9b..8482341b0c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -898,6 +898,16 @@ jobs: needs: [ push-docker-hub, tag, regress-tests ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Checkout uses: actions/checkout@v3 with: From 22f9ea5fe25b9a0782bc8c6ddca6ebdba89c8d73 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 31 Mar 2023 16:11:34 +0300 Subject: [PATCH 218/426] Remind people to clean up merge commit message in PR template (#3920) --- .github/pull_request_template.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 3f32b80ca8..816c5ee711 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -3,8 +3,12 @@ ## Issue ticket number and link ## Checklist before requesting a review + - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
+## Checklist before merging + +- [ ] Do not forget to reformat commit message to not include the above checklist From d2aa31f0ce687c3b2bc82c5f3db67dccbd5083cf Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 31 Mar 2023 18:25:53 +0200 Subject: [PATCH 219/426] fix pageserver_evictions_with_low_residence_duration metric (#3925) It was doing the comparison in the wrong way. --- pageserver/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6cb245aed7..1f31e5a8fb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -257,7 +257,7 @@ impl EvictionsWithLowResidenceDuration { } pub fn observe(&self, observed_value: Duration) { - if self.threshold < observed_value { + if observed_value < self.threshold { self.counter .as_ref() .expect("nobody calls this function after `remove_from_vec`") From 75ffe34b1734c1415b6cc6c998bc8c099409ad58 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 31 Mar 2023 19:45:59 +0100 Subject: [PATCH 220/426] check-macos-build: fix cache key (#3926) We don't have `${{ matrix.build_type }}` there, so it gets resolved to an empty substring and looks like this [`v1-macOS--pg-f8a650e49b06d39ad131b860117504044b01f312-dcccd010ff851b9f72bb451f28243fa3a341f07028034bbb46ea802413b36d80`](https://github.com/neondatabase/neon/actions/runs/4575422427/jobs/8078231907#step:26:2) --- .github/workflows/neon_extra_builds.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 2ae517e5e7..ef4c293e31 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,14 +53,14 @@ jobs: uses: actions/cache@v3 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ 
env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v3 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS run: | From 814abd9f844561e4201f2b1c781867381f388a8a Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Sun, 2 Apr 2023 11:32:27 +0300 Subject: [PATCH 221/426] Switch to safekeeper in the same AZ (#3883) Add a condition to switch walreceiver connection to safekeeper that is located in the same availability zone. Switch happens when commit_lsn of a candidate is not less than commit_lsn from the active connection. This condition is expected not to trigger instantly, because commit_lsn of a current connection is usually greater than commit_lsn of updates from the broker. That means that if WAL is written continuously, switch can take a lot of time, but it should happen eventually. Now protoc 3.15+ is required for building neon. Fixes https://github.com/neondatabase/neon/issues/3200 --- README.md | 2 + .../walreceiver/connection_manager.rs | 123 +++++++++++++++--- safekeeper/src/http/routes.rs | 1 + safekeeper/src/timeline.rs | 1 + storage_broker/benches/rps.rs | 1 + storage_broker/proto/broker.proto | 4 +- storage_broker/src/bin/storage_broker.rs | 1 + 7 files changed, 117 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 43f3e3a02b..55df67f6c7 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ pacman -S base-devel readline zlib libseccomp openssl clang \ postgresql-libs cmake postgresql protobuf ``` +Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). 
If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). + 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 0c770136db..de07676ffe 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -237,11 +237,7 @@ async fn connection_manager_loop_step( if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { info!("Switching to new connection candidate: {new_candidate:?}"); walreceiver_state - .change_connection( - new_candidate.safekeeper_id, - new_candidate.wal_source_connconf, - ctx, - ) + .change_connection(new_candidate, ctx) .await } } @@ -346,6 +342,8 @@ struct WalConnection { started_at: NaiveDateTime, /// Current safekeeper pageserver is connected to for WAL streaming. sk_id: NodeId, + /// Availability zone of the safekeeper. + availability_zone: Option, /// Status of the connection. status: WalConnectionStatus, /// WAL streaming task handle. @@ -405,12 +403,7 @@ impl WalreceiverState { } /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. 
- async fn change_connection( - &mut self, - new_sk_id: NodeId, - new_wal_source_connconf: PgConnectionConfig, - ctx: &RequestContext, - ) { + async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { self.drop_old_connection(true).await; let id = self.id; @@ -424,7 +417,7 @@ impl WalreceiverState { async move { super::walreceiver_connection::handle_walreceiver_connection( timeline, - new_wal_source_connconf, + new_sk.wal_source_connconf, events_sender, cancellation, connect_timeout, @@ -433,13 +426,16 @@ impl WalreceiverState { .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) + .instrument( + info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id), + ) }); let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { started_at: now, - sk_id: new_sk_id, + sk_id: new_sk.safekeeper_id, + availability_zone: new_sk.availability_zone, status: WalConnectionStatus { is_connected: false, has_processed_wal: false, @@ -546,6 +542,7 @@ impl WalreceiverState { /// * if connected safekeeper is not present, pick the candidate /// * if we haven't received any updates for some time, pick the candidate /// * if the candidate commit_lsn is much higher than the current one, pick the candidate + /// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate /// /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. 
@@ -559,6 +556,7 @@ impl WalreceiverState { let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(Some(connected_sk_node))?; + let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone(); let now = Utc::now().naive_utc(); if let Ok(latest_interaciton) = @@ -569,6 +567,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::NoKeepAlives { last_keep_alive: Some( existing_wal_connection.status.latest_connection_update, @@ -594,6 +593,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::LaggingWal { current_commit_lsn, new_commit_lsn, @@ -601,6 +601,20 @@ impl WalreceiverState { }, }); } + // If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver, + // and the current one is not, switch to the new one. 
+ if self.availability_zone.is_some() + && existing_wal_connection.availability_zone + != self.availability_zone + && self.availability_zone == new_availability_zone + { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + availability_zone: new_availability_zone, + wal_source_connconf: new_wal_source_connconf, + reason: ReconnectReason::SwitchAvailabilityZone, + }); + } } None => debug!( "Best SK candidate has its commit_lsn behind connected SK's commit_lsn" @@ -668,6 +682,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::NoWalTimeout { current_lsn, current_commit_lsn, @@ -686,10 +701,11 @@ impl WalreceiverState { self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal; } None => { - let (new_sk_id, _, new_wal_source_connconf) = + let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(None)?; return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, + availability_zone: new_safekeeper_broker_data.availability_zone.clone(), wal_source_connconf: new_wal_source_connconf, reason: ReconnectReason::NoExistingConnection, }); @@ -794,6 +810,7 @@ impl WalreceiverState { struct NewWalConnectionCandidate { safekeeper_id: NodeId, wal_source_connconf: PgConnectionConfig, + availability_zone: Option, // This field is used in `derive(Debug)` only. 
#[allow(dead_code)] reason: ReconnectReason, @@ -808,6 +825,7 @@ enum ReconnectReason { new_commit_lsn: Lsn, threshold: NonZeroU64, }, + SwitchAvailabilityZone, NoWalTimeout { current_lsn: Lsn, current_commit_lsn: Lsn, @@ -873,6 +891,7 @@ mod tests { peer_horizon_lsn: 0, local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), + availability_zone: None, }, latest_update, } @@ -933,6 +952,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1095,6 +1115,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1160,6 +1181,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1222,6 +1244,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { @@ -1289,4 +1312,74 @@ mod tests { availability_zone: None, } } + + #[tokio::test] + async fn switch_to_same_availability_zone() -> anyhow::Result<()> { + // Pageserver and one of safekeepers will be in the same availability zone + // and pageserver should prefer to connect to it. 
+ let test_az = Some("test_az".to_owned()); + + let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let mut state = dummy_state(&harness).await; + state.availability_zone = test_az.clone(); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let connected_sk_id = NodeId(0); + + let connection_status = WalConnectionStatus { + is_connected: true, + has_processed_wal: true, + latest_connection_update: now, + latest_wal_update: now, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; + + state.wal_connection = Some(WalConnection { + started_at: now, + sk_id: connected_sk_id, + availability_zone: None, + status: connection_status, + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskStateUpdate::Progress(connection_status)) + .ok(); + Ok(()) + }), + discovered_new_wal: None, + }); + + // We have another safekeeper with the same commit_lsn, and it have the same availability zone as + // the current pageserver. + let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); + same_az_sk.timeline.availability_zone = test_az.clone(); + + state.wal_stream_candidates = HashMap::from([ + ( + connected_sk_id, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), + ), + (NodeId(1), same_az_sk), + ]); + + // We expect that pageserver will switch to the safekeeper in the same availability zone, + // even if it has the same commit_lsn. 
+ let next_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(next_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + next_candidate.reason, + ReconnectReason::SwitchAvailabilityZone, + "Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn" + ); + assert_eq!( + next_candidate.wal_source_connconf.host(), + &Host::Domain("same_az".to_owned()) + ); + + Ok(()) + } } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 14badebd95..cdec45c148 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -242,6 +242,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result, n_keys: u64) { peer_horizon_lsn: 5, safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(), local_start_lsn: 0, + availability_zone: None, }; counter += 1; yield info; diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index 1a46896d02..4b2de1a8e5 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -36,9 +36,11 @@ message SafekeeperTimelineInfo { uint64 local_start_lsn = 9; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; + // Availability zone of a safekeeper. 
+ optional string availability_zone = 11; } message TenantTimelineId { bytes tenant_id = 1; bytes timeline_id = 2; -} \ No newline at end of file +} diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 57f975b0df..d7ace28426 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -525,6 +525,7 @@ mod tests { peer_horizon_lsn: 5, safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(), local_start_lsn: 0, + availability_zone: None, } } From d733bc54b8b2aa904cf0192359c0c7d6f986fe8d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 29 Mar 2023 22:02:36 +0400 Subject: [PATCH 222/426] Rename ReplicationFeedback and its fields. This is the feedback originating from pageserver, so change previous confusing names to s/ReplicationFeedback/PageserverFeedback s/ps_writelsn/last_received_lsn s/ps_flushlsn/disk_consistent_lsn s/ps_applylsn/remote_consistent_lsn I haven't changed the on-the-wire format to keep compatibility. However, understanding of new field names is added to compute, so once all computes receive this patch we can change the wire names as well. Safekeepers/pageservers are deployed roughly at the same time and it is ok to live without feedbacks during the short period, so this is not a problem there.
--- libs/pq_proto/src/lib.rs | 90 ++++++++++-------- .../walreceiver/walreceiver_connection.rs | 18 ++-- pgxn/neon/walproposer.c | 94 +++++++++---------- pgxn/neon/walproposer.h | 24 ++--- safekeeper/src/metrics.rs | 25 +++-- safekeeper/src/safekeeper.rs | 8 +- safekeeper/src/send_wal.rs | 8 +- safekeeper/src/timeline.rs | 12 +-- 8 files changed, 142 insertions(+), 137 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 656c0ff312..a976e19029 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -936,35 +936,40 @@ impl<'a> BeMessage<'a> { } } -// Neon extension of postgres replication protocol -// See NEON_STATUS_UPDATE_TAG_BYTE +/// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// Serialized in custom flexible key/value format. In replication protocol, it +/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres +/// Standby status update / Hot standby feedback messages. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub struct ReplicationFeedback { - // Last known size of the timeline. Used to enforce timeline size limit. +pub struct PageserverFeedback { + /// Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, - // Parts of StandbyStatusUpdate we resend to compute via safekeeper - pub ps_writelsn: u64, - pub ps_applylsn: u64, - pub ps_flushlsn: u64, - pub ps_replytime: SystemTime, + /// LSN last received and ingested by the pageserver. + pub last_received_lsn: u64, + /// LSN up to which data is persisted by the pageserver to its local disc. + pub disk_consistent_lsn: u64, + /// LSN up to which data is persisted by the pageserver on s3; safekeepers + /// consider WAL before it can be removed. + pub remote_consistent_lsn: u64, + pub replytime: SystemTime, } -// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback. 
+// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. // Do not remove previously available fields because this might be backwards incompatible. -pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5; +pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; -impl ReplicationFeedback { - pub fn empty() -> ReplicationFeedback { - ReplicationFeedback { +impl PageserverFeedback { + pub fn empty() -> PageserverFeedback { + PageserverFeedback { current_timeline_size: 0, - ps_writelsn: 0, - ps_applylsn: 0, - ps_flushlsn: 0, - ps_replytime: SystemTime::now(), + last_received_lsn: 0, + remote_consistent_lsn: 0, + disk_consistent_lsn: 0, + replytime: SystemTime::now(), } } - // Serialize ReplicationFeedback using custom format + // Serialize PageserverFeedback using custom format // to support protocol extensibility. // // Following layout is used: @@ -974,24 +979,26 @@ impl ReplicationFeedback { // null-terminated string - key, // uint32 - value length in bytes // value itself + // + // TODO: change serialized fields names once all computes migrate to rename. 
pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys + buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_writelsn); + buf.put_u64(self.last_received_lsn); buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_flushlsn); + buf.put_u64(self.disk_consistent_lsn); buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_applylsn); + buf.put_u64(self.remote_consistent_lsn); let timestamp = self - .ps_replytime + .replytime .duration_since(*PG_EPOCH) .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; @@ -1001,9 +1008,10 @@ impl ReplicationFeedback { buf.put_i64(timestamp); } - // Deserialize ReplicationFeedback message - pub fn parse(mut buf: Bytes) -> ReplicationFeedback { - let mut rf = ReplicationFeedback::empty(); + // Deserialize PageserverFeedback message + // TODO: change serialized fields names once all computes migrate to rename. 
+ pub fn parse(mut buf: Bytes) -> PageserverFeedback { + let mut rf = PageserverFeedback::empty(); let nfields = buf.get_u8(); for _ in 0..nfields { let key = read_cstr(&mut buf).unwrap(); @@ -1016,39 +1024,39 @@ impl ReplicationFeedback { b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_writelsn = buf.get_u64(); + rf.last_received_lsn = buf.get_u64(); } b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_flushlsn = buf.get_u64(); + rf.disk_consistent_lsn = buf.get_u64(); } b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_applylsn = buf.get_u64(); + rf.remote_consistent_lsn = buf.get_u64(); } b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); if raw_time > 0 { - rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); } else { - rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } _ => { let len = buf.get_i32(); warn!( - "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.", + "PageserverFeedback parse. unknown key {} of len {len}. Skip it.", String::from_utf8_lossy(key.as_ref()) ); buf.advance(len as usize); } } } - trace!("ReplicationFeedback parsed is {:?}", rf); + trace!("PageserverFeedback parsed is {:?}", rf); rf } } @@ -1059,33 +1067,33 @@ mod tests { #[test] fn test_replication_feedback_serialization() { - let mut rf = ReplicationFeedback::empty(); + let mut rf = PageserverFeedback::empty(); // Fill rf with some values rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. 
- rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); rf.serialize(&mut data); - let rf_parsed = ReplicationFeedback::parse(data.freeze()); + let rf_parsed = PageserverFeedback::parse(data.freeze()); assert_eq!(rf, rf_parsed); } #[test] fn test_replication_feedback_unknown_key() { - let mut rf = ReplicationFeedback::empty(); + let mut rf = PageserverFeedback::empty(); // Fill rf with some values rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { - *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; + *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; } data.put_slice(b"new_field_one\0"); @@ -1093,7 +1101,7 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let rf_parsed = ReplicationFeedback::parse(data.freeze()); + let rf_parsed = PageserverFeedback::parse(data.freeze()); assert_eq!(rf, rf_parsed); } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 9398a7bee9..ea2f2392ea 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -37,7 +37,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use pq_proto::ReplicationFeedback; +use pq_proto::PageserverFeedback; use utils::lsn::Lsn; /// Status of the 
connection. @@ -319,12 +319,12 @@ pub async fn handle_walreceiver_connection( timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. - let write_lsn = u64::from(last_lsn); + let last_received_lsn = u64::from(last_lsn); // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); + let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let apply_lsn = u64::from(timeline_remote_consistent_lsn); + let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn); let ts = SystemTime::now(); // Update the status about what we just received. This is shown in the mgmt API. @@ -343,12 +343,12 @@ pub async fn handle_walreceiver_connection( let (timeline_logical_size, _) = timeline .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; - let status_update = ReplicationFeedback { + let status_update = PageserverFeedback { current_timeline_size: timeline_logical_size, - ps_writelsn: write_lsn, - ps_flushlsn: flush_lsn, - ps_applylsn: apply_lsn, - ps_replytime: ts, + last_received_lsn, + disk_consistent_lsn, + remote_consistent_lsn, + replytime: ts, }; debug!("neon_status_update {status_update:?}"); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index b0b2a23e3c..45037a8c01 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1872,9 +1872,9 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } -/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ void 
-ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf) +ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf) { uint8 nkeys; int i; @@ -1892,45 +1892,45 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", rf->currentClusterSize); } - else if (strcmp(key, "ps_writelsn") == 0) + else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); + rf->last_received_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } - else if (strcmp(key, "ps_flushlsn") == 0) + else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); + rf->disk_consistent_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } - else if (strcmp(key, "ps_applylsn") == 0) + else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn 
%X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); + rf->remote_consistent_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } - else if (strcmp(key, "ps_replytime") == 0) + else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_replytime = pq_getmsgint64(reply_message); + rf->replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); + replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); + elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1944,7 +1944,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * * Skip unknown keys to support backward compatibile protocol * changes */ - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -2024,7 +2024,7 @@ GetAcknowledgedByQuorumWALPosition(void) } /* - * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + * WalproposerShmemSize --- report amount of shared memory space needed */ Size WalproposerShmemSize(void) @@ -2054,10 +2054,10 @@ WalproposerShmemInit(void) } void -replication_feedback_set(ReplicationFeedback * rf) +replication_feedback_set(PageserverFeedback * rf) { SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); 
SpinLockRelease(&walprop_shared->mutex); } @@ -2065,43 +2065,43 @@ void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.ps_writelsn; - *flushLsn = walprop_shared->feedback.ps_flushlsn; - *applyLsn = walprop_shared->feedback.ps_applylsn; + *writeLsn = walprop_shared->feedback.last_received_lsn; + *flushLsn = walprop_shared->feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } /* - * Get ReplicationFeedback fields from the most advanced safekeeper + * Get PageserverFeedback fields from the most advanced safekeeper */ static void -GetLatestNeonFeedback(ReplicationFeedback * rf) +GetLatestNeonFeedback(PageserverFeedback * rf) { int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + XLogRecPtr last_received_lsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; i++) { - if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) { latest_safekeeper = i; - ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn; } } rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; - rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; - rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; - rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; + rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; + rf->remote_consistent_lsn = 
safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; + rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime; elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); replication_feedback_set(rf); } @@ -2115,16 +2115,16 @@ HandleSafekeeperResponse(void) XLogRecPtr minFlushLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; if (!syncSafekeepers) { - /* Get ReplicationFeedback fields from the most advanced safekeeper */ + /* Get PageserverFeedback fields from the most advanced safekeeper */ GetLatestNeonFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { if (minQuorumLsn > quorumFeedback.flushLsn) @@ -2142,7 +2142,7 @@ HandleSafekeeperResponse(void) * apply_lsn - This is what processed and durably saved at* * pageserver. 
*/ - quorumFeedback.rf.ps_flushlsn, + quorumFeedback.rf.disk_consistent_lsn, GetCurrentTimestamp(), false); } @@ -2326,7 +2326,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); + ParsePageserverFeedbackMessage(&s, &msg->rf); pq_getmsgend(&s); return true; } @@ -2462,7 +2462,7 @@ backpressure_lag_impl(void) replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024 * 1024) - elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 537c733850..f016a229eb 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -280,21 +280,21 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; -typedef struct ReplicationFeedback +typedef struct PageserverFeedback { /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; - TimestampTz ps_replytime; -} ReplicationFeedback; + XLogRecPtr last_received_lsn; + XLogRecPtr disk_consistent_lsn; + XLogRecPtr remote_consistent_lsn; + TimestampTz replytime; +} PageserverFeedback; typedef struct WalproposerShmemState { slock_t mutex; - ReplicationFeedback feedback; + PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; } WalproposerShmemState; @@ -320,10 +320,10 @@ typedef struct AppendResponse /* Feedback recieved from pageserver includes 
standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - ReplicationFeedback rf; + PageserverFeedback rf; } AppendResponse; -/* ReplicationFeedback is extensible part of the message that is parsed separately */ +/* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) @@ -383,13 +383,13 @@ extern void WalProposerSync(int argc, char *argv[]); extern void WalProposerMain(Datum main_arg); extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); extern void WalProposerPoll(void); -extern void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); +extern void ParsePageserverFeedbackMessage(StringInfo reply_message, + PageserverFeedback *rf); extern void StartProposerReplication(StartReplicationCmd *cmd); extern Size WalproposerShmemSize(void); extern bool WalproposerShmemInit(void); -extern void replication_feedback_set(ReplicationFeedback *rf); +extern void replication_feedback_set(PageserverFeedback *rf); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index c3077b6dc5..2aaa17bfc5 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -255,7 +255,7 @@ pub struct TimelineCollector { epoch_start_lsn: GenericGaugeVec, peer_horizon_lsn: GenericGaugeVec, remote_consistent_lsn: GenericGaugeVec, - feedback_ps_write_lsn: GenericGaugeVec, + ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, @@ -339,15 +339,15 @@ impl TimelineCollector { .unwrap(); descs.extend(remote_consistent_lsn.desc().into_iter().cloned()); - let feedback_ps_write_lsn = 
GenericGaugeVec::new( + let ps_last_received_lsn = GenericGaugeVec::new( Opts::new( - "safekeeper_feedback_ps_write_lsn", + "safekeeper_ps_last_received_lsn", "Last LSN received by the pageserver, acknowledged in the feedback", ), &["tenant_id", "timeline_id"], ) .unwrap(); - descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned()); + descs.extend(ps_last_received_lsn.desc().into_iter().cloned()); let feedback_last_time_seconds = GenericGaugeVec::new( Opts::new( @@ -458,7 +458,7 @@ impl TimelineCollector { epoch_start_lsn, peer_horizon_lsn, remote_consistent_lsn, - feedback_ps_write_lsn, + ps_last_received_lsn, feedback_last_time_seconds, timeline_active, wal_backup_active, @@ -489,7 +489,7 @@ impl Collector for TimelineCollector { self.epoch_start_lsn.reset(); self.peer_horizon_lsn.reset(); self.remote_consistent_lsn.reset(); - self.feedback_ps_write_lsn.reset(); + self.ps_last_received_lsn.reset(); self.feedback_last_time_seconds.reset(); self.timeline_active.reset(); self.wal_backup_active.reset(); @@ -514,11 +514,11 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; - let mut most_advanced: Option = None; + let mut most_advanced: Option = None; for replica in tli.replicas.iter() { if let Some(replica_feedback) = replica.pageserver_feedback { if let Some(current) = most_advanced { - if current.ps_writelsn < replica_feedback.ps_writelsn { + if current.last_received_lsn < replica_feedback.last_received_lsn { most_advanced = Some(replica_feedback); } } else { @@ -568,11 +568,10 @@ impl Collector for TimelineCollector { .set(tli.wal_storage.flush_wal_seconds); if let Some(feedback) = most_advanced { - self.feedback_ps_write_lsn + self.ps_last_received_lsn .with_label_values(labels) - .set(feedback.ps_writelsn); - if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH) - { + .set(feedback.last_received_lsn); + if let Ok(unix_time) = 
feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) { self.feedback_last_time_seconds .with_label_values(labels) .set(unix_time.as_secs()); @@ -599,7 +598,7 @@ impl Collector for TimelineCollector { mfs.extend(self.epoch_start_lsn.collect()); mfs.extend(self.peer_horizon_lsn.collect()); mfs.extend(self.remote_consistent_lsn.collect()); - mfs.extend(self.feedback_ps_write_lsn.collect()); + mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); mfs.extend(self.timeline_active.collect()); mfs.extend(self.wal_backup_active.collect()); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d8fe36d7f8..10b4842cbd 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -18,7 +18,7 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use pq_proto::{ReplicationFeedback, SystemId}; +use pq_proto::{PageserverFeedback, SystemId}; use utils::{ bin_ser::LeSer, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -360,7 +360,7 @@ pub struct AppendResponse { // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub pageserver_feedback: ReplicationFeedback, + pub pageserver_feedback: PageserverFeedback, } impl AppendResponse { @@ -370,7 +370,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: ReplicationFeedback::empty(), + pageserver_feedback: PageserverFeedback::empty(), } } } @@ -708,7 +708,7 @@ where commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: ReplicationFeedback::empty(), + pageserver_feedback: PageserverFeedback::empty(), }; trace!("formed AppendResponse {:?}", ar); ar diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index b533e87c5b..a6ca89efa4 100644 --- 
a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -11,7 +11,7 @@ use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; +use pq_proto::{BeMessage, PageserverFeedback, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -319,11 +319,9 @@ impl ReplyReader { // pageserver sends this. // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. let buf = Bytes::copy_from_slice(&msg[9..]); - let reply = ReplicationFeedback::parse(buf); + let reply = PageserverFeedback::parse(buf); - trace!("ReplicationFeedback is {:?}", reply); - // Only pageserver sends ReplicationFeedback, so set the flag. - // This replica is the source of information to resend to compute. + trace!("PageserverFeedback is {:?}", reply); self.feedback.pageserver_feedback = Some(reply); self.tli diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 931062db1a..9dd8a63cf0 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, bail, Result}; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; -use pq_proto::ReplicationFeedback; +use pq_proto::PageserverFeedback; use serde::Serialize; use std::cmp::{max, min}; use std::path::PathBuf; @@ -91,7 +91,7 @@ pub struct ReplicaState { /// combined hot standby feedback from all replicas pub hs_feedback: HotStandbyFeedback, /// Replication specific feedback received from pageserver, if any - pub pageserver_feedback: Option, + pub pageserver_feedback: Option, } impl Default for ReplicaState { @@ -276,7 +276,7 @@ impl SharedState { // if let Some(pageserver_feedback) = state.pageserver_feedback { if let Some(acc_feedback) = acc.pageserver_feedback { - if 
acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn { + if acc_feedback.last_received_lsn < pageserver_feedback.last_received_lsn { warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet."); acc.pageserver_feedback = Some(pageserver_feedback); } @@ -287,12 +287,12 @@ impl SharedState { // last lsn received by pageserver // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. // See https://github.com/neondatabase/neon/issues/1171 - acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); + acc.last_received_lsn = Lsn::from(pageserver_feedback.last_received_lsn); // When at least one pageserver has preserved data up to remote_consistent_lsn, // safekeeper is free to delete it, so choose max of all pageservers. acc.remote_consistent_lsn = max( - Lsn::from(pageserver_feedback.ps_applylsn), + Lsn::from(pageserver_feedback.remote_consistent_lsn), acc.remote_consistent_lsn, ); } @@ -585,7 +585,7 @@ impl Timeline { let replica_state = shared_state.replicas[replica_id].unwrap(); let reported_remote_consistent_lsn = replica_state .pageserver_feedback - .map(|f| Lsn(f.ps_applylsn)) + .map(|f| Lsn(f.remote_consistent_lsn)) .unwrap_or(Lsn::INVALID); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. From cf5cfe6d718fa61ccd6d2e6d97d627a58ab2cd03 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Apr 2023 13:26:45 +0300 Subject: [PATCH 223/426] fix: metric used for alerting threshold on staging (#3932) This should remove the too eager alerts from staging. 
--- .github/ansible/staging.eu-west-1.hosts.yaml | 1 + .github/ansible/staging.us-east-2.hosts.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index e8d0bb1dc7..b634345c72 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,6 +8,7 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 # TODO: learn typical resident-size growth rate [GiB/minute] and configure diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 4ef51651fc..c1ceaa61ee 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,6 +8,7 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 # TODO: learn typical resident-size growth rate [GiB/minute] and configure From a415670bc34825cbee8d548bff94abe3630c57fa Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Apr 2023 14:15:41 +0300 Subject: [PATCH 224/426] feat: log evictions (#3930) this will help log analysis with the counterpart of already logging all remote download needs and downloads. ended up with a easily regexable output in the final round. 
--- pageserver/src/tenant/timeline.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b40cb05411..e80e32644b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1127,6 +1127,9 @@ impl Timeline { self.metrics .evictions_with_low_residence_duration .observe(delta); + info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period"); + } else { + info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period"); } true From 45bf76eb05944e1356ad0b7158e90a3d4502a2da Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 3 Apr 2023 14:57:36 +0200 Subject: [PATCH 225/426] enable layer eviction by default in prod (#3933) Leave disk_usage_based_eviction above the current max usage in prod (82%ish), so that deploying this commit won't trigger disk_usage_based_eviction. As indicated in the TODO, we'll decrease the value to 80% later. Also update the staging YAMLs to use the anchor syntax for `evictions_low_residence_duration_metric_threshold` like we do in the prod YAMLs as of this patch. 
--- .github/ansible/prod.ap-southeast-1.hosts.yaml | 10 ++++++++++ .github/ansible/prod.eu-central-1.hosts.yaml | 10 ++++++++++ .github/ansible/prod.us-east-2.hosts.yaml | 10 ++++++++++ .github/ansible/prod.us-west-2.hosts.yaml | 10 ++++++++++ .github/ansible/staging.eu-west-1.hosts.yaml | 8 ++------ .github/ansible/staging.us-east-2.hosts.yaml | 8 ++------ 6 files changed, 44 insertions(+), 12 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 8ccb67b04a..c185086eef 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index b3cd5de01c..0a0f974ea4 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + 
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 22c705e1cf..4427bb344e 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index f03e2d9435..53626b4f59 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 
b634345c72..34c8e77280 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,20 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min - evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 - # TODO: learn typical resident-size growth rate [GiB/minute] and configure - # min_avail_bytes such that we have X minutes of headroom. min_avail_bytes: 0 - # We assume that the worst-case growth rate is small enough that we can - # catch above-threshold conditions by checking every 10s. period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" period: "20m" - threshold: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index c1ceaa61ee..94f2be83a4 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,20 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min - evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 - # TODO: learn typical resident-size growth rate [GiB/minute] and configure - # min_avail_bytes such that we have X minutes of headroom. min_avail_bytes: 0 - # We assume that the worst-case growth rate is small enough that we can - # catch above-threshold conditions by checking every 10s. 
period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" period: "20m" - threshold: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" From f85a61ceac4c261d7e73449eb1198efd35a307e8 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 3 Apr 2023 17:55:16 +0300 Subject: [PATCH 226/426] [proxy] Fix regression in logging For some reason, `tracing::instrument` proc_macro doesn't always print elements specified via `fields()` or even show that it's impossible (e.g. there's no Display impl). Work around this using the `?foo` notation. Before: 2023-04-03T14:48:06.017504Z INFO handle_client:handshake: received SslRequest After: 2023-04-03T14:51:24.424176Z INFO handle_client{session_id=7bd07be8-3462-404e-8ccc-0a5332bf3ace}:handshake: received SslRequest --- proxy/src/auth/backend.rs | 2 +- proxy/src/proxy.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index b8599adaeb..18bc80d523 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -140,7 +140,7 @@ async fn auth_quirks( impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. - #[tracing::instrument(fields(allow_cleartext), skip_all)] + #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( &mut self, extra: &ConsoleReqExtra<'_>, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index efe0e8795b..03c9c72f30 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -98,7 +98,7 @@ pub async fn task_main( } // TODO(tech debt): unite this with its twin below. 
-#[tracing::instrument(fields(session_id), skip_all)] +#[tracing::instrument(fields(session_id = ?session_id), skip_all)] pub async fn handle_ws_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -140,7 +140,7 @@ pub async fn handle_ws_client( .await } -#[tracing::instrument(fields(session_id), skip_all)] +#[tracing::instrument(fields(session_id = ?session_id), skip_all)] async fn handle_client( config: &'static ProxyConfig, cancel_map: &CancelMap, From 846532112c7f01984ce1b8a8b19134744a0d1948 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 3 Apr 2023 23:44:38 +0300 Subject: [PATCH 227/426] Remove unused S3 list operation (#3936) In S3, pageserver only lists tenants (prefixes) on S3, no other keys. Remove the list operation from the API, since S3 impl does not seem to work normally and not used anyway, --- libs/remote_storage/src/lib.rs | 3 -- libs/remote_storage/src/local_fs.rs | 7 ++-- libs/remote_storage/src/s3_bucket.rs | 44 -------------------- libs/remote_storage/src/simulate_failures.rs | 7 ---- 4 files changed, 4 insertions(+), 57 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 1d50a777f4..5b74308514 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -78,9 +78,6 @@ impl RemotePath { /// providing basic CRUD operations for storage files. #[async_trait::async_trait] pub trait RemoteStorage: Send + Sync + 'static { - /// Lists all items the storage has right now. 
- async fn list(&self) -> anyhow::Result>; - /// Lists all top level subdirectories for a given prefix /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index f1289569ae..21a4156ad3 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -73,10 +73,8 @@ impl LocalFs { Ok(None) } } -} -#[async_trait::async_trait] -impl RemoteStorage for LocalFs { + #[cfg(test)] async fn list(&self) -> anyhow::Result> { Ok(get_all_files(&self.storage_root, true) .await? @@ -91,7 +89,10 @@ impl RemoteStorage for LocalFs { }) .collect()) } +} +#[async_trait::async_trait] +impl RemoteStorage for LocalFs { async fn list_prefixes( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d4eb7d9244..fdf3ae02d3 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -275,50 +275,6 @@ impl AsyncRead for RatelimitedAsyncRead { #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - async fn list(&self) -> anyhow::Result> { - let mut document_keys = Vec::new(); - - let mut continuation_token = None; - loop { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 list")?; - - metrics::inc_list_objects(); - - let fetch_response = self - .client - .list_objects_v2() - .bucket(self.bucket_name.clone()) - .set_prefix(self.prefix_in_bucket.clone()) - .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) - .set_continuation_token(continuation_token) - .set_max_keys(self.max_keys_per_list_response) - .send() - .await - .map_err(|e| { - metrics::inc_list_objects_fail(); - e - })?; - document_keys.extend( - fetch_response - 
.contents - .unwrap_or_default() - .into_iter() - .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))), - ); - - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), - None => break, - } - } - - Ok(document_keys) - } - /// See the doc for `RemoteStorage::list_prefixes` /// Note: it wont include empty "directories" async fn list_prefixes( diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 643bb99dce..d1d062f8e7 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -20,7 +20,6 @@ pub struct UnreliableWrapper { /// Used to identify retries of different unique operation. #[derive(Debug, Hash, Eq, PartialEq)] enum RemoteOp { - List, ListPrefixes(Option), Upload(RemotePath), Download(RemotePath), @@ -75,12 +74,6 @@ impl UnreliableWrapper { #[async_trait::async_trait] impl RemoteStorage for UnreliableWrapper { - /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result> { - self.attempt(RemoteOp::List)?; - self.inner.list().await - } - async fn list_prefixes( &self, prefix: Option<&RemotePath>, From 105b8bb9d36297e0c5bc119c87ed1e94ce18cbd5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 4 Apr 2023 12:21:54 +0100 Subject: [PATCH 228/426] test_runner: automatically rerun flaky tests (#3880) This PR adds a plugin that automatically reruns (up to 3 times) flaky tests. Internally, it uses data from `TEST_RESULT_CONNSTR` database and `pytest-rerunfailures` plugin. As the first approximation we consider the test flaky if it has failed on the main branch in the last 10 days. Flaky tests are fetched by `scripts/flaky_tests.py` script (it's possible to use it in a standalone mode to learn which tests are flaky), stored to a JSON file, and then the file is passed to the pytest plugin. 
--- .github/actions/allure-report/action.yml | 4 +- .../actions/run-python-test-set/action.yml | 12 +++ .github/workflows/build_and_test.yml | 3 + poetry.lock | 38 +++++--- pyproject.toml | 6 +- scripts/flaky_tests.py | 87 +++++++++++++++++++ test_runner/conftest.py | 1 + test_runner/fixtures/flaky.py | 58 +++++++++++++ test_runner/fixtures/utils.py | 2 +- 9 files changed, 195 insertions(+), 16 deletions(-) create mode 100755 scripts/flaky_tests.py create mode 100644 test_runner/fixtures/flaky.py diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index 2d4cabdde5..e685006245 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -76,8 +76,8 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.19.0 - ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464 + ALLURE_VERSION: 2.21.0 + ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c - name: Upload Allure results if: ${{ inputs.action == 'store' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 29b04a3478..11f5c78f19 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,10 @@ inputs: description: 'Secret access key' required: false default: '' + rerun_flaky: + description: 'Whether to rerun flaky tests' + required: false + default: 'false' runs: using: "composite" @@ -101,6 +105,7 @@ runs: COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') + RERUN_FLAKY: ${{ inputs.rerun_flaky }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report @@ -143,6 +148,13 @@ runs: EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR 
$EXTRA_PARAMS" fi + if [ "${RERUN_FLAKY}" == "true" ]; then + mkdir -p $TEST_OUTPUT + poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json" + + EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" + fi + if [[ "${{ inputs.build_type }}" == "debug" ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) elif [[ "${{ inputs.build_type }}" == "release" ]]; then diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8482341b0c..8c108e7f50 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -335,6 +335,9 @@ jobs: real_s3_region: us-west-2 real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" + rerun_flaky: true + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} - name: Merge and upload coverage data if: matrix.build_type == 'debug' diff --git a/poetry.lock b/poetry.lock index 011d5d7817..7b368cd3b4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. 
[[package]] name = "aiohttp" @@ -79,37 +79,35 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" -version = "2.10.0" +version = "2.13.1" description = "Allure pytest integration" category = "main" optional = false python-versions = "*" files = [ - {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"}, - {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"}, + {file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"}, + {file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"}, ] [package.dependencies] -allure-python-commons = "2.10.0" +allure-python-commons = "2.13.1" pytest = ">=4.5.0" -six = ">=1.9.0" [[package]] name = "allure-python-commons" -version = "2.10.0" +version = "2.13.1" description = "Common module for integrate allure with python-based frameworks" category = "main" optional = false -python-versions = ">=3.5" +python-versions = ">=3.6" files = [ - {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"}, - {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, + {file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"}, + {file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"}, ] [package.dependencies] attrs = ">=16.0.0" pluggy = ">=0.4.0" -six = ">=1.9.0" [[package]] name = "async-timeout" @@ -1932,6 +1930,22 @@ pytest = [ {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] +[[package]] +name = 
"pytest-rerunfailures" +version = "11.1.2" +description = "pytest plugin to re-run tests to eliminate flaky failures" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"}, + {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"}, +] + +[package.dependencies] +packaging = ">=17.1" +pytest = ">=5.3" + [[package]] name = "pytest-timeout" version = "2.1.0" @@ -2597,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91" +content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1" diff --git a/pyproject.toml b/pyproject.toml index f21c12b2e3..a51e91782e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" Werkzeug = "^2.2.3" pytest-order = "^1.0.1" -allure-pytest = "^2.10.0" +allure-pytest = "^2.13.1" pytest-asyncio = "^0.19.0" toml = "^0.10.2" psutil = "^5.9.4" @@ -34,6 +34,7 @@ types-psutil = "^5.9.5.4" types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" aiohttp = "3.7.4" +pytest-rerunfailures = "^11.1.2" [tool.poetry.group.dev.dependencies] black = "^23.1.0" @@ -69,6 +70,9 @@ strict = true module = [ "asyncpg.*", "pg8000.*", + "allure.*", + "allure_commons.*", + "allure_pytest.*", ] ignore_missing_imports = true diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py new file mode 100755 index 0000000000..829cc814e8 --- /dev/null +++ b/scripts/flaky_tests.py @@ -0,0 +1,87 @@ +#! 
/usr/bin/env python3 + +import argparse +import json +import logging +from collections import defaultdict +from typing import DefaultDict, Dict + +import psycopg2 +import psycopg2.extras + +# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days. +FLAKY_TESTS_QUERY = """ + SELECT + DISTINCT parent_suite, suite, test + FROM + ( + SELECT + revision, + jsonb_array_elements(data -> 'children') -> 'name' as parent_suite, + jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite, + jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test, + jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status, + to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp + FROM + regress_test_results + WHERE + reference = 'refs/heads/main' + ) data + WHERE + timestamp > CURRENT_DATE - INTERVAL '%s' day + AND status::text IN ('"failed"', '"broken"') + ; +""" + + +def main(args: argparse.Namespace): + connstr = args.connstr + interval_days = args.days + output = args.output + + res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]] + res = defaultdict(lambda: defaultdict(dict)) + + logging.info("connecting to the database...") + with psycopg2.connect(connstr, connect_timeout=10) as conn: + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + logging.info("fetching flaky tests...") + cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) + rows = cur.fetchall() + + for row in rows: + logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}") + res[row["parent_suite"]][row["suite"]][row["test"]] = True + + logging.info(f"saving results to {output.name}") + json.dump(res, output, indent=2) + + 
+if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days") + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="flaky.json", + help="path to output json file (default: flaky.json)", + ) + parser.add_argument( + "--days", + required=False, + default=10, + type=int, + help="how many days to look back for flaky tests (default: 10)", + ) + parser.add_argument( + "connstr", + help="connection string to the test results database", + ) + args = parser.parse_args() + + level = logging.INFO + logging.basicConfig( + format="%(message)s", + level=level, + ) + + main(args) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 8b7f6a2eea..75242b84ce 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -4,4 +4,5 @@ pytest_plugins = ( "fixtures.pg_stats", "fixtures.compare_fixtures", "fixtures.slow", + "fixtures.flaky", ) diff --git a/test_runner/fixtures/flaky.py b/test_runner/fixtures/flaky.py new file mode 100644 index 0000000000..9d7f8ead9a --- /dev/null +++ b/test_runner/fixtures/flaky.py @@ -0,0 +1,58 @@ +import json +from pathlib import Path +from typing import List + +import pytest +from _pytest.config import Config +from _pytest.config.argparsing import Parser +from allure_commons.types import LabelType +from allure_pytest.utils import allure_name, allure_suite_labels + +from fixtures.log_helper import log + +""" +The plugin reruns flaky tests. 
+It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py` + +Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers +""" + + +def pytest_addoption(parser: Parser): + parser.addoption( + "--flaky-tests-json", + action="store", + type=Path, + help="Path to json file with flaky tests generated by scripts/flaky_tests.py", + ) + + +def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]): + if not config.getoption("--flaky-tests-json"): + return + + # Any error with getting flaky tests aren't critical, so just do not rerun any tests + flaky_json = config.getoption("--flaky-tests-json") + if not flaky_json.exists(): + return + + content = flaky_json.read_text() + try: + flaky_tests = json.loads(content) + except ValueError: + log.error(f"Can't parse {content} as json") + return + + for item in items: + # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB) + # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100 + allure_labels = dict(allure_suite_labels(item)) + parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE)) + suite = str(allure_labels.get(LabelType.SUITE)) + params = item.callspec.params if hasattr(item, "callspec") else {} + name = allure_name(item, params) + + if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False): + # Rerun 3 times = 1 original run + 2 reruns + log.info(f"Marking {item.nodeid} as flaky. 
It will be rerun up to 3 times") + item.add_marker(pytest.mark.flaky(reruns=2)) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index ce03658e8f..1e15fea3c2 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -7,7 +7,7 @@ import time from pathlib import Path from typing import Any, Callable, Dict, List, Tuple, TypeVar -import allure # type: ignore +import allure from psycopg2.extensions import cursor from fixtures.log_helper import log From 1d23b5d1de0b1b373f507d2e9407e104a171bd48 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 4 Apr 2023 12:22:47 +0100 Subject: [PATCH 229/426] Comment PR with test results (#3907) This PR adds posting a comment with test results. Each workflow run updates the comment with new results. The layout and the information that we post can be changed to our needs, right now, it contains failed tests and test which changes status after rerun (i.e. flaky tests) --- .github/actions/allure-report/action.yml | 44 ++++++-- .github/workflows/build_and_test.yml | 82 +++++++++++---- scripts/pr-comment-test-report.js | 125 +++++++++++++++++++++++ 3 files changed, 222 insertions(+), 29 deletions(-) create mode 100644 scripts/pr-comment-test-report.js diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index e685006245..e35cbb20fd 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -15,10 +15,32 @@ outputs: report-url: description: 'Allure report URL' value: ${{ steps.generate-report.outputs.report-url }} + report-json-url: + description: 'Allure report JSON URL' + value: ${{ steps.generate-report.outputs.report-json-url }} runs: using: "composite" + steps: + # We're using some of env variables quite offen, so let's set them once. 
+ # + # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1] + # + # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv + # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456 + # + - name: Set common environment variables + shell: bash -euxo pipefail {0} + run: | + echo "BUILD_TYPE=${BUILD_TYPE}" >> $GITHUB_ENV + echo "BUCKET=${BUCKET}" >> $GITHUB_ENV + echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV + env: + BUILD_TYPE: ${{ inputs.build_type }} + BUCKET: neon-github-public-dev + TEST_OUTPUT: /tmp/test_output + - name: Validate input parameters shell: bash -euxo pipefail {0} run: | @@ -84,8 +106,6 @@ runs: env: REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - TEST_OUTPUT: /tmp/test_output - BUCKET: neon-github-public-dev TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} shell: bash -euxo pipefail {0} run: | @@ -104,7 +124,7 @@ runs: EOF cat < $TEST_OUTPUT/allure/results/environment.properties TEST_SELECTION=${{ inputs.test_selection }} - BUILD_TYPE=${{ inputs.build_type }} + BUILD_TYPE=${BUILD_TYPE} EOF ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" @@ -113,13 +133,12 @@ runs: tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd . 
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" - # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this + # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this - name: Acquire Allure lock if: ${{ inputs.action == 'generate' }} shell: bash -euxo pipefail {0} env: LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt - BUCKET: neon-github-public-dev TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} run: | LOCK_TIMEOUT=300 # seconds @@ -149,8 +168,6 @@ runs: env: REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - TEST_OUTPUT: /tmp/test_output - BUCKET: neon-github-public-dev shell: bash -euxo pipefail {0} run: | # Get previously uploaded data for this run @@ -186,24 +203,24 @@ runs: REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html # Generate redirect - cat < ./index.html + cat < ${TEST_OUTPUT}/allure/index.html Redirecting to ${REPORT_URL} EOF - aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" + aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT + echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT - name: Release Allure lock if: ${{ inputs.action == 'generate' && always() }} shell: bash -euxo pipefail {0} env: LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt - BUCKET: neon-github-public-dev TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} run: | aws s3 cp --only-show-errors 
"s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 @@ -212,11 +229,16 @@ runs: aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" fi + - name: Cleanup + if: always() + shell: bash -euxo pipefail {0} + run: | + rm -rf ${TEST_OUTPUT}/allure + - uses: actions/github-script@v6 if: ${{ inputs.action == 'generate' && always() }} env: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} - BUILD_TYPE: ${{ inputs.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: script: | diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8c108e7f50..68102bce84 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -374,42 +374,88 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones - merge-allure-report: + create-test-report: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init needs: [ regress-tests, benchmarks ] if: ${{ !cancelled() }} - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: Create Allure report - id: create-allure-report + steps: + - uses: actions/checkout@v3 + + - name: Create Allure report (debug) + if: ${{ !cancelled() }} + id: create-allure-report-debug uses: ./.github/actions/allure-report with: action: generate - build_type: ${{ matrix.build_type }} + build_type: debug + + - name: Create Allure report (release) + if: ${{ !cancelled() }} + id: create-allure-report-release + uses: ./.github/actions/allure-report + with: + action: generate + build_type: release + + - uses: actions/github-script@v6 + if: > + !cancelled() && + github.event_name == 'pull_request' && ( + steps.create-allure-report-debug.outputs.report-url || + 
steps.create-allure-report-release.outputs.report-url + ) + with: + script: | + const reports = [{ + buildType: "debug", + reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}", + jsonUrl: "${{ steps.create-allure-report-debug.outputs.report-json-url }}", + }, { + buildType: "release", + reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}", + jsonUrl: "${{ steps.create-allure-report-release.outputs.report-json-url }}", + }] + + const script = require("./scripts/pr-comment-test-report.js") + await script({ + github, + context, + fetch, + reports, + }) - name: Store Allure test stat in the DB - if: ${{ steps.create-allure-report.outputs.report-url }} + if: > + !cancelled() && ( + steps.create-allure-report-debug.outputs.report-url || + steps.create-allure-report-release.outputs.report-url + ) env: - BUILD_TYPE: ${{ matrix.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} - REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }} + REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }} + REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} run: | - curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json ./scripts/pysync - DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json + for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do + if [ -z "$report_url" ]; then + continue + fi + + if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then + BUILD_TYPE=debug + else + BUILD_TYPE=release + fi + + curl --fail --output suites.json "${report_url}" + DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} 
--ingest suites.json + done coverage-report: runs-on: [ self-hosted, gen3, small ] diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js new file mode 100644 index 0000000000..d2c5aebc4f --- /dev/null +++ b/scripts/pr-comment-test-report.js @@ -0,0 +1,125 @@ +// +// The script parses Allure reports and posts a comment with a summary of the test results to the PR. +// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated). +// +// The comment is updated on each run with the latest results. +// +// It is designed to be used with actions/github-script from GitHub Workflows: +// - uses: actions/github-script@v6 +// with: +// script: | +// const script = require("./scripts/pr-comment-test-report.js") +// await script({ +// github, +// context, +// fetch, +// reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties +// }) +// + +module.exports = async ({ github, context, fetch, reports }) => { + // Marker to find the comment in the subsequent runs + const startMarker = `` + // GitHub bot id taken from (https://api.github.com/users/github-actions[bot]) + const githubActionsBotId = 41898282 + // The latest commit in the PR URL + const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.sha}` + // Commend body itself + let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n` + + // Common parameters for GitHub API requests + const ownerRepoParams = { + owner: context.repo.owner, + repo: context.repo.repo, + } + + for (const report of reports) { + const {buildType, reportUrl, jsonUrl} = report + + if (!reportUrl || !jsonUrl) { + console.warn(`"reportUrl" or "jsonUrl" aren't set for ${buildType} build`) + continue + } + + const suites = await (await fetch(jsonUrl)).json() + + // Allure 
distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests. + // For this report it's ok to treat them in the same way (as failed). + failedTests = [] + passedTests = [] + skippedTests = [] + + retriedTests = [] + retriedStatusChangedTests = [] + + for (const parentSuite of suites.children) { + for (const suite of parentSuite.children) { + for (const test of suite.children) { + pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}` + test.pytestName = pytestName + + if (test.status === "passed") { + passedTests.push(test); + } else if (test.status === "failed" || test.status === "broken") { + failedTests.push(test); + } else if (test.status === "skipped") { + skippedTests.push(test); + } + + if (test.retriesCount > 0) { + retriedTests.push(test); + + if (test.retriedStatusChangedTests) { + retriedStatusChangedTests.push(test); + } + } + } + } + } + + const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length + commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} ([full report](${reportUrl}))\n` + if (failedTests.length > 0) { + commentBody += `Failed tests:\n` + for (const test of failedTests) { + const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` + + commentBody += `- [\`${test.pytestName}\`](${allureLink})` + if (test.retriesCount > 0) { + commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))` + } + commentBody += "\n" + } + commentBody += "\n" + } + if (retriedStatusChangedTests > 0) { + commentBody += `Flaky tests:\n` + for (const test of retriedStatusChangedTests) { + const status = test.status === "passed" ? 
":white_check_mark:" : ":x:" + commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n` + } + commentBody += "\n" + } + commentBody += "___\n" + } + + const { data: comments } = await github.rest.issues.listComments({ + issue_number: context.payload.number, + ...ownerRepoParams, + }) + + const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker)) + if (comment) { + await github.rest.issues.updateComment({ + comment_id: comment.id, + body: commentBody, + ...ownerRepoParams, + }) + } else { + await github.rest.issues.createComment({ + issue_number: context.payload.number, + body: commentBody, + ...ownerRepoParams, + }) + } +} From 957acb51b555b467551e43349847b9079f349cf2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 4 Apr 2023 17:06:10 +0100 Subject: [PATCH 230/426] GitHub Autocomment: Fix the link to the latest commit (#3952) --- scripts/pr-comment-test-report.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js index d2c5aebc4f..8df7248c4e 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/pr-comment-test-report.js @@ -23,7 +23,7 @@ module.exports = async ({ github, context, fetch, reports }) => { // GitHub bot id taken from (https://api.github.com/users/github-actions[bot]) const githubActionsBotId = 41898282 // The latest commit in the PR URL - const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.sha}` + const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}` // Commend body itself let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n` From c3ca48c62bad8ed070a041433e36d3d5913ebcb8 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: 
Wed, 5 Apr 2023 02:29:05 +0300 Subject: [PATCH 231/426] Support extra domain names for proxy. Make it possible to specify directory where proxy will look up for extra certificates. Proxy will iterate through subdirs of that directory and load `key.pem` and `cert.pem` files from each subdir. Certs directory structure may look like that: certs |--example.com | |--key.pem | |--cert.pem |--foo.bar |--key.pem |--cert.pem Actual domain names are taken from certs and key, subdir names are ignored. --- proxy/src/auth/backend/hacks.rs | 2 +- proxy/src/auth/credentials.rs | 103 ++++++++++++-------- proxy/src/config.rs | 161 +++++++++++++++++++++++++------- proxy/src/main.rs | 12 ++- proxy/src/proxy.rs | 8 +- proxy/src/proxy/tests.rs | 4 +- 6 files changed, 206 insertions(+), 84 deletions(-) diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index f710581cb2..d45806461e 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -53,7 +53,7 @@ pub async fn password_hack( .await?; info!(project = &payload.project, "received missing parameter"); - creds.project = Some(payload.project.into()); + creds.project = Some(payload.project); let mut node = api.wake_compute(extra, creds).await?; node.config.password(payload.password); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c556c33197..b21cd79ddf 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -2,7 +2,7 @@ use crate::error::UserFacingError; use pq_proto::StartupMessageParams; -use std::borrow::Cow; +use std::collections::HashSet; use thiserror::Error; use tracing::info; @@ -19,11 +19,10 @@ pub enum ClientCredsParseError { InconsistentProjectNames { domain: String, option: String }, #[error( - "SNI ('{}') inconsistently formatted with respect to common name ('{}'). 
\ - SNI should be formatted as '.{}'.", - .sni, .cn, .cn, + "Common name inferred from SNI ('{}') is not known", + .cn, )] - InconsistentSni { sni: String, cn: String }, + UnknownCommonName { cn: String }, #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(String), @@ -37,7 +36,7 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials<'a> { pub user: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. - pub project: Option>, + pub project: Option, } impl ClientCredentials<'_> { @@ -51,7 +50,7 @@ impl<'a> ClientCredentials<'a> { pub fn parse( params: &'a StartupMessageParams, sni: Option<&str>, - common_name: Option<&str>, + common_names: Option>, ) -> Result { use ClientCredsParseError::*; @@ -60,37 +59,43 @@ impl<'a> ClientCredentials<'a> { let user = get_param("user")?; // Project name might be passed via PG's command-line options. - let project_option = params.options_raw().and_then(|mut options| { - options - .find_map(|opt| opt.strip_prefix("project=")) - .map(Cow::Borrowed) - }); + let project_option = params + .options_raw() + .and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project="))) + .map(|name| name.to_string()); - // Alternative project name is in fact a subdomain from SNI. - // NOTE: we do not consider SNI if `common_name` is missing. 
- let project_domain = sni - .zip(common_name) - .map(|(sni, cn)| { - subdomain_from_sni(sni, cn) - .ok_or_else(|| InconsistentSni { - sni: sni.into(), - cn: cn.into(), + let project_from_domain = if let Some(sni_str) = sni { + if let Some(cn) = common_names { + let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain); + + let project = common_name_from_sni + .and_then(|domain| { + if cn.contains(domain) { + subdomain_from_sni(sni_str, domain) + } else { + None + } }) - .map(Cow::<'static, str>::Owned) - }) - .transpose()?; + .ok_or_else(|| UnknownCommonName { + cn: common_name_from_sni.unwrap_or("").into(), + })?; - let project = match (project_option, project_domain) { + Some(project) + } else { + None + } + } else { + None + }; + + let project = match (project_option, project_from_domain) { // Invariant: if we have both project name variants, they should match. (Some(option), Some(domain)) if option != domain => { - Some(Err(InconsistentProjectNames { - domain: domain.into(), - option: option.into(), - })) + Some(Err(InconsistentProjectNames { domain, option })) } // Invariant: project name may not contain certain characters. 
(a, b) => a.or(b).map(|name| match project_name_valid(&name) { - false => Err(MalformedProjectName(name.into())), + false => Err(MalformedProjectName(name)), true => Ok(name), }), } @@ -149,9 +154,9 @@ mod tests { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("foo.localhost"); - let common_name = Some("localhost"); + let common_names = Some(["localhost".into()].into()); - let creds = ClientCredentials::parse(&options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_names)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -177,24 +182,41 @@ mod tests { let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]); let sni = Some("baz.localhost"); - let common_name = Some("localhost"); + let common_names = Some(["localhost".into()].into()); - let creds = ClientCredentials::parse(&options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_names)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("baz")); Ok(()) } + #[test] + fn parse_multi_common_names() -> anyhow::Result<()> { + let options = StartupMessageParams::new([("user", "john_doe")]); + + let common_names = Some(["a.com".into(), "b.com".into()].into()); + let sni = Some("p1.a.com"); + let creds = ClientCredentials::parse(&options, sni, common_names)?; + assert_eq!(creds.project.as_deref(), Some("p1")); + + let common_names = Some(["a.com".into(), "b.com".into()].into()); + let sni = Some("p1.b.com"); + let creds = ClientCredentials::parse(&options, sni, common_names)?; + assert_eq!(creds.project.as_deref(), Some("p1")); + + Ok(()) + } + #[test] fn parse_projects_different() { let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]); let sni = Some("second.localhost"); - let common_name = Some("localhost"); + let common_names = Some(["localhost".into()].into()); - let err = 
ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -209,13 +231,12 @@ mod tests { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); - let common_name = Some("example.com"); + let common_names = Some(["example.com".into()].into()); - let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail"); match err { - InconsistentSni { sni, cn } => { - assert_eq!(sni, "project.localhost"); - assert_eq!(cn, "example.com"); + UnknownCommonName { cn } => { + assert_eq!(cn, "localhost"); } _ => panic!("bad error: {err:?}"), } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 600db7f8ec..9f6241d733 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,12 @@ use crate::auth; -use anyhow::{bail, ensure, Context}; -use std::{str::FromStr, sync::Arc, time::Duration}; +use anyhow::{bail, ensure, Context, Ok}; +use rustls::sign; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + sync::Arc, + time::Duration, +}; pub struct ProxyConfig { pub tls_config: Option, @@ -16,7 +22,7 @@ pub struct MetricCollectionConfig { pub struct TlsConfig { pub config: Arc, - pub common_name: Option, + pub common_names: Option>, } impl TlsConfig { @@ -26,28 +32,33 @@ impl TlsConfig { } /// Configure TLS for the main endpoint. 
-pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; +pub fn configure_tls( + key_path: &str, + cert_path: &str, + certs_dir: Option<&String>, +) -> anyhow::Result { + let mut cert_resolver = CertResolver::new(); - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() - }; + // add default certificate + cert_resolver.add_cert(key_path, cert_path)?; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + // add extra certificates + if let Some(certs_dir) = certs_dir { + for entry in std::fs::read_dir(certs_dir)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + let key_path = path.join("key.pem"); + let cert_path = path.join("cert.pem"); + if key_path.exists() && cert_path.exists() { + cert_resolver + .add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; + } + } + } + } - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? 
- .into_iter() - .map(rustls::Certificate) - .collect() - }; + let common_names = cert_resolver.get_common_names(); let config = rustls::ServerConfig::builder() .with_safe_default_cipher_suites() @@ -55,27 +66,105 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result>, +} + +impl CertResolver { + fn new() -> Self { + Self { + certs: HashMap::new(), + } + } + + fn add_cert(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + let priv_key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + keys.pop().map(rustls::PrivateKey).unwrap() + }; + + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .context(format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ))? + .into_iter() + .map(rustls::Certificate) + .collect() + }; + + let common_name = { + let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes) + .context(format!( + "Failed to parse PEM object from bytes from file at '{cert_path}'." + ))? + .1; + let common_name = pem.parse_x509()?.subject().to_string(); + common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + } + .context(format!( + "Failed to parse common name from certificate at '{cert_path}'." 
+ ))?; + + self.certs.insert( + common_name, + Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)), + ); + + Ok(()) + } + + fn get_common_names(&self) -> HashSet { + self.certs.keys().map(|s| s.to_string()).collect() + } +} + +impl rustls::server::ResolvesServerCert for CertResolver { + fn resolve( + &self, + _client_hello: rustls::server::ClientHello, + ) -> Option> { + // loop here and cut off more and more subdomains until we find + // a match to get a proper wildcard support. OTOH, we now do not + // use nested domains, so keep this simple for now. + // + // With the current coding foo.com will match *.foo.com and that + // repeats behavior of the old code. + if let Some(mut sni_name) = _client_hello.server_name() { + loop { + if let Some(cert) = self.certs.get(sni_name) { + return Some(cert.clone()); + } + if let Some((_, rest)) = sni_name.split_once('.') { + sni_name = rest; + } else { + return None; + } + } + } else { + None + } + } +} + /// Helper for cmdline cache options parsing. pub struct CacheOptions { /// Max number of entries. 
diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 85478da3bc..c6526e9aff 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -132,7 +132,11 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> args.get_one::<String>("tls-key"), args.get_one::<String>("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?), + (Some(key_path), Some(cert_path)) => Some(config::configure_tls( + key_path, + cert_path, + args.get_one::<String>("certs-dir"), + )?), (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; @@ -254,6 +258,12 @@ fn cli() -> clap::Command { .alias("ssl-cert") // backwards compatibility .help("path to TLS cert for client postgres connections"), ) + // tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + .arg( + Arg::new("certs-dir") + .long("certs-dir") + .help("path to directory with TLS certificates for client postgres connections"), + ) .arg( Arg::new("metric-collection-endpoint") .long("metric-collection-endpoint") diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 03c9c72f30..70fb25474e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -124,11 +124,11 @@ pub async fn handle_ws_client( // Extract credentials which we're going to use for auth. let creds = { - let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let common_names = tls.and_then(|tls| tls.common_names.clone()); let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name)) + .map(|_| auth::ClientCredentials::parse(&params, hostname, common_names)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -163,11 +163,11 @@ async fn handle_client( // Extract credentials which we're going to use for auth.
let creds = { let sni = stream.get_ref().sni_hostname(); - let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let common_names = tls.and_then(|tls| tls.common_names.clone()); let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(&params, sni, common_name)) + .map(|_| auth::ClientCredentials::parse(&params, sni, common_names)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index ed429df421..60acb588dc 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -54,9 +54,11 @@ fn generate_tls_config<'a>( .with_single_cert(vec![cert], key)? .into(); + let common_names = Some([common_name.to_owned()].iter().cloned().collect()); + TlsConfig { config, - common_name: Some(common_name.to_string()), + common_names, } }; From d8df5237fa3cb629c2afc6d65e960b10cd0fad8a Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 5 Apr 2023 20:56:52 +0300 Subject: [PATCH 232/426] Align extra certificate name with default cert-manager names --- proxy/src/config.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9f6241d733..5f9585149e 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -48,8 +48,9 @@ pub fn configure_tls( let entry = entry?; let path = entry.path(); if path.is_dir() { - let key_path = path.join("key.pem"); - let cert_path = path.join("cert.pem"); + // file names aligned with default cert-manager names + let key_path = path.join("tls.key"); + let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { cert_resolver .add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; From 9310949b44b7c9d5f89e6efa2322e6b841c9dafb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 5 Apr 2023 20:08:06 +0100 Subject: [PATCH 233/426] GitHub Autocomment: Retry on server errors (#3958) Retry posting/updating a comment in
case of 5XX errors from GitHub API --- .github/workflows/build_and_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 68102bce84..56c0aa8f9a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -409,6 +409,8 @@ jobs: steps.create-allure-report-release.outputs.report-url ) with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const reports = [{ buildType: "debug", From b17c24fa38c3985e8a03b1f839de2e5d831d1e3d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Apr 2023 12:47:21 +0300 Subject: [PATCH 234/426] fix: settle down to configured percent (#3947) in real env testing we noted that the disk-usage based eviction sails 1 percentage point above the configured value, which might be a source of confusion, so it might be better to get rid of that confusion now. confusion: "I configured 85% but pageserver sails at 86%". 
Co-authored-by: Christian Schwarz --- libs/utils/src/serde_percent.rs | 8 +++++ pageserver/src/disk_usage_eviction_task.rs | 41 +++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs index 63b62b5f1e..36e874a161 100644 --- a/libs/utils/src/serde_percent.rs +++ b/libs/utils/src/serde_percent.rs @@ -11,6 +11,14 @@ use serde::{Deserialize, Serialize}; pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); impl Percent { + pub const fn new(pct: u8) -> Option { + if pct <= 100 { + Some(Percent(pct)) + } else { + None + } + } + pub fn get(&self) -> u8 { self.0 } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index eeeb6fda89..f4a0f3f18e 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -639,7 +639,7 @@ mod filesystem_level_usage { ), ( "max_usage_pct", - usage_pct > self.config.max_usage_pct.get() as u64, + usage_pct >= self.config.max_usage_pct.get() as u64, ), ]; @@ -686,4 +686,43 @@ mod filesystem_level_usage { avail_bytes, }) } + + #[test] + fn max_usage_pct_pressure() { + use super::Usage as _; + use std::time::Duration; + use utils::serde_percent::Percent; + + let mut usage = Usage { + config: &DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(85).unwrap(), + min_avail_bytes: 0, + period: Duration::MAX, + #[cfg(feature = "testing")] + mock_statvfs: None, + }, + total_bytes: 100_000, + avail_bytes: 0, + }; + + assert!(usage.has_pressure(), "expected pressure at 100%"); + + usage.add_available_bytes(14_000); + assert!(usage.has_pressure(), "expected pressure at 86%"); + + usage.add_available_bytes(999); + assert!(usage.has_pressure(), "expected pressure at 85.001%"); + + usage.add_available_bytes(1); + assert!(usage.has_pressure(), "expected pressure at precisely 85%"); + + usage.add_available_bytes(1); + 
assert!(!usage.has_pressure(), "no pressure at 84.999%"); + + usage.add_available_bytes(999); + assert!(!usage.has_pressure(), "no pressure at 84%"); + + usage.add_available_bytes(16_000); + assert!(!usage.has_pressure()); + } } From 9db70f6232aa78823ebda8552b89b6bb1a61ee12 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Thu, 6 Apr 2023 14:02:56 +0400 Subject: [PATCH 235/426] Added disk_size and instance_type to payload (#3918) ## Describe your changes In https://github.com/neondatabase/cloud/issues/4354 we are making scheduling of projects based on available disk space and overcommit, so we need to know disk size and just in case instance type of the pageserver ## Issue ticket number and link https://github.com/neondatabase/cloud/issues/4354 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] ~If it is a core feature, I have added thorough tests.~ - [ ] ~Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?~ - [ ] ~If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.~ --- .github/ansible/scripts/init_pageserver.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh index e7d6efadae..d88f754a86 100644 --- a/.github/ansible/scripts/init_pageserver.sh +++ b/.github/ansible/scripts/init_pageserver.sh @@ -3,6 +3,8 @@ # fetch params from meta-data service INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) +INSTANCE_TYPE=$(curl -s http://169.254.169.254/latest/meta-data/instance-type) +DISK_SIZE=$(df -B1 /storage | tail -1 | awk '{print $2}') # store fqdn hostname in var HOST=$(hostname -f) @@ -18,7 +20,9 @@ cat < Date: Thu, 6 Apr 2023 12:53:58 +0200 Subject: [PATCH 236/426] Allow installation of 
`pg_stat_statements` --- Dockerfile.compute-node | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index ef861b15be..92a1bb69e5 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -38,6 +38,7 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ From 887cee64e26754c47be8d5a641f0923f95a616ee Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 6 Apr 2023 13:52:41 +0100 Subject: [PATCH 237/426] test_runner: add links to grafana for remote tests (#3961) Add Grafana links to allure reports to make it easier to debug perf test failures --- test_runner/fixtures/neon_fixtures.py | 10 ++++++- test_runner/fixtures/utils.py | 42 +++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a232bf8b6d..c24158e9ec 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -17,6 +17,7 @@ import uuid from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass, field +from datetime import datetime from enum import Flag, auto from functools import cached_property from itertools import chain, product @@ -48,6 +49,7 @@ from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, Fn, + allure_add_grafana_links, 
allure_attach_from_dir, get_self_dir, subprocess_capture, @@ -2436,10 +2438,16 @@ def remote_pg( connstr = os.getenv("BENCHMARK_CONNSTR") if connstr is None: raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") - + start_ms = int(datetime.utcnow().timestamp() * 1000) with RemotePostgres(pg_bin, connstr) as remote_pg: yield remote_pg + end_ms = int(datetime.utcnow().timestamp() * 1000) + host = parse_dsn(connstr).get("host", "") + if host.endswith(".neon.build"): + # Add 10s margin to the start and end times + allure_add_grafana_links(host, start_ms - 10_000, end_ms + 10_000) + class PSQL: """ diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 1e15fea3c2..b58539ca86 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,4 +1,5 @@ import contextlib +import json import os import re import subprocess @@ -6,6 +7,7 @@ import tarfile import time from pathlib import Path from typing import Any, Callable, Dict, List, Tuple, TypeVar +from urllib.parse import urlencode import allure from psycopg2.extensions import cursor @@ -184,6 +186,46 @@ def allure_attach_from_dir(dir: Path): allure.attach.file(source, name, attachment_type, extension) +DATASOURCE_ID = "xHHYY0dVz" + + +def allure_add_grafana_links(host: str, start_ms: int, end_ms: int): + """Add links to server logs in Grafana to Allure report""" + # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build + endpoint_id, region_id, _ = host.split(".", 2) + + expressions = { + "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}', + "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"', + "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"', + "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}', + } + + params: Dict[str, Any] = { + "datasource": 
DATASOURCE_ID, + "queries": [ + { + "expr": "", + "refId": "A", + "datasource": {"type": "loki", "uid": DATASOURCE_ID}, + "editorMode": "code", + "queryType": "range", + } + ], + "range": { + "from": str(start_ms), + "to": str(end_ms), + }, + } + for name, expr in expressions.items(): + params["queries"][0]["expr"] = expr + query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) + link = f"https://neonprod.grafana.net/explore?{query_string}" + + allure.dynamic.link(link, name=name) + log.info(f"{name}: {link}") + + def start_in_background( command: list[str], cwd: Path, log_file_name: str, is_started: Fn ) -> subprocess.Popen[bytes]: From 102746bc8f0e24c6ba1ddfb88dffcea5624e442b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 6 Apr 2023 18:57:48 +0300 Subject: [PATCH 238/426] Apply clippy rule exclusion locally instead of a global approach (#3974) --- libs/pq_proto/src/lib.rs | 3 +++ run_clippy.sh | 8 +------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index a976e19029..ed0239072a 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -293,6 +293,9 @@ impl FeStartupPacket { // We shouldn't advance `buf` as probably full message is not there yet, // so can't directly use Bytes::get_u32 etc. let len = (&buf[0..4]).read_u32::().unwrap() as usize; + // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)` + // which is less readable + #[allow(clippy::manual_range_contains)] if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { return Err(ProtocolError::Protocol(format!( "invalid startup packet message length {}", diff --git a/run_clippy.sh b/run_clippy.sh index ae9482ee96..9adfddedc2 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -8,13 +8,7 @@ # warnings and errors right in the editor. 
# In vscode, this setting is Rust-analyzer>Check On Save:Command -# manual-range-contains wants -# !(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len) -# instead of -# len < 4 || len > MAX_STARTUP_PACKET_LENGTH -# , let's disagree. - # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -A clippy::manual-range-contains -D warnings +cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings From 4d64edf8a552a9123590bc5a121bb79093741b68 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 6 Apr 2023 17:18:24 +0100 Subject: [PATCH 239/426] Nightly Benchmarks: Add free tier sized compute (#3969) - Add support for VMs and CU - Add free tier limited benchmark (0.25 CU) - Ensure we use 1 CU by default for pgbench workload --- .github/actions/neon-project-create/action.yml | 16 ++++++++++++++++ .github/workflows/benchmarking.yml | 11 ++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 0480bfbc84..ae6464990e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -14,6 +14,12 @@ inputs: api_host: desctiption: 'Neon API host' default: console.stage.neon.tech + provisioner: + desctiption: 'k8s-pod or k8s-neonvm' + default: 'k8s-pod' + compute_units: + desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + default: '[1, 1]' outputs: dsn: @@ -31,6 +37,10 @@ runs: # A shell without `set -x` to not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | + if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then + echo >&2 "For 
k8s-pod provisioner MIN_CU should be equal to MAX_CU" + fi + project=$(curl \ "https://${API_HOST}/api/v2/projects" \ --fail \ @@ -42,6 +52,9 @@ runs: \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", + \"provisioner\": \"${PROVISIONER}\", + \"autoscaling_limit_min_cu\": ${MIN_CU}, + \"autoscaling_limit_max_cu\": ${MAX_CU}, \"settings\": { } } }") @@ -62,3 +75,6 @@ runs: API_KEY: ${{ inputs.api_key }} REGION_ID: ${{ inputs.region_id }} POSTGRES_VERSION: ${{ inputs.postgres_version }} + PROVISIONER: ${{ inputs.provisioner }} + MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} + MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 16be60b1a1..425d4d76c9 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -111,6 +111,7 @@ jobs: strategy: fail-fast: false matrix: + # neon-captest-freetier: Run pgbench with freetier-limited compute # neon-captest-new: Run pgbench in a freshly created project # neon-captest-reuse: Same, but reusing existing project # neon-captest-prefetch: Same, with prefetching enabled (new project) @@ -120,6 +121,9 @@ jobs: db_size: [ 10gb ] runner: [ us-east-2 ] include: + - platform: neon-captest-freetier + db_size: 3gb + runner: us-east-2 - platform: neon-captest-prefetch db_size: 50gb runner: us-east-2 @@ -160,13 +164,14 @@ jobs: echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch", "neon-captest-freetier"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} 
api_key: ${{ secrets.NEON_STAGING_API_KEY }} + compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} - name: Set up Connection String id: set-up-connstr @@ -175,7 +180,7 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new | neon-captest-prefetch) + neon-captest-new | neon-captest-prefetch | neon-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -185,7 +190,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', neon-captest-freetier, 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac From ba4a96fdb1cc08098c57e5b4f75492c5ea30345b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 6 Apr 2023 20:57:06 +0300 Subject: [PATCH 240/426] Eagerly update wal_backup_lsn after each segment offload (#3976) Otherwise it can lag a lot, preventing WAL segments cleanup. Also max wal_backup_lsn on update, pulling it down is pointless. Should help with https://github.com/neondatabase/neon/issues/3957, but will not fix it completely. 
--- safekeeper/src/timeline.rs | 3 ++- safekeeper/src/wal_backup.rs | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 9dd8a63cf0..8097c863fa 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -674,7 +674,8 @@ impl Timeline { bail!(TimelineError::Cancelled(self.ttid)); } - self.write_shared_state().sk.inmem.backup_lsn = backup_lsn; + let mut state = self.write_shared_state(); + state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn); // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. Ok(()) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 798b9abaf3..163ac99be8 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -323,7 +323,8 @@ impl WalBackupTask { } match backup_lsn_range( - backup_lsn, + &self.timeline, + &mut backup_lsn, commit_lsn, self.wal_seg_size, &self.timeline_dir, @@ -331,13 +332,7 @@ impl WalBackupTask { ) .await { - Ok(backup_lsn_result) => { - backup_lsn = backup_lsn_result; - let res = self.timeline.set_wal_backup_lsn(backup_lsn_result); - if let Err(e) = res { - error!("failed to set wal_backup_lsn: {}", e); - return; - } + Ok(()) => { retry_attempt = 0; } Err(e) => { @@ -354,20 +349,25 @@ impl WalBackupTask { } pub async fn backup_lsn_range( - start_lsn: Lsn, + timeline: &Arc, + backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Path, workspace_dir: &Path, -) -> Result { - let mut res = start_lsn; +) -> Result<()> { + let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); for s in &segments { backup_single_segment(s, timeline_dir, workspace_dir) .await .with_context(|| format!("offloading segno {}", s.seg_no))?; - res = s.end_lsn; + let new_backup_lsn = s.end_lsn; + timeline + .set_wal_backup_lsn(new_backup_lsn) + .context("setting 
wal_backup_lsn")?; + *backup_lsn = new_backup_lsn; } info!( "offloaded segnos {:?} up to {}, previous backup_lsn {}", @@ -375,7 +375,7 @@ pub async fn backup_lsn_range( end_lsn, start_lsn, ); - Ok(res) + Ok(()) } async fn backup_single_segment( From b45c92e533d322cc693e23b6c5a19e662237c12e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 6 Apr 2023 21:21:39 +0300 Subject: [PATCH 241/426] tests: exclude compatibility tests by default (#3975) This allows skipping compatibility tests based on the `CHECK_ONDISK_DATA_COMPATIBILITY` environment variable. When the variable is missing (the default), compatibility tests won't be run. --- .github/workflows/build_and_test.yml | 1 + test_runner/regress/test_compatibility.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 56c0aa8f9a..c096aef4a9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -338,6 +338,7 @@ jobs: rerun_flaky: true env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} + CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - name: Merge and upload coverage data if: matrix.build_type == 'debug' diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e9dadb5348..be6e1a69b2 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -34,9 +34,15 @@ from pytest import FixtureRequest # - check_neon_works performs the test itself, feel free to add more checks there.
# +check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( + os.environ.get("CHECK_ONDISK_DATA_COMPATIBILITY") is None, + reason="CHECK_ONDISK_DATA_COMPATIBILITY env is not set", +) + # Note: if renaming this test, don't forget to update a reference to it in a workflow file: # "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml +@check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): @@ -81,6 +87,7 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it +@check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_backward_compatibility( @@ -134,6 +141,7 @@ def test_backward_compatibility( ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" +@check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_forward_compatibility( From e42982fb1e0178f5efe338a20e5fd8e593aa4ffb Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 6 Apr 2023 21:21:58 +0200 Subject: [PATCH 242/426] [compute_ctl] Empty computes and /configure API (#3963) This commit adds an option to start compute without spec and then pass it a valid spec via `POST /configure` API endpoint. This is a main prerequisite for maintaining the pool of compute nodes in the control-plane. For example: 1. 
Start compute with ```shell cargo run --bin compute_ctl -- -i no-compute \ -p http://localhost:9095 \ -D compute_pgdata \ -C "postgresql://cloud_admin@127.0.0.1:5434/postgres" \ -b ./pg_install/v15/bin/postgres ``` 2. Configure it with ```shell curl -d "{\"spec\": $(cat ./compute-spec.json)}" http://localhost:3080/configure ``` Internally, it's implemented using a `Condvar` + `Mutex`. Compute spec is moved under Mutex, as it can now be updated in the http handler. Also `RwLock` was replaced with `Mutex` because the latter works well with `Condvar`. First part of the neondatabase/cloud#4433 --- compute_tools/src/bin/compute_ctl.rs | 153 +++++++++++++--------- compute_tools/src/compute.rs | 157 ++++++++++++++--------- compute_tools/src/http/api.rs | 128 +++++++++++++++++- compute_tools/src/http/mod.rs | 2 + compute_tools/src/http/openapi_spec.yaml | 105 +++++++++++++-- compute_tools/src/http/requests.rs | 11 ++ compute_tools/src/http/responses.rs | 40 ++++++ compute_tools/src/monitor.rs | 4 +- compute_tools/src/pg_helpers.rs | 6 +- compute_tools/src/spec.rs | 50 +++++--- 10 files changed, 498 insertions(+), 158 deletions(-) create mode 100644 compute_tools/src/http/requests.rs create mode 100644 compute_tools/src/http/responses.rs diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index f29a576413..1a3ac77af4 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,13 +34,14 @@ use std::fs::File; use std::panic; use std::path::Path; use std::process::exit; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Condvar, Mutex}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; use tracing::{error, info}; +use url::Url; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; @@ -49,7 +50,6 @@ use compute_tools::monitor::launch_monitor; use
compute_tools::params::*; use compute_tools::pg_helpers::*; use compute_tools::spec::*; -use url::Url; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; @@ -62,7 +62,7 @@ fn main() -> Result<()> { let connstr = matches .get_one::("connstr") .expect("Postgres connection string is required"); - let spec = matches.get_one::("spec"); + let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); let compute_id = matches.get_one::("compute-id"); @@ -71,40 +71,107 @@ fn main() -> Result<()> { // Try to use just 'postgres' if no path is provided let pgbin = matches.get_one::("pgbin").unwrap(); - let spec: ComputeSpec = match spec { + let mut spec = Default::default(); + let mut spec_set = false; + let mut live_config_allowed = false; + match spec_json { // First, try to get cluster spec from the cli argument - Some(json) => serde_json::from_str(json)?, + Some(json) => { + spec = serde_json::from_str(json)?; + spec_set = true; + } None => { // Second, try to read it from the file if path is provided if let Some(sp) = spec_path { let path = Path::new(sp); let file = File::open(path)?; - serde_json::from_reader(file)? + spec = serde_json::from_reader(file)?; + spec_set = true; } else if let Some(id) = compute_id { if let Some(cp_base) = control_plane_uri { - let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec"); - let jwt: String = match std::env::var("NEON_CONSOLE_JWT") { - Ok(v) => v, - Err(_) => "".to_string(), - }; - - reqwest::blocking::Client::new() - .get(cp_uri) - .header("Authorization", jwt) - .send()? - .json()? 
+ live_config_allowed = true; + if let Ok(s) = get_spec_from_control_plane(cp_base, id) { + spec = s; + spec_set = true; + } } else { - panic!( - "must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"", - control_plane_uri, compute_id - ); + panic!("must specify both --control-plane-uri and --compute-id or none"); } } else { - panic!("compute spec should be provided via --spec or --spec-path argument"); + panic!( + "compute spec should be provided by one of the following ways: \ + --spec OR --spec-path OR --control-plane-uri and --compute-id" + ); } } }; + let mut new_state = ComputeState::new(); + if spec_set { + new_state.spec = spec; + } + let compute_node = ComputeNode { + start_time: Utc::now(), + connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, + pgdata: pgdata.to_string(), + pgbin: pgbin.to_string(), + live_config_allowed, + metrics: ComputeMetrics::default(), + state: Mutex::new(new_state), + state_changed: Condvar::new(), + }; + let compute = Arc::new(compute_node); + + // Launch http service first, so we were able to serve control-plane + // requests, while configuration is still in progress. + let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + + if !spec_set { + // No spec provided, hang waiting for it. + info!("no compute spec provided, waiting"); + let mut state = compute.state.lock().unwrap(); + while state.status != ComputeStatus::ConfigurationPending { + state = compute.state_changed.wait(state).unwrap(); + + if state.status == ComputeStatus::ConfigurationPending { + info!("got spec, continue configuration"); + // Spec is already set by the http server handler. + break; + } + } + } + + // We got all we need, fill in the state. 
+ let mut state = compute.state.lock().unwrap(); + let pageserver_connstr = state + .spec + .cluster + .settings + .find("neon.pageserver_connstring") + .expect("pageserver connstr should be provided"); + let storage_auth_token = state.spec.storage_auth_token.clone(); + let tenant = state + .spec + .cluster + .settings + .find("neon.tenant_id") + .expect("tenant id should be provided"); + let timeline = state + .spec + .cluster + .settings + .find("neon.timeline_id") + .expect("tenant id should be provided"); + let startup_tracing_context = state.spec.startup_tracing_context.clone(); + + state.pageserver_connstr = pageserver_connstr; + state.storage_auth_token = storage_auth_token; + state.tenant = tenant; + state.timeline = timeline; + state.status = ComputeStatus::Init; + compute.state_changed.notify_all(); + drop(state); + // Extract OpenTelemetry context for the startup actions from the spec, and // attach it to the current tracing context. // @@ -120,7 +187,7 @@ fn main() -> Result<()> { // postgres is configured and up-and-running, we exit this span. Any other // actions that are performed on incoming HTTP requests, for example, are // performed in separate spans. 
- let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + let startup_context_guard = if let Some(ref carrier) = startup_tracing_context { use opentelemetry::propagation::TextMapPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; Some(TraceContextPropagator::new().extract(carrier).attach()) @@ -128,41 +195,7 @@ fn main() -> Result<()> { None }; - let pageserver_connstr = spec - .cluster - .settings - .find("neon.pageserver_connstring") - .expect("pageserver connstr should be provided"); - let storage_auth_token = spec.storage_auth_token.clone(); - let tenant = spec - .cluster - .settings - .find("neon.tenant_id") - .expect("tenant id should be provided"); - let timeline = spec - .cluster - .settings - .find("neon.timeline_id") - .expect("tenant id should be provided"); - - let compute_state = ComputeNode { - start_time: Utc::now(), - connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - spec, - tenant, - timeline, - pageserver_connstr, - storage_auth_token, - metrics: ComputeMetrics::default(), - state: RwLock::new(ComputeState::new()), - }; - let compute = Arc::new(compute_state); - - // Launch service threads first, so we were able to serve availability - // requests, while configuration is still in progress. 
- let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + // Launch remaining service threads let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); // Start Postgres @@ -172,7 +205,7 @@ fn main() -> Result<()> { Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:?}", err); - let mut state = compute.state.write().unwrap(); + let mut state = compute.state.lock().unwrap(); state.error = Some(format!("{:?}", err)); state.status = ComputeStatus::Failed; drop(state); @@ -262,7 +295,7 @@ fn cli() -> clap::Command { Arg::new("control-plane-uri") .short('p') .long("control-plane-uri") - .value_name("CONTROL_PLANE"), + .value_name("CONTROL_PLANE_API_BASE_URI"), ) } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 00d1e234ab..3e92ec57dc 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -20,12 +20,12 @@ use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::RwLock; +use std::sync::{Condvar, Mutex}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use serde::{Serialize, Serializer}; +use serde::Serialize; use tokio_postgres; use tracing::{info, instrument, warn}; @@ -41,41 +41,52 @@ pub struct ComputeNode { pub connstr: url::Url, pub pgdata: String, pub pgbin: String, + pub metrics: ComputeMetrics, + /// We should only allow live re- / configuration of the compute node if + /// it uses 'pull model', i.e. it can go to control-plane and fetch + /// the latest configuration. 
Otherwise, there could be a case: + /// - we start compute with some spec provided as argument + /// - we push new spec and it does reconfiguration + /// - but then something happens and compute pod / VM is destroyed, + /// so k8s controller starts it again with the **old** spec + /// and the same for empty computes: + /// - we started compute without any spec + /// - we push spec and it does configuration + /// - but then it is restarted without any spec again + pub live_config_allowed: bool, + /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. + /// To allow HTTP API server to serving status requests, while configuration + /// is in progress, lock should be held only for short periods of time to do + /// read/write, not the whole configuration process. + pub state: Mutex, + /// `Condvar` to allow notifying waiters about state changes. + pub state_changed: Condvar, +} + +#[derive(Clone, Debug)] +pub struct ComputeState { + pub status: ComputeStatus, + /// Timestamp of the last Postgres activity + pub last_active: DateTime, + pub error: Option, pub spec: ComputeSpec, pub tenant: String, pub timeline: String, pub pageserver_connstr: String, pub storage_auth_token: Option, - pub metrics: ComputeMetrics, - /// Volatile part of the `ComputeNode` so should be used under `RwLock` - /// to allow HTTP API server to serve status requests, while configuration - /// is in progress. 
- pub state: RwLock, -} - -fn rfc3339_serialize(x: &DateTime, s: S) -> Result -where - S: Serializer, -{ - x.to_rfc3339().serialize(s) -} - -#[derive(Serialize)] -#[serde(rename_all = "snake_case")] -pub struct ComputeState { - pub status: ComputeStatus, - /// Timestamp of the last Postgres activity - #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, - pub error: Option, } impl ComputeState { pub fn new() -> Self { Self { - status: ComputeStatus::Init, + status: ComputeStatus::Empty, last_active: Utc::now(), error: None, + spec: ComputeSpec::default(), + tenant: String::new(), + timeline: String::new(), + pageserver_connstr: String::new(), + storage_auth_token: None, } } } @@ -86,11 +97,22 @@ impl Default for ComputeState { } } -#[derive(Serialize, Clone, Copy, PartialEq, Eq)] +#[derive(Serialize, Clone, Copy, PartialEq, Eq, Debug)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { + // Spec wasn't provided at start, waiting for it to be + // provided by control-plane. + Empty, + // Compute configuration was requested. + ConfigurationPending, + // Compute node has spec and initial startup and + // configuration is in progress. Init, + // Compute is configured and running. Running, + // Either startup or configuration failed, + // compute will exit soon or is waiting for + // control-plane to terminate it. Failed, } @@ -104,11 +126,13 @@ pub struct ComputeMetrics { impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { - self.state.write().unwrap().status = status; + let mut state = self.state.lock().unwrap(); + state.status = status; + self.state_changed.notify_all(); } pub fn get_status(&self) -> ComputeStatus { - self.state.read().unwrap().status + self.state.lock().unwrap().status } // Remove `pgdata` directory and create it again with right permissions. 
@@ -124,15 +148,15 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. - #[instrument(skip(self))] - fn get_basebackup(&self, lsn: &str) -> Result<()> { + #[instrument(skip(self, compute_state))] + fn get_basebackup(&self, compute_state: &ComputeState, lsn: &str) -> Result<()> { let start_time = Utc::now(); - let mut config = postgres::Config::from_str(&self.pageserver_connstr)?; + let mut config = postgres::Config::from_str(&compute_state.pageserver_connstr)?; // Use the storage auth token from the config file, if given. // Note: this overrides any password set in the connection string. - if let Some(storage_auth_token) = &self.storage_auth_token { + if let Some(storage_auth_token) = &compute_state.storage_auth_token { info!("Got storage auth token from spec file"); config.password(storage_auth_token); } else { @@ -141,8 +165,14 @@ impl ComputeNode { let mut client = config.connect(NoTls)?; let basebackup_cmd = match lsn { - "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute - _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), + "0/0" => format!( + "basebackup {} {}", + &compute_state.tenant, &compute_state.timeline + ), // First start of the compute + _ => format!( + "basebackup {} {} {}", + &compute_state.tenant, &compute_state.timeline, lsn + ), }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; @@ -169,14 +199,14 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. 
- #[instrument(skip(self))] - fn sync_safekeepers(&self) -> Result { + #[instrument(skip(self, storage_auth_token))] + fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode - .envs(if let Some(storage_auth_token) = &self.storage_auth_token { + .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { vec![] @@ -217,9 +247,9 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. - #[instrument(skip(self))] - pub fn prepare_pgdata(&self) -> Result<()> { - let spec = &self.spec; + #[instrument(skip(self, compute_state))] + pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { + let spec = &compute_state.spec; let pgdata_path = Path::new(&self.pgdata); // Remove/create an empty pgdata directory and put configuration there. @@ -228,18 +258,18 @@ impl ComputeNode { info!("starting safekeepers syncing"); let lsn = self - .sync_safekeepers() + .sync_safekeepers(compute_state.storage_auth_token.clone()) .with_context(|| "failed to sync safekeepers")?; info!("safekeepers synced at LSN {}", lsn); info!( "getting basebackup@{} from pageserver {}", - lsn, &self.pageserver_connstr + lsn, &compute_state.pageserver_connstr ); - self.get_basebackup(&lsn).with_context(|| { + self.get_basebackup(compute_state, &lsn).with_context(|| { format!( "failed to get basebackup@{} from pageserver {}", - lsn, &self.pageserver_connstr + lsn, &compute_state.pageserver_connstr ) })?; @@ -252,13 +282,16 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. 
#[instrument(skip(self))] - pub fn start_postgres(&self) -> Result { + pub fn start_postgres( + &self, + storage_auth_token: Option, + ) -> Result { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. let mut pg = Command::new(&self.pgbin) .args(["-D", &self.pgdata]) - .envs(if let Some(storage_auth_token) = &self.storage_auth_token { + .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { vec![] @@ -271,8 +304,9 @@ impl ComputeNode { Ok(pg) } - #[instrument(skip(self))] - pub fn apply_config(&self) -> Result<()> { + /// Do initial configuration of the already started Postgres. + #[instrument(skip(self, compute_state))] + pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { // If connection fails, // it may be the old node with `zenith_admin` superuser. // @@ -303,19 +337,19 @@ impl ComputeNode { }; // Proceed with post-startup configuration. Note, that order of operations is important. 
- handle_roles(&self.spec, &mut client)?; - handle_databases(&self.spec, &mut client)?; - handle_role_deletions(self, &mut client)?; - handle_grants(self, &mut client)?; + handle_roles(&compute_state.spec, &mut client)?; + handle_databases(&compute_state.spec, &mut client)?; + handle_role_deletions(&compute_state.spec, self.connstr.as_str(), &mut client)?; + handle_grants(&compute_state.spec, self.connstr.as_str(), &mut client)?; create_writability_check_data(&mut client)?; - handle_extensions(&self.spec, &mut client)?; + handle_extensions(&compute_state.spec, &mut client)?; // 'Close' connection drop(client); info!( "finished configuration of compute for project {}", - self.spec.cluster.cluster_id + compute_state.spec.cluster.cluster_id ); Ok(()) @@ -323,21 +357,22 @@ impl ComputeNode { #[instrument(skip(self))] pub fn start_compute(&self) -> Result { + let compute_state = self.state.lock().unwrap().clone(); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - self.spec.cluster.cluster_id, - self.spec.operation_uuid.as_ref().unwrap(), - self.tenant, - self.timeline, + compute_state.spec.cluster.cluster_id, + compute_state.spec.operation_uuid.as_ref().unwrap(), + compute_state.tenant, + compute_state.timeline, ); - self.prepare_pgdata()?; + self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres()?; + let pg = self.start_postgres(compute_state.storage_auth_token.clone())?; - self.apply_config()?; + self.apply_config(&compute_state)?; let startup_end_time = Utc::now(); self.metrics.config_ms.store( diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 199e0f3bd0..8620b10636 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,12 +3,16 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; -use crate::compute::ComputeNode; +use crate::compute::{ComputeNode, ComputeStatus}; +use crate::http::requests::ConfigurationRequest; +use 
crate::http::responses::{ComputeStatusResponse, GenericAPIError}; + use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use num_cpus; use serde_json; +use tokio::task; use tracing::{error, info}; use tracing_utils::http::OtelName; @@ -23,8 +27,10 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /status GET request"); - let state = compute.state.read().unwrap(); - Response::new(Body::from(serde_json::to_string(&*state).unwrap())) + let state = compute.state.lock().unwrap(); + let status_response = ComputeStatusResponse::from(state.clone()); + + Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) } // Startup metrics in JSON format. Keep /metrics reserved for a possible @@ -37,12 +43,29 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /insights GET request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!("compute is not running, current status: {:?}", status); + error!(msg); + return Response::new(Body::from(msg)); + } + let insights = compute.collect_insights().await; Response::new(Body::from(insights)) } (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for check_writability request: {:?}", + status + ); + error!(msg); + return Response::new(Body::from(msg)); + } + let res = crate::checker::check_writability(compute).await; match res { Ok(_) => Response::new(Body::from("true")), @@ -61,6 +84,23 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /configure POST request"); + match handle_configure_request(req, compute).await { + Ok(msg) => Response::new(Body::from(msg)), + Err((msg, code)) => { + error!("error handling /configure request: {msg}"); + 
render_json_error(&msg, code) + } + } + } + // Return the `404 Not Found` for any other routes. _ => { let mut not_found = Response::new(Body::from("404 Not Found")); @@ -70,6 +110,88 @@ async fn routes(req: Request, compute: &Arc) -> Response, + compute: &Arc, +) -> Result { + if !compute.live_config_allowed { + return Err(( + "live configuration is not allowed for this compute node".to_string(), + StatusCode::PRECONDITION_FAILED, + )); + } + + let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap(); + if let Ok(request) = serde_json::from_str::(&spec_raw) { + let spec = request.spec; + // XXX: wrap state update under lock in code blocks. Otherwise, + // we will try to `Send` `mut state` into the spawned thread + // bellow, which will cause error: + // ``` + // error: future cannot be sent between threads safely + // ``` + { + let mut state = compute.state.lock().unwrap(); + if state.status != ComputeStatus::Empty { + let msg = format!( + "invalid compute status for configuration request: {:?}", + state.status.clone() + ); + return Err((msg, StatusCode::PRECONDITION_FAILED)); + } + state.spec = spec; + state.status = ComputeStatus::ConfigurationPending; + compute.state_changed.notify_all(); + drop(state); + info!("set new spec and notified waiters"); + } + + // Spawn a blocking thread to wait for compute to become Running. + // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. 
+ let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Running { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become Running, current status: {:?}", + state.status + ); + + if state.status == ComputeStatus::Failed { + let err = state.error.clone().unwrap_or("unknown error".to_string()); + let msg = format!("compute configuration failed: {:?}", err); + return Err((msg, StatusCode::INTERNAL_SERVER_ERROR)); + } + } + + Ok(()) + }) + .await + .unwrap()?; + + // Return current compute state if everything went well. + let state = compute.state.lock().unwrap().clone(); + let status_response = ComputeStatusResponse::from(state); + Ok(serde_json::to_string(&status_response).unwrap()) + } else { + Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST)) + } +} + +fn render_json_error(e: &str, status: StatusCode) -> Response { + let error = GenericAPIError { + error: e.to_string(), + }; + Response::builder() + .status(status) + .body(Body::from(serde_json::to_string(&error).unwrap())) + .unwrap() +} + // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] async fn serve(state: Arc) { diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index e5fdf85eed..e54b4e3341 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1 +1,3 @@ pub mod api; +pub mod requests; +pub mod responses; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 5c74dfd2d2..bdb09d4a6b 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -11,7 +11,7 @@ paths: get: tags: - Info - summary: Get compute node internal status + summary: Get compute node internal status. 
description: "" operationId: getComputeStatus responses: @@ -26,7 +26,7 @@ paths: get: tags: - Info - summary: Get compute node startup metrics in JSON format + summary: Get compute node startup metrics in JSON format. description: "" operationId: getComputeMetricsJSON responses: @@ -41,9 +41,9 @@ paths: get: tags: - Info - summary: Get current compute insights in JSON format + summary: Get current compute insights in JSON format. description: | - Note, that this doesn't include any historical data + Note, that this doesn't include any historical data. operationId: getComputeInsights responses: 200: @@ -56,12 +56,12 @@ paths: /info: get: tags: - - "info" - summary: Get info about the compute Pod/VM + - Info + summary: Get info about the compute pod / VM. description: "" operationId: getInfo responses: - "200": + 200: description: Info content: application/json: @@ -72,7 +72,7 @@ paths: post: tags: - Check - summary: Check that we can write new data on this compute + summary: Check that we can write new data on this compute. description: "" operationId: checkComputeWritability responses: @@ -82,9 +82,64 @@ paths: text/plain: schema: type: string - description: Error text or 'true' if check passed + description: Error text or 'true' if check passed. example: "true" + /configure: + post: + tags: + - Configure + summary: Perform compute node configuration. + description: | + This is a blocking API endpoint, i.e. it blocks waiting until + compute is finished configuration and is in `Running` state. + Optional non-blocking mode could be added later. + operationId: configureCompute + requestBody: + description: Configuration request. + required: true + content: + application/json: + schema: + type: object + required: + - spec + properties: + spec: + # XXX: I don't want to explain current spec in the OpenAPI format, + # as it could be changed really soon. Consider doing it later. + type: object + responses: + 200: + description: Compute configuration finished. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/ComputeState" + 400: + description: Provided spec is invalid. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 412: + description: | + It's not possible to do live-configuration of the compute. + It's either in the wrong state, or compute doesn't use pull + mode of configuration. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: | + Compute configuration request was processed, but error + occurred. Compute will likely shutdown soon. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: @@ -95,7 +150,7 @@ components: schemas: ComputeMetrics: type: object - description: Compute startup metrics + description: Compute startup metrics. required: - sync_safekeepers_ms - basebackup_ms @@ -113,7 +168,7 @@ components: Info: type: object - description: Information about VM/Pod + description: Information about VM/Pod. required: - num_cpus properties: @@ -130,17 +185,26 @@ components: $ref: '#/components/schemas/ComputeStatus' last_active: type: string - description: The last detected compute activity timestamp in UTC and RFC3339 format + description: The last detected compute activity timestamp in UTC and RFC3339 format. example: "2022-10-12T07:20:50.52Z" error: type: string - description: Text of the error during compute startup, if any + description: Text of the error during compute startup, if any. + example: "" + tenant: + type: string + description: Identifier of the current tenant served by compute node, if any. + example: c9269c359e9a199fad1ea0981246a78f + timeline: + type: string + description: Identifier of the current timeline served by compute node, if any. 
+ example: ece7de74d4b8cbe5433a68ce4d1b97b4 ComputeInsights: type: object properties: pg_stat_statements: - description: Contains raw output from pg_stat_statements in JSON format + description: Contains raw output from pg_stat_statements in JSON format. type: array items: type: object @@ -151,6 +215,19 @@ components: - init - failed - running + example: running + + # + # Errors + # + + GenericError: + type: object + required: + - error + properties: + error: + type: string security: - JWT: [] diff --git a/compute_tools/src/http/requests.rs b/compute_tools/src/http/requests.rs new file mode 100644 index 0000000000..2e41c7aea4 --- /dev/null +++ b/compute_tools/src/http/requests.rs @@ -0,0 +1,11 @@ +use serde::Deserialize; + +use crate::spec::ComputeSpec; + +/// We now pass only `spec` in the configuration request, but later we can +/// extend it and something like `restart: bool` or something else. So put +/// `spec` into a struct initially to be more flexible in the future. +#[derive(Deserialize, Debug)] +pub struct ConfigurationRequest { + pub spec: ComputeSpec, +} diff --git a/compute_tools/src/http/responses.rs b/compute_tools/src/http/responses.rs new file mode 100644 index 0000000000..1ef4b380a9 --- /dev/null +++ b/compute_tools/src/http/responses.rs @@ -0,0 +1,40 @@ +use serde::{Serialize, Serializer}; + +use chrono::{DateTime, Utc}; + +use crate::compute::{ComputeState, ComputeStatus}; + +#[derive(Serialize, Debug)] +pub struct GenericAPIError { + pub error: String, +} + +#[derive(Serialize, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ComputeStatusResponse { + pub tenant: String, + pub timeline: String, + pub status: ComputeStatus, + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +impl From for ComputeStatusResponse { + fn from(state: ComputeState) -> Self { + ComputeStatusResponse { + tenant: state.tenant, + timeline: state.timeline, + status: state.status, + last_active: state.last_active, 
+ error: state.error, + } + } +} + +fn rfc3339_serialize(x: &DateTime, s: S) -> Result +where + S: Serializer, +{ + x.to_rfc3339().serialize(s) +} diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 7c9878ffcf..a30b52aed4 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &ComputeNode) { AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? &[], ); - let mut last_active = compute.state.read().unwrap().last_active; + let mut last_active = compute.state.lock().unwrap().last_active; if let Ok(backs) = backends { let mut idle_backs: Vec> = vec![]; @@ -87,7 +87,7 @@ fn watch_compute_activity(compute: &ComputeNode) { } // Update the last activity in the shared state if we got a more recent one. - let mut state = compute.state.write().unwrap(); + let mut state = compute.state.lock().unwrap(); if last_active > state.last_active { state.last_active = last_active; debug!("set the last compute activity time to: {}", last_active); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 01b192b2de..38d1a6d777 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -17,7 +17,7 @@ const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // mil /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -26,7 +26,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. 
-#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -36,7 +36,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 9694ba9a88..b7f15a99d1 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -8,14 +8,13 @@ use postgres::{Client, NoTls}; use serde::Deserialize; use tracing::{info, info_span, instrument, span_enabled, warn, Level}; -use crate::compute::ComputeNode; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug, Default)] pub struct ComputeSpec { pub format_version: f32, pub timestamp: String, @@ -31,7 +30,7 @@ pub struct ComputeSpec { /// Cluster state seen from the perspective of the external tools /// like Rails web console. -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug, Default)] pub struct Cluster { pub cluster_id: String, pub name: String, @@ -47,13 +46,36 @@ pub struct Cluster { /// - DROP ROLE /// - ALTER ROLE name RENAME TO new_name /// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug)] pub struct DeltaOp { pub action: String, pub name: PgIdent, pub new_name: Option, } +/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` +/// env variable is set, it will be used for authorization. 
+pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result { + let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec"); + let jwt: String = match std::env::var("NEON_CONSOLE_JWT") { + Ok(v) => v, + Err(_) => "".to_string(), + }; + info!("getting spec from control plane: {}", cp_uri); + + // TODO: check the response. We should distinguish cases when it's + // - network error, then retry + // - no spec for compute yet, then wait + // - compute id is unknown or any other error, then bail out + let spec = reqwest::blocking::Client::new() + .get(cp_uri) + .header("Authorization", jwt) + .send()? + .json()?; + + Ok(spec) +} + /// It takes cluster specification and does the following: /// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file. /// - Update `pg_hba.conf` to allow external connections. @@ -226,8 +248,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Reassign all dependent objects and delete requested roles. #[instrument(skip_all)] -pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { - if let Some(ops) = &node.spec.delta_operations { +pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { + if let Some(ops) = &spec.delta_operations { // First, reassign all dependent objects to db owners. info!("reassigning dependent objects of to-be-deleted roles"); @@ -244,7 +266,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< // Check that role is still present in Postgres, as this could be a // restart with the same spec after role deletion. 
if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) { - reassign_owned_objects(node, &op.name)?; + reassign_owned_objects(spec, connstr, &op.name)?; } } @@ -268,10 +290,10 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< } // Reassign all owned objects in all databases to the owner of the database. -fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> { - for db in &node.spec.cluster.databases { +fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { + for db in &spec.cluster.databases { if db.owner != *role_name { - let mut conf = Config::from_str(node.connstr.as_str())?; + let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); let mut client = conf.connect(NoTls)?; @@ -416,9 +438,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. #[instrument(skip_all)] -pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { - let spec = &node.spec; - +pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { info!("cluster spec grants:"); // We now have a separate `web_access` role to connect to the database @@ -450,8 +470,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { // Do some per-database access adjustments. We'd better do this at db creation time, // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants // atomically. 
- for db in &node.spec.cluster.databases { - let mut conf = Config::from_str(node.connstr.as_str())?; + for db in &spec.cluster.databases { + let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); let mut db_client = conf.connect(NoTls)?; From 6d01d835a89d99d2314a49eb00bd4afaceee224a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 4 Apr 2023 16:52:40 +0300 Subject: [PATCH 243/426] [proxy] Report error if proxy_io_bytes_per_client metric has decreased --- proxy/src/metrics.rs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index be22c45836..445c2e930c 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -5,7 +5,7 @@ use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; use std::collections::HashMap; -use tracing::{debug, error, info, instrument, trace}; +use tracing::{error, info, instrument, trace, warn}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -84,10 +84,14 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { let value = ms.get_counter().get_value() as u64; - debug!( - "branch_id {} endpoint_id {} val: {}", - branch_id, endpoint_id, value - ); + // Report if the metric value is suspiciously large + if value > (1u64 << 40) { + warn!( + "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}", + branch_id, endpoint_id, value + ); + } + current_metrics.push(( Ids { endpoint_id: endpoint_id.to_string(), @@ -124,11 +128,15 @@ async fn collect_metrics_iteration( let mut value = *curr_val; if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) { - // Only send metrics updates if the metric has changed - if curr_val - prev_val > 0 { + // Only send metrics updates if the metric has increased + if curr_val > prev_val { value = curr_val - prev_val; start_time = *prev_time; } else { + if 
curr_val < prev_val { + error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}", + prev_val, curr_val, curr_key); + } return None; } }; @@ -189,7 +197,7 @@ async fn collect_metrics_iteration( }) // update cached value (add delta) and time .and_modify(|e| { - e.0 += send_metric.value; + e.0 = e.0.saturating_add(send_metric.value); e.1 = stop_time }) // cache new metric From b1c2a6384ae8fd9f1da4b3d186eb48774e0da5d7 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 6 Apr 2023 13:59:45 +0300 Subject: [PATCH 244/426] Set non-wildcard common names in link auth proxy Old coding here ignored non-wildcard common names and passed None instead. With my recent changes I started throwing an error in that case. Old logic doesn't seem to be a great choice, so instead of passing None I actually set non-wildcard common names too. That way it is possible to avoid handling cases with None in downstream code. --- proxy/src/config.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5f9585149e..ad51502b49 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -119,7 +119,18 @@ impl CertResolver { ))? .1; let common_name = pem.parse_x509()?.subject().to_string(); - common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + + // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as + // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so + // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names + // and passed None instead, which blows up number of cases downstream code should handle. Proper coding + // here should better avoid Option for common_names, and do wildcard-based certificate selection instead + // of cutting off '*.' parts. 
+ if common_name.starts_with("CN=*.") { + common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + } else { + common_name.strip_prefix("CN=").map(|s| s.to_string()) + } } .context(format!( "Failed to parse common name from certificate at '{cert_path}'." From bfeb428d1b367351df11b7b17a75cff1c3c64ff6 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 7 Apr 2023 13:47:28 +0300 Subject: [PATCH 245/426] tests: make neon_fixtures a bit thinner by splitting out some pageserver related helpers (#3977) neon_fixture is quite big and messy, lets clean it up a bit. --- test_runner/fixtures/neon_fixtures.py | 702 +----------------- test_runner/fixtures/pageserver/__init__.py | 0 test_runner/fixtures/pageserver/http.py | 545 ++++++++++++++ test_runner/fixtures/pageserver/utils.py | 145 ++++ test_runner/fixtures/utils.py | 16 + .../performance/test_branch_creation.py | 2 +- test_runner/regress/test_auth.py | 3 +- test_runner/regress/test_compatibility.py | 5 +- .../regress/test_disk_usage_eviction.py | 10 +- test_runner/regress/test_import.py | 3 +- test_runner/regress/test_layer_eviction.py | 3 +- test_runner/regress/test_neon_cli.py | 2 +- test_runner/regress/test_normal_work.py | 3 +- test_runner/regress/test_ondemand_download.py | 12 +- test_runner/regress/test_pageserver_api.py | 2 +- test_runner/regress/test_read_trace.py | 3 +- test_runner/regress/test_readonly_node.py | 3 +- test_runner/regress/test_remote_storage.py | 33 +- test_runner/regress/test_tenant_conf.py | 3 +- test_runner/regress/test_tenant_detach.py | 8 +- test_runner/regress/test_tenant_relocation.py | 12 +- test_runner/regress/test_tenant_size.py | 2 +- .../test_tenants_with_remote_storage.py | 6 +- test_runner/regress/test_timeline_delete.py | 3 +- test_runner/regress/test_timeline_size.py | 26 +- test_runner/regress/test_wal_acceptor.py | 3 +- .../test_walredo_not_left_behind_on_detach.py | 3 +- 27 files changed, 779 insertions(+), 779 deletions(-) create mode 100644 
test_runner/fixtures/pageserver/__init__.py create mode 100644 test_runner/fixtures/pageserver/http.py create mode 100644 test_runner/fixtures/pageserver/utils.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c24158e9ec..5b6f2e5c96 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,7 +14,6 @@ import tempfile import textwrap import time import uuid -from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass, field from datetime import datetime @@ -44,11 +43,11 @@ from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal from fixtures.log_helper import log -from fixtures.metrics import Metrics, parse_metrics +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, - Fn, allure_add_grafana_links, allure_attach_from_dir, get_self_dir, @@ -1120,538 +1119,6 @@ def neon_env_builder( yield builder -class PageserverApiException(Exception): - def __init__(self, message, status_code: int): - super().__init__(message) - self.status_code = status_code - - -class PageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): - super().__init__() - self.port = port - self.auth_token = auth_token - self.is_testing_enabled_or_skip = is_testing_enabled_or_skip - - if auth_token is not None: - self.headers["Authorization"] = f"Bearer {auth_token}" - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise PageserverApiException(msg, res.status_code) from e - - def check_status(self): - 
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - - def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): - self.is_testing_enabled_or_skip() - - if isinstance(config_strings, tuple): - pairs = [config_strings] - else: - pairs = config_strings - - log.info(f"Requesting config failpoints: {repr(pairs)}") - - res = self.put( - f"http://localhost:{self.port}/v1/failpoints", - json=[{"name": name, "actions": actions} for name, actions in pairs], - ) - log.info(f"Got failpoints request response code {res.status_code}") - self.verbose_error(res) - res_json = res.json() - assert res_json is None - return res_json - - def tenant_list(self) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json={ - "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, - }, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return TenantId(new_tenant_id) - - def tenant_attach(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") - self.verbose_error(res) - - def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): - params = {} - if detach_ignored: - params["detach_ignored"] = "true" - - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) - self.verbose_error(res) - - def tenant_load(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") - self.verbose_error(res) - - def tenant_ignore(self, tenant_id: TenantId): - res = 
self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") - self.verbose_error(res) - - def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def tenant_config(self, tenant_id: TenantId) -> TenantConfig: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") - self.verbose_error(res) - return TenantConfig.from_json(res.json()) - - def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): - assert "tenant_id" not in config.keys() - res = self.put( - f"http://localhost:{self.port}/v1/tenant/config", - json={**config, "tenant_id": str(tenant_id)}, - ) - self.verbose_error(res) - - def patch_tenant_config_client_side( - self, - tenant_id: TenantId, - inserts: Optional[Dict[str, Any]] = None, - removes: Optional[List[str]] = None, - ): - current = self.tenant_config(tenant_id).tenant_specific_overrides - if inserts is not None: - current.update(inserts) - if removes is not None: - for key in removes: - del current[key] - self.set_tenant_config(tenant_id, current) - - def tenant_size(self, tenant_id: TenantId) -> int: - return self.tenant_size_and_modelinputs(tenant_id)[0] - - def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]: - """ - Returns the tenant size, together with the model inputs as the second tuple item. 
- """ - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size") - self.verbose_error(res) - res = res.json() - assert isinstance(res, dict) - assert TenantId(res["id"]) == tenant_id - size = res["size"] - assert type(size) == int - inputs = res["inputs"] - assert type(inputs) is dict - return (size, inputs) - - def tenant_size_debug(self, tenant_id: TenantId) -> str: - """ - Returns the tenant size debug info, as an HTML string - """ - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size", - headers={"Accept": "text/html"}, - ) - return res.text - - def timeline_list( - self, - tenant_id: TenantId, - include_non_incremental_logical_size: bool = False, - include_timeline_dir_layer_file_size_sum: bool = False, - ) -> List[Dict[str, Any]]: - params = {} - if include_non_incremental_logical_size: - params["include-non-incremental-logical-size"] = "true" - if include_timeline_dir_layer_file_size_sum: - params["include-timeline-dir-layer-file-size-sum"] = "true" - - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_create( - self, - tenant_id: TenantId, - new_timeline_id: Optional[TimelineId] = None, - ancestor_timeline_id: Optional[TimelineId] = None, - ancestor_start_lsn: Optional[Lsn] = None, - ) -> Dict[Any, Any]: - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", - json={ - "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, - "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, - "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, - }, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") - - res_json = res.json() - assert 
isinstance(res_json, dict) - return res_json - - def timeline_detail( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - include_non_incremental_logical_size: bool = False, - include_timeline_dir_layer_file_size_sum: bool = False, - **kwargs, - ) -> Dict[Any, Any]: - params = {} - if include_non_incremental_logical_size: - params["include-non-incremental-logical-size"] = "true" - if include_timeline_dir_layer_file_size_sum: - params["include-timeline-dir-layer-file-size-sum"] = "true" - - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", - params=params, - **kwargs, - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): - res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" - ) - self.verbose_error(res) - res_json = res.json() - assert res_json is None - - def timeline_gc( - self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] - ) -> dict[str, Any]: - self.is_testing_enabled_or_skip() - - log.info( - f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" - ) - res = self.put( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", - json={"gc_horizon": gc_horizon}, - ) - log.info(f"Got GC request response code: {res.status_code}") - self.verbose_error(res) - res_json = res.json() - assert res_json is not None - assert isinstance(res_json, dict) - return res_json - - def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): - self.is_testing_enabled_or_skip() - - log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") - res = self.put( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" - ) - log.info(f"Got compact request response code: {res.status_code}") - 
self.verbose_error(res) - res_json = res.json() - assert res_json is None - - def timeline_get_lsn_by_timestamp( - self, tenant_id: TenantId, timeline_id: TimelineId, timestamp - ): - log.info( - f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" - ) - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", - ) - self.verbose_error(res) - res_json = res.json() - return res_json - - def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): - self.is_testing_enabled_or_skip() - - log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") - res = self.put( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" - ) - log.info(f"Got checkpoint request response code: {res.status_code}") - self.verbose_error(res) - res_json = res.json() - assert res_json is None - - def timeline_spawn_download_remote_layers( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - max_concurrent_downloads: int, - ) -> dict[str, Any]: - body = { - "max_concurrent_downloads": max_concurrent_downloads, - } - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", - json=body, - ) - self.verbose_error(res) - res_json = res.json() - assert res_json is not None - assert isinstance(res_json, dict) - return res_json - - def timeline_poll_download_remote_layers_status( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - spawn_response: dict[str, Any], - poll_state=None, - ) -> None | dict[str, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", - ) - self.verbose_error(res) - res_json = res.json() - assert res_json is not None - assert isinstance(res_json, dict) - - # assumption in this API client here is that nobody else spawns the task - assert 
res_json["task_id"] == spawn_response["task_id"] - - if poll_state is None or res_json["state"] == poll_state: - return res_json - return None - - def timeline_download_remote_layers( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - max_concurrent_downloads: int, - errors_ok=False, - at_least_one_download=True, - ): - res = self.timeline_spawn_download_remote_layers( - tenant_id, timeline_id, max_concurrent_downloads - ) - while True: - completed = self.timeline_poll_download_remote_layers_status( - tenant_id, timeline_id, res, poll_state="Completed" - ) - if not completed: - time.sleep(0.1) - continue - if not errors_ok: - assert completed["failed_download_count"] == 0 - if at_least_one_download: - assert completed["successful_download_count"] > 0 - return completed - - def get_metrics_str(self) -> str: - """You probably want to use get_metrics() instead.""" - res = self.get(f"http://localhost:{self.port}/metrics") - self.verbose_error(res) - return res.text - - def get_metrics(self) -> Metrics: - res = self.get_metrics_str() - return parse_metrics(res) - - def get_timeline_metric( - self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str - ) -> float: - metrics = self.get_metrics() - return metrics.query_one( - metric_name, - filter={ - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - }, - ).value - - def get_remote_timeline_client_metric( - self, - metric_name: str, - tenant_id: TenantId, - timeline_id: TimelineId, - file_kind: str, - op_kind: str, - ) -> Optional[float]: - metrics = self.get_metrics() - matches = metrics.query_all( - name=metric_name, - filter={ - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "file_kind": str(file_kind), - "op_kind": str(op_kind), - }, - ) - if len(matches) == 0: - value = None - elif len(matches) == 1: - value = matches[0].value - assert value is not None - else: - assert len(matches) < 2, "above filter should uniquely identify metric" - return value - - def 
get_metric_value( - self, name: str, filter: Optional[Dict[str, str]] = None - ) -> Optional[float]: - metrics = self.get_metrics() - results = metrics.query_all(name, filter=filter) - if not results: - log.info(f'could not find metric "{name}"') - return None - assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" - return results[0].value - - def layer_map_info( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> LayerMapInfo: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/", - ) - self.verbose_error(res) - return LayerMapInfo.from_json(res.json()) - - def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", - ) - self.verbose_error(res) - - assert res.status_code == 200 - - def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): - res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", - ) - self.verbose_error(res) - - assert res.status_code == 200 - - def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): - info = self.layer_map_info(tenant_id, timeline_id) - for layer in info.historic_layers: - self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) - - def disk_usage_eviction_run(self, request: dict[str, Any]): - res = self.put( - f"http://localhost:{self.port}/v1/disk_usage_eviction/run", - json=request, - ) - self.verbose_error(res) - return res.json() - - def tenant_break(self, tenant_id: TenantId): - res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") - self.verbose_error(res) - - -@dataclass -class TenantConfig: - tenant_specific_overrides: Dict[str, Any] - effective_config: Dict[str, Any] - - @classmethod - def from_json(cls, d: Dict[str, Any]) -> TenantConfig: - 
return TenantConfig( - tenant_specific_overrides=d["tenant_specific_overrides"], - effective_config=d["effective_config"], - ) - - -@dataclass -class LayerMapInfo: - in_memory_layers: List[InMemoryLayerInfo] - historic_layers: List[HistoricLayerInfo] - - @classmethod - def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo: - info = LayerMapInfo(in_memory_layers=[], historic_layers=[]) - - json_in_memory_layers = d["in_memory_layers"] - assert isinstance(json_in_memory_layers, List) - for json_in_memory_layer in json_in_memory_layers: - info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer)) - - json_historic_layers = d["historic_layers"] - assert isinstance(json_historic_layers, List) - for json_historic_layer in json_historic_layers: - info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer)) - - return info - - def kind_count(self) -> Dict[str, int]: - counts: Dict[str, int] = defaultdict(int) - for inmem_layer in self.in_memory_layers: - counts[inmem_layer.kind] += 1 - for hist_layer in self.historic_layers: - counts[hist_layer.kind] += 1 - return counts - - -@dataclass -class InMemoryLayerInfo: - kind: str - lsn_start: str - lsn_end: Optional[str] - - @classmethod - def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo: - return InMemoryLayerInfo( - kind=d["kind"], - lsn_start=d["lsn_start"], - lsn_end=d.get("lsn_end"), - ) - - -@dataclass(frozen=True) -class HistoricLayerInfo: - kind: str - layer_file_name: str - layer_file_size: Optional[int] - lsn_start: str - lsn_end: Optional[str] - remote: bool - - @classmethod - def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: - return HistoricLayerInfo( - kind=d["kind"], - layer_file_name=d["layer_file_name"], - layer_file_size=d.get("layer_file_size"), - lsn_start=d["lsn_start"], - lsn_end=d.get("lsn_end"), - remote=d["remote"], - ) - - @dataclass class PageserverPort: pg: int @@ -3386,151 +2853,6 @@ def check_restored_datadir_content( assert (mismatch, error) 
== ([], []) -def wait_until(number_of_iterations: int, interval: float, func): - """ - Wait until 'func' returns successfully, without exception. Returns the - last return value from the function. - """ - last_exception = None - for i in range(number_of_iterations): - try: - res = func() - except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) - last_exception = e - time.sleep(interval) - continue - return res - raise Exception("timed out while waiting for %s" % func) from last_exception - - -def wait_while(number_of_iterations: int, interval: float, func): - """ - Wait until 'func' returns false, or throws an exception. - """ - for i in range(number_of_iterations): - try: - if not func(): - return - log.info("waiting for %s iteration %s failed", func, i + 1) - time.sleep(interval) - continue - except Exception: - return - raise Exception("timed out while waiting for %s" % func) - - -def assert_tenant_status( - pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str -): - tenant_status = pageserver_http_client.tenant_status(tenant) - log.info(f"tenant_status: {tenant_status}") - assert tenant_status["state"] == expected_status, tenant_status - - -def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId): - tenants = ps_http.tenant_list() - matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] - assert len(matching) < 2 - if len(matching) == 0: - return None - return matching[0] - - -def remote_consistent_lsn( - pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId -) -> Lsn: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - if detail["remote_consistent_lsn"] is None: - # No remote information at all. This happens right after creating - # a timeline, before any part of it has been uploaded to remote - # storage yet. 
- return Lsn(0) - else: - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return Lsn(lsn_str) - - -def wait_for_upload( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, - timeline: TimelineId, - lsn: Lsn, -): - """waits for local timeline upload up to specified lsn""" - for i in range(20): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - log.info("wait finished") - return - log.info( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 - ) - ) - time.sleep(1) - raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn, current_lsn - ) - ) - - -# Does not use `wait_until` for debugging purposes -def wait_until_tenant_state( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - expected_state: str, - iterations: int, -) -> bool: - for _ in range(iterations): - try: - tenant = pageserver_http.tenant_status(tenant_id=tenant_id) - log.debug(f"Tenant {tenant_id} data: {tenant}") - if tenant["state"] == expected_state: - return True - except Exception as e: - log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") - - time.sleep(1) - - raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") - - -def last_record_lsn( - pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId -) -> Lsn: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["last_record_lsn"] - assert isinstance(lsn_str, str) - return Lsn(lsn_str) - - -def wait_for_last_record_lsn( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, - timeline: TimelineId, - lsn: Lsn, -) -> Lsn: - """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" - for i in range(10): - current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) - if 
current_lsn >= lsn: - return current_lsn - log.info( - "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 - ) - ) - time.sleep(1) - raise Exception( - "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) - ) - - def wait_for_last_flush_lsn( env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId ) -> Lsn: @@ -3592,23 +2914,3 @@ def wait_for_sk_commit_lsn_to_reach_remote_storage( ps_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(ps_http, tenant_id, timeline_id, lsn) return lsn - - -def wait_for_upload_queue_empty( - pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId -): - ps_http = pageserver.http_client() - while True: - all_metrics = ps_http.get_metrics() - tl = all_metrics.query_all( - "pageserver_remote_timeline_client_calls_unfinished", - { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - }, - ) - assert len(tl) > 0 - log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}") - if all(m.value == 0 for m in tl): - return - time.sleep(0.2) diff --git a/test_runner/fixtures/pageserver/__init__.py b/test_runner/fixtures/pageserver/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py new file mode 100644 index 0000000000..1e1effe295 --- /dev/null +++ b/test_runner/fixtures/pageserver/http.py @@ -0,0 +1,545 @@ +from __future__ import annotations + +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +import requests + +from fixtures.log_helper import log +from fixtures.metrics import Metrics, parse_metrics +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import Fn + + +class PageserverApiException(Exception): + def __init__(self, message, status_code: int): + super().__init__(message) + 
self.status_code = status_code + + +@dataclass +class InMemoryLayerInfo: + kind: str + lsn_start: str + lsn_end: Optional[str] + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo: + return InMemoryLayerInfo( + kind=d["kind"], + lsn_start=d["lsn_start"], + lsn_end=d.get("lsn_end"), + ) + + +@dataclass(frozen=True) +class HistoricLayerInfo: + kind: str + layer_file_name: str + layer_file_size: Optional[int] + lsn_start: str + lsn_end: Optional[str] + remote: bool + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: + return HistoricLayerInfo( + kind=d["kind"], + layer_file_name=d["layer_file_name"], + layer_file_size=d.get("layer_file_size"), + lsn_start=d["lsn_start"], + lsn_end=d.get("lsn_end"), + remote=d["remote"], + ) + + +@dataclass +class LayerMapInfo: + in_memory_layers: List[InMemoryLayerInfo] + historic_layers: List[HistoricLayerInfo] + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo: + info = LayerMapInfo(in_memory_layers=[], historic_layers=[]) + + json_in_memory_layers = d["in_memory_layers"] + assert isinstance(json_in_memory_layers, List) + for json_in_memory_layer in json_in_memory_layers: + info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer)) + + json_historic_layers = d["historic_layers"] + assert isinstance(json_historic_layers, List) + for json_historic_layer in json_historic_layers: + info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer)) + + return info + + def kind_count(self) -> Dict[str, int]: + counts: Dict[str, int] = defaultdict(int) + for inmem_layer in self.in_memory_layers: + counts[inmem_layer.kind] += 1 + for hist_layer in self.historic_layers: + counts[hist_layer.kind] += 1 + return counts + + +@dataclass +class TenantConfig: + tenant_specific_overrides: Dict[str, Any] + effective_config: Dict[str, Any] + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> TenantConfig: + return TenantConfig( + 
tenant_specific_overrides=d["tenant_specific_overrides"], + effective_config=d["effective_config"], + ) + + +class PageserverHttpClient(requests.Session): + def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): + super().__init__() + self.port = port + self.auth_token = auth_token + self.is_testing_enabled_or_skip = is_testing_enabled_or_skip + + if auth_token is not None: + self.headers["Authorization"] = f"Bearer {auth_token}" + + def verbose_error(self, res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = res.json()["msg"] + except: # noqa: E722 + msg = "" + raise PageserverApiException(msg, res.status_code) from e + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + + def tenant_list(self) -> List[Dict[Any, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: + res = self.post( + f"http://localhost:{self.port}/v1/tenant", + json={ + "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, + }, + ) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f"could not create tenant: already exists for id 
{new_tenant_id}") + new_tenant_id = res.json() + assert isinstance(new_tenant_id, str) + return TenantId(new_tenant_id) + + def tenant_attach(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") + self.verbose_error(res) + + def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): + params = {} + if detach_ignored: + params["detach_ignored"] = "true" + + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) + self.verbose_error(res) + + def tenant_load(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") + self.verbose_error(res) + + def tenant_ignore(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") + self.verbose_error(res) + + def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_config(self, tenant_id: TenantId) -> TenantConfig: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") + self.verbose_error(res) + return TenantConfig.from_json(res.json()) + + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): + assert "tenant_id" not in config.keys() + res = self.put( + f"http://localhost:{self.port}/v1/tenant/config", + json={**config, "tenant_id": str(tenant_id)}, + ) + self.verbose_error(res) + + def patch_tenant_config_client_side( + self, + tenant_id: TenantId, + inserts: Optional[Dict[str, Any]] = None, + removes: Optional[List[str]] = None, + ): + current = self.tenant_config(tenant_id).tenant_specific_overrides + if inserts is not None: + current.update(inserts) + if removes is not None: + for key in removes: + del current[key] + self.set_tenant_config(tenant_id, current) + + def tenant_size(self, 
tenant_id: TenantId) -> int: + return self.tenant_size_and_modelinputs(tenant_id)[0] + + def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]: + """ + Returns the tenant size, together with the model inputs as the second tuple item. + """ + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size") + self.verbose_error(res) + res = res.json() + assert isinstance(res, dict) + assert TenantId(res["id"]) == tenant_id + size = res["size"] + assert type(size) == int + inputs = res["inputs"] + assert type(inputs) is dict + return (size, inputs) + + def tenant_size_debug(self, tenant_id: TenantId) -> str: + """ + Returns the tenant size debug info, as an HTML string + """ + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size", + headers={"Accept": "text/html"}, + ) + return res.text + + def timeline_list( + self, + tenant_id: TenantId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + ) -> List[Dict[str, Any]]: + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "true" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "true" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def timeline_create( + self, + tenant_id: TenantId, + new_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: Optional[TimelineId] = None, + ancestor_start_lsn: Optional[Lsn] = None, + ) -> Dict[Any, Any]: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", + json={ + "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, + 
"ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, + }, + ) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") + + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_detail( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + **kwargs, + ) -> Dict[Any, Any]: + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "true" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "true" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + params=params, + **kwargs, + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def timeline_gc( + self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] + ) -> dict[str, Any]: + self.is_testing_enabled_or_skip() + + log.info( + f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", + json={"gc_horizon": gc_horizon}, + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): + self.is_testing_enabled_or_skip() + + 
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" + ) + log.info(f"Got compact request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def timeline_get_lsn_by_timestamp( + self, tenant_id: TenantId, timeline_id: TimelineId, timestamp + ): + log.info( + f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" + ) + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" + ) + log.info(f"Got checkpoint request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def timeline_spawn_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + max_concurrent_downloads: int, + ) -> dict[str, Any]: + body = { + "max_concurrent_downloads": max_concurrent_downloads, + } + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + json=body, + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_poll_download_remote_layers_status( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + spawn_response: dict[str, Any], + poll_state=None, + ) -> None | dict[str, Any]: + res = self.get( + 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + + # assumption in this API client here is that nobody else spawns the task + assert res_json["task_id"] == spawn_response["task_id"] + + if poll_state is None or res_json["state"] == poll_state: + return res_json + return None + + def timeline_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + max_concurrent_downloads: int, + errors_ok=False, + at_least_one_download=True, + ): + res = self.timeline_spawn_download_remote_layers( + tenant_id, timeline_id, max_concurrent_downloads + ) + while True: + completed = self.timeline_poll_download_remote_layers_status( + tenant_id, timeline_id, res, poll_state="Completed" + ) + if not completed: + time.sleep(0.1) + continue + if not errors_ok: + assert completed["failed_download_count"] == 0 + if at_least_one_download: + assert completed["successful_download_count"] > 0 + return completed + + def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" + res = self.get(f"http://localhost:{self.port}/metrics") + self.verbose_error(res) + return res.text + + def get_metrics(self) -> Metrics: + res = self.get_metrics_str() + return parse_metrics(res) + + def get_timeline_metric( + self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str + ) -> float: + metrics = self.get_metrics() + return metrics.query_one( + metric_name, + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ).value + + def get_remote_timeline_client_metric( + self, + metric_name: str, + tenant_id: TenantId, + timeline_id: TimelineId, + file_kind: str, + op_kind: str, + ) -> Optional[float]: + metrics = self.get_metrics() + matches = metrics.query_all( + name=metric_name, + filter={ + "tenant_id": str(tenant_id), + "timeline_id": 
str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ) + if len(matches) == 0: + value = None + elif len(matches) == 1: + value = matches[0].value + assert value is not None + else: + assert len(matches) < 2, "above filter should uniquely identify metric" + return value + + def get_metric_value( + self, name: str, filter: Optional[Dict[str, str]] = None + ) -> Optional[float]: + metrics = self.get_metrics() + results = metrics.query_all(name, filter=filter) + if not results: + log.info(f'could not find metric "{name}"') + return None + assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" + return results[0].value + + def layer_map_info( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> LayerMapInfo: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/", + ) + self.verbose_error(res) + return LayerMapInfo.from_json(res.json()) + + def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", + ) + self.verbose_error(res) + + assert res.status_code == 200 + + def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", + ) + self.verbose_error(res) + + assert res.status_code == 200 + + def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): + info = self.layer_map_info(tenant_id, timeline_id) + for layer in info.historic_layers: + self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + + def disk_usage_eviction_run(self, request: dict[str, Any]): + res = self.put( + f"http://localhost:{self.port}/v1/disk_usage_eviction/run", + json=request, + ) + self.verbose_error(res) + return res.json() + + def tenant_break(self, 
tenant_id: TenantId): + res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") + self.verbose_error(res) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py new file mode 100644 index 0000000000..65eda5b636 --- /dev/null +++ b/test_runner/fixtures/pageserver/utils.py @@ -0,0 +1,145 @@ +import time + +from fixtures.log_helper import log +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.types import Lsn, TenantId, TimelineId + + +def assert_tenant_status( + pageserver_http: PageserverHttpClient, tenant: TenantId, expected_status: str +): + tenant_status = pageserver_http.tenant_status(tenant) + log.info(f"tenant_status: {tenant_status}") + assert tenant_status["state"] == expected_status, tenant_status + + +def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId): + tenants = pageserver_http.tenant_list() + matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] + assert len(matching) < 2 + if len(matching) == 0: + return None + return matching[0] + + +def remote_consistent_lsn( + pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId +) -> Lsn: + detail = pageserver_http.timeline_detail(tenant, timeline) + + if detail["remote_consistent_lsn"] is None: + # No remote information at all. This happens right after creating + # a timeline, before any part of it has been uploaded to remote + # storage yet. 
+ return Lsn(0) + else: + lsn_str = detail["remote_consistent_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) + + +def wait_for_upload( + pageserver_http: PageserverHttpClient, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, +): + """waits for local timeline upload up to specified lsn""" + for i in range(20): + current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) + if current_lsn >= lsn: + log.info("wait finished") + return + log.info( + "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn, current_lsn, i + 1 + ) + ) + time.sleep(1) + raise Exception( + "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn, current_lsn + ) + ) + + +def wait_until_tenant_state( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + expected_state: str, + iterations: int, +) -> bool: + """ + Does not use `wait_until` for debugging purposes + """ + for _ in range(iterations): + try: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + log.debug(f"Tenant {tenant_id} data: {tenant}") + if tenant["state"] == expected_state: + return True + except Exception as e: + log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") + + time.sleep(1) + + raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") + + +def wait_until_tenant_active( + pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int = 30 +): + wait_until_tenant_state( + pageserver_http, tenant_id, expected_state="Active", iterations=iterations + ) + + +def last_record_lsn( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId +) -> Lsn: + detail = pageserver_http_client.timeline_detail(tenant, timeline) + + lsn_str = detail["last_record_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) + + +def wait_for_last_record_lsn( + pageserver_http: PageserverHttpClient, + tenant: TenantId, + timeline: TimelineId, 
+ lsn: Lsn, +) -> Lsn: + """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" + for i in range(10): + current_lsn = last_record_lsn(pageserver_http, tenant, timeline) + if current_lsn >= lsn: + return current_lsn + log.info( + "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn, current_lsn, i + 1 + ) + ) + time.sleep(1) + raise Exception( + "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) + ) + + +def wait_for_upload_queue_empty( + pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): + while True: + all_metrics = pageserver_http.get_metrics() + tl = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_unfinished", + { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ) + assert len(tl) > 0 + log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}") + if all(m.value == 0 for m in tl): + return + time.sleep(0.2) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index b58539ca86..71df74dfba 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -278,3 +278,19 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): continue return res raise Exception("timed out while waiting for %s" % func) from last_exception + + +def wait_while(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns false, or throws an exception. 
+ """ + for i in range(number_of_iterations): + try: + if not func(): + return + log.info("waiting for %s iteration %s failed", func, i + 1) + time.sleep(interval) + continue + except Exception: + return + raise Exception("timed out while waiting for %s" % func) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 4b109c150f..16c5438b8f 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -10,7 +10,7 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log -from fixtures.neon_fixtures import wait_for_last_record_lsn +from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index f3d153d934..f7c4736e04 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,7 +1,8 @@ from contextlib import closing import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PgProtocol +from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol +from fixtures.pageserver.http import PageserverApiException from fixtures.types import TenantId diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index be6e1a69b2..0cc111bd8c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -10,12 +10,11 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonCli, NeonEnvBuilder, - PageserverHttpClient, PgBin, PortDistributor, - wait_for_last_record_lsn, - wait_for_upload, ) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn from pytest import FixtureRequest 
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 6ed09734fe..413d6c9d5a 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -11,14 +11,14 @@ from fixtures.neon_fixtures import ( LocalFsStorage, NeonEnv, NeonEnvBuilder, - PageserverHttpClient, PgBin, RemoteStorageKind, wait_for_last_flush_lsn, - wait_for_upload_queue_empty, - wait_until, ) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -138,7 +138,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev # remove the initial tenant ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865 assert env.initial_timeline - wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, env.initial_timeline) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, env.initial_timeline) pageserver_http.tenant_detach(env.initial_tenant) assert isinstance(env.remote_storage, LocalFsStorage) tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant) @@ -182,7 +182,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev # after stopping the safekeepers, we know that no new WAL will be coming in for tenant_id, timeline_id in timelines: pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id) + wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] assert 
tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 1dc10fbf4f..774ed98563 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -13,9 +13,8 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, Postgres, - wait_for_last_record_lsn, - wait_for_upload, ) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import subprocess_capture diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 80e7ae8d7e..2d07d02ce7 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -6,10 +6,9 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, wait_for_last_flush_lsn, - wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, - wait_for_upload, ) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index d146f78c3a..cd481e69eb 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -5,8 +5,8 @@ from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, - PageserverHttpClient, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.types import TenantId, TimelineId diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index 73933021a4..aa37a2411c 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, 
PageserverHttpClient +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.pageserver.http import PageserverHttpClient def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index fd13651427..90ab8e68d8 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -10,20 +10,20 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, - PageserverApiException, - PageserverHttpClient, RemoteStorageKind, - assert_tenant_status, available_remote_storages, wait_for_last_flush_lsn, - wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import ( + assert_tenant_status, + wait_for_last_record_lsn, wait_for_upload, - wait_until, wait_until_tenant_state, ) from fixtures.types import Lsn -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id): diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index eb22ac5f99..5b05989ae4 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -6,8 +6,8 @@ from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, - PageserverHttpClient, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py index 1b00b272c2..be0eb76ccd 100644 --- a/test_runner/regress/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -1,6 +1,7 @@ from 
contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 7487757071..69d6e427ce 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn +from fixtures.neon_fixtures import NeonEnv +from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f6600e8974..222305f006 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -13,13 +13,15 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( LocalFsStorage, NeonEnvBuilder, - PageserverApiException, - PageserverHttpClient, RemoteStorageKind, available_remote_storages, wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, + wait_until_tenant_active, wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -172,15 +174,10 @@ def test_remote_storage_backup_and_restore( client.tenant_attach(tenant_id) log.info("waiting for tenant to become active. 
this should be quick with on-demand download") - def tenant_active(): - all_states = client.tenant_list() - [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["state"] == "Active" - - wait_until( - number_of_iterations=5, - interval=1, - func=tenant_active, + wait_until_tenant_active( + pageserver_http=client, + tenant_id=tenant_id, + iterations=5, ) detail = client.timeline_detail(tenant_id, timeline_id) @@ -357,12 +354,7 @@ def test_remote_storage_upload_queue_retries( client.tenant_attach(tenant_id) - def tenant_active(): - all_states = client.tenant_list() - [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["state"] == "Active" - - wait_until(30, 1, tenant_active) + wait_until_tenant_active(client, tenant_id) log.info("restarting postgres to validate") pg = env.postgres.create_start("main", tenant_id=tenant_id) @@ -497,12 +489,7 @@ def test_remote_timeline_client_calls_started_metric( client.tenant_attach(tenant_id) - def tenant_active(): - all_states = client.tenant_list() - [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["state"] == "Active" - - wait_until(30, 1, tenant_active) + wait_until_tenant_active(client, tenant_id) log.info("restarting postgres to validate") pg = env.postgres.create_start("main", tenant_id=tenant_id) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index c5f9a3d157..67aba227e5 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -6,9 +6,8 @@ from fixtures.neon_fixtures import ( LocalFsStorage, NeonEnvBuilder, RemoteStorageKind, - assert_tenant_status, - wait_for_upload, ) +from fixtures.pageserver.utils import assert_tenant_status, wait_for_upload from fixtures.types import Lsn from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 
5db79eef4a..58a010951e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -9,18 +9,18 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - PageserverApiException, - PageserverHttpClient, Postgres, RemoteStorageKind, available_remote_storages, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, - wait_until, wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until def do_gc_target( diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index aaf33c0d59..8ad4bd1c11 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -10,18 +10,24 @@ from fixtures.neon_fixtures import ( NeonBroker, NeonEnv, NeonEnvBuilder, - PageserverHttpClient, PortDistributor, Postgres, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import ( assert_tenant_status, tenant_exists, wait_for_last_record_lsn, wait_for_upload, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import ( + query_scalar, + start_in_background, + subprocess_capture, wait_until, wait_while, ) -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar, start_in_background, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index a4b5f7739a..9037fe0045 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -6,11 +6,11 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( 
NeonEnv, NeonEnvBuilder, - PageserverHttpClient, Postgres, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.types import Lsn, TenantId, TimelineId diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index c786f8a8e1..ec1c12a0d8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -20,10 +20,12 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, Postgres, RemoteStorageKind, - assert_tenant_status, available_remote_storages, - wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, +) +from fixtures.pageserver.utils import ( + assert_tenant_status, + wait_for_last_record_lsn, wait_for_upload, ) from fixtures.types import Lsn, TenantId, TimelineId diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 93fafff934..cf607f4f7b 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,5 +1,6 @@ import pytest -from fixtures.neon_fixtures import NeonEnv, PageserverApiException +from fixtures.neon_fixtures import NeonEnv +from fixtures.pageserver.http import PageserverApiException from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index c4e8e7aa07..7c77e1fe59 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -14,20 +14,21 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - PageserverApiException, - PageserverHttpClient, PgBin, PortDistributor, Postgres, RemoteStorageKind, VanillaPostgres, - assert_tenant_status, wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import 
PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import ( + assert_tenant_status, wait_for_upload_queue_empty, - wait_until, + wait_until_tenant_active, ) from fixtures.types import TenantId, TimelineId -from fixtures.utils import get_timeline_dir_size +from fixtures.utils import get_timeline_dir_size, wait_until def test_timeline_size(neon_simple_env: NeonEnv): @@ -246,12 +247,7 @@ def test_timeline_initial_logical_size_calculation_cancellation( extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} ) - def tenant_active(): - all_states = client.tenant_list() - [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["state"] == "Active" - - wait_until(30, 1, tenant_active) + wait_until_tenant_active(client, tenant_id) # kick off initial size calculation task (the response we get here is the estimated size) def assert_size_calculation_not_done(): @@ -425,7 +421,7 @@ def test_timeline_physical_size_post_compaction( pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) if remote_storage_kind is not None: - wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id) assert_physical_size_invariants( get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), @@ -478,7 +474,7 @@ def test_timeline_physical_size_post_gc( pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) if remote_storage_kind is not None: - wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id) assert_physical_size_invariants( get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), @@ -584,7 +580,7 @@ def test_tenant_physical_size( tenant, timeline = env.neon_cli.create_tenant() if remote_storage_kind is not 
None: - wait_for_upload_queue_empty(env.pageserver, tenant, timeline) + wait_for_upload_queue_empty(pageserver_http, tenant, timeline) def get_timeline_resident_physical_size(timeline: TimelineId): sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind) @@ -609,7 +605,7 @@ def test_tenant_physical_size( pageserver_http.timeline_checkpoint(tenant, timeline) if remote_storage_kind is not None: - wait_for_upload_queue_empty(env.pageserver, tenant, timeline) + wait_for_upload_queue_empty(pageserver_http, tenant, timeline) timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 407085a01a..306c492e8f 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -30,9 +30,8 @@ from fixtures.neon_fixtures import ( SafekeeperHttpClient, SafekeeperPort, available_remote_storages, - wait_for_last_record_lsn, - wait_for_upload, ) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 395d54b8c3..d6302f8632 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -3,7 +3,8 @@ import time import psutil import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pageserver.http import PageserverApiException from fixtures.types import TenantId From bfee4127014022a43bd85bccb562ed4bc62dc075 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 7 Apr 2023 14:26:21 +0300 
Subject: [PATCH 246/426] Trigger tests for index scan implementation (#3968) ## Describe your changes ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 757df1dab8..3e70693c91 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 757df1dab82f69bdf69469119420a0bbb307f992 +Subproject commit 3e70693c9178878404d14a61c96b15b74eb02688 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f8a650e49b..4ad87b0f36 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f8a650e49b06d39ad131b860117504044b01f312 +Subproject commit 4ad87b0f364a2313600c1d9774ca33df00e606f4 From 979fa8b1ba8cf5d33f3fe33f9b448c62ea755777 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 14 Feb 2023 09:58:06 +0100 Subject: [PATCH 247/426] Compile timescaledb --- Dockerfile.compute-node | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 92a1bb69e5..48d49a60f3 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -301,6 +301,27 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control 
+######################################################################################### +# +# Layer "timescaledb-pg-build" +# compile timescaledb extension +# +######################################################################################### +FROM build-deps AS timescaledb-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin:$PATH" + +RUN apt-get update && \ + apt-get install -y cmake && \ + wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ + mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ + ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOLL=OFF -DAPACHE_ONLY:BOOL=ON && \ + cd build && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control + ######################################################################################### # # Layer "rust extensions" @@ -405,6 +426,7 @@ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 31f2cdeb1ec6fb66d26dc40a7c71f4b47dd8b31f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 31 Mar 2023 14:46:38 +0200 Subject: [PATCH 248/426] Update Dockerfile.compute-node Co-authored-by: MMeent --- Dockerfile.compute-node | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 48d49a60f3..3473487444 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -316,7 +316,7 @@ RUN apt-get update && \ apt-get 
install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ - ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOLL=OFF -DAPACHE_ONLY:BOOL=ON && \ + ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ From 0bf70e113f284a553fa5b3dbaf1162ebd7b4b8e9 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 7 Apr 2023 15:03:13 +0300 Subject: [PATCH 249/426] Add extra cnames to staging proxy --- .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 40814e55c9..2a8f028f3b 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -23,6 +23,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.us-east-2.aws.neon.build" + extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"] sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" From dec58092e8b7c20e743a584f3e6fa8aaca73c988 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 7 Apr 2023 21:39:49 +0300 Subject: [PATCH 250/426] Replace Box with impl in RemoteStorage upload (#3984) Replaces `Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>` with `impl io::AsyncRead + Unpin + Send + Sync + 'static` usages in the `RemoteStorage` interface, to make it closer to 
[`#![feature(async_fn_in_trait)]`](https://blog.rust-lang.org/inside-rust/2022/11/17/async-fn-in-trait-nightly.html) For `GenericRemoteStorage`, replaces `type Target = dyn RemoteStorage` with another impl with `RemoteStorage` methods inside it. We could reuse the trait, but that would require importing the trait in every file where it's used and would move us farther from the unstable feature. After this PR, I've managed to create a patch with the changes: https://github.com/neondatabase/neon/compare/kb/less-dyn-storage...kb/nightly-async-trait?expand=1 Current rust implementation does not like recursive async trait calls, so `UnreliableWrapper` was removed: it contained a `GenericRemoteStorage` that implemented the `RemoteStorage` trait, and itself implemented the trait, which nightly rustc did not like and proposed to box the future. Similarly, `GenericRemoteStorage` cannot implement `RemoteStorage` for nightly rustc to work, since it calls various remote storages' methods from inside. I've compiled current `main` and the nightly branch both with `time env RUSTC_WRAPPER="" cargo +nightly build --all --timings` command, and got ``` Finished dev [optimized + debuginfo] target(s) in 2m 04s env RUSTC_WRAPPER="" cargo +nightly build --all --timings 1283.19s user 50.40s system 1074% cpu 2:04.15 total for the new feature tried and Finished dev [optimized + debuginfo] target(s) in 2m 40s env RUSTC_WRAPPER="" cargo +nightly build --all --timings 1288.59s user 52.06s system 834% cpu 2:40.71 total for the old async_trait approach. ``` On my machine, the `remote_storage` lib compilation takes ~10 less time with the nightly feature (left) than the regular main (right). 
![image](https://user-images.githubusercontent.com/2690773/230620797-163d8b89-dac8-4366-bcf6-cd1cdddcd22c.png) Full cargo reports are available at [timings.zip](https://github.com/neondatabase/neon/files/11179369/timings.zip) --- libs/remote_storage/src/lib.rs | 72 +++++++++++++++++--- libs/remote_storage/src/local_fs.rs | 2 +- libs/remote_storage/src/s3_bucket.rs | 2 +- libs/remote_storage/src/simulate_failures.rs | 2 +- 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 5b74308514..e0cc3ca543 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -13,7 +13,6 @@ use std::{ collections::HashMap, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, - ops::Deref, path::{Path, PathBuf}, pin::Pin, sync::Arc, @@ -90,7 +89,7 @@ pub trait RemoteStorage: Send + Sync + 'static { /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, - data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. 
data_size_bytes: usize, @@ -161,14 +160,67 @@ pub enum GenericRemoteStorage { Unreliable(Arc), } -impl Deref for GenericRemoteStorage { - type Target = dyn RemoteStorage; - - fn deref(&self) -> &Self::Target { +impl GenericRemoteStorage { + pub async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { match self { - GenericRemoteStorage::LocalFs(local_fs) => local_fs, - GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(), - GenericRemoteStorage::Unreliable(s) => s.as_ref(), + Self::LocalFs(s) => s.list_prefixes(prefix).await, + Self::AwsS3(s) => s.list_prefixes(prefix).await, + Self::Unreliable(s) => s.list_prefixes(prefix).await, + } + } + + pub async fn upload( + &self, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + ) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await, + } + } + + pub async fn download(&self, from: &RemotePath) -> Result { + match self { + Self::LocalFs(s) => s.download(from).await, + Self::AwsS3(s) => s.download(from).await, + Self::Unreliable(s) => s.download(from).await, + } + } + + pub async fn download_byte_range( + &self, + from: &RemotePath, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + match self { + Self::LocalFs(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } + Self::AwsS3(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } + Self::Unreliable(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } + } + } + + pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete(path).await, + Self::AwsS3(s) => s.delete(path).await, + 
Self::Unreliable(s) => s.delete(path).await, } } } @@ -199,7 +251,7 @@ impl GenericRemoteStorage { /// this path is used for the remote object id conversion only. pub async fn upload_storage_object( &self, - from: Box, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, ) -> anyhow::Result<()> { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 21a4156ad3..d7b46731cd 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -118,7 +118,7 @@ impl RemoteStorage for LocalFs { async fn upload( &self, - data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + data: impl io::AsyncRead + Unpin + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index fdf3ae02d3..e6c1e19ad5 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -343,7 +343,7 @@ impl RemoteStorage for S3Bucket { async fn upload( &self, - from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, metadata: Option, diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index d1d062f8e7..cb40859831 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -84,7 +84,7 @@ impl RemoteStorage for UnreliableWrapper { async fn upload( &self, - data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>, + data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. 
data_size_bytes: usize, From 818e341af0918d5ab78313a9af7fb408fa6a9e55 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sun, 9 Apr 2023 12:52:49 +0100 Subject: [PATCH 251/426] Nightly Benchmarks: replace neon-captest-prefetch with -new/-reuse (#3970) We have enabled prefetch by default, let's use this in Nightly Benchmarks: - effective_io_concurrency=100 by default (instead of 32) - maintenance_io_concurrency=100 by default (instead of 32) Rename `neon-captest-prefetch` to `neon-captest-new` (for pgbench with initialisation) and `neon-captest-reuse` (for OLAP scenarios) --- .github/workflows/benchmarking.yml | 79 +++++++----------------------- 1 file changed, 17 insertions(+), 62 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 425d4d76c9..2aeea6eca4 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -114,17 +114,16 @@ jobs: # neon-captest-freetier: Run pgbench with freetier-limited compute # neon-captest-new: Run pgbench in a freshly created project # neon-captest-reuse: Same, but reusing existing project - # neon-captest-prefetch: Same, with prefetching enabled (new project) # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ] + platform: [ neon-captest-reuse, neon-captest-new, rds-postgres ] db_size: [ 10gb ] runner: [ us-east-2 ] include: - platform: neon-captest-freetier db_size: 3gb runner: us-east-2 - - platform: neon-captest-prefetch + - platform: neon-captest-new db_size: 50gb runner: us-east-2 - platform: rds-aurora @@ -164,7 +163,7 @@ jobs: echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch", "neon-captest-freetier"]'), matrix.platform) + if: 
contains(fromJson('["neon-captest-new", "neon-captest-freetier"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -180,7 +179,7 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new | neon-captest-prefetch | neon-captest-freetier) + neon-captest-new | neon-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -190,7 +189,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', neon-captest-freetier, 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -199,17 +198,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -286,10 +274,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled + # neon-captest-reuse: We have pre-created projects 1 CU # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + platform: [ 
neon-captest-reuse, rds-postgres, rds-aurora ] env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -325,7 +313,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; rds-aurora) @@ -335,7 +323,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -344,17 +332,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -397,10 +374,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled + # neon-captest-reuse: We have pre-created projects 1 CU # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -436,7 +413,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ 
secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }} ;; rds-aurora) @@ -446,7 +423,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -455,17 +432,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -502,10 +468,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled + # neon-captest-reuse: We have pre-created projects with 1 CU # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -541,7 +507,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; rds-aurora) @@ -551,7 +517,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -560,17 +526,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Run user examples uses: ./.github/actions/run-python-test-set with: From f0b2e076d9beb3601049388dc2d7d94ef7b68f23 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 9 Apr 2023 21:52:28 +0300 Subject: [PATCH 252/426] Move compute_ctl structs used in HTTP API and spec file to separate crate. This is in preparation of using compute_ctl to launch postgres nodes in the neon_local control plane. And seems like a good idea to separate the public interfaces anyway. One non-mechanical change here is that the 'metrics' field is moved under the Mutex, instead of using atomics. We were not using atomics for performance but for convenience here, and it seems more clear to not use atomics in the model for the HTTP response type. 
--- Cargo.lock | 13 +++ Cargo.toml | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 5 +- compute_tools/src/compute.rs | 81 +++++----------- compute_tools/src/config.rs | 2 +- compute_tools/src/http/api.rs | 26 +++-- compute_tools/src/http/mod.rs | 2 - compute_tools/src/http/responses.rs | 40 -------- compute_tools/src/pg_helpers.rs | 78 +++++---------- compute_tools/src/spec.rs | 43 +-------- compute_tools/tests/pg_helpers_tests.rs | 7 +- libs/compute_api/Cargo.toml | 14 +++ libs/compute_api/src/lib.rs | 3 + .../http => libs/compute_api/src}/requests.rs | 5 +- libs/compute_api/src/responses.rs | 66 +++++++++++++ libs/compute_api/src/spec.rs | 94 +++++++++++++++++++ .../compute_api}/tests/cluster_spec.json | 0 18 files changed, 271 insertions(+), 210 deletions(-) delete mode 100644 compute_tools/src/http/responses.rs create mode 100644 libs/compute_api/Cargo.toml create mode 100644 libs/compute_api/src/lib.rs rename {compute_tools/src/http => libs/compute_api/src}/requests.rs (76%) create mode 100644 libs/compute_api/src/responses.rs create mode 100644 libs/compute_api/src/spec.rs rename {compute_tools => libs/compute_api}/tests/cluster_spec.json (100%) diff --git a/Cargo.lock b/Cargo.lock index 4590e76014..c5b64b235a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -841,6 +841,18 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "compute_api" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "serde", + "serde_json", + "serde_with", + "workspace_hack", +] + [[package]] name = "compute_tools" version = "0.1.0" @@ -848,6 +860,7 @@ dependencies = [ "anyhow", "chrono", "clap 4.1.4", + "compute_api", "futures", "hyper", "notify", diff --git a/Cargo.toml b/Cargo.toml index 09cc150606..d563324c86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -132,6 +132,7 @@ tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df6 heapless = { default-features=false, features=[], git = 
"https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending ## Local libraries +compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 59433535f1..f315d2b7d9 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,4 +27,5 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true url.workspace = true +compute_api.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 1a3ac77af4..d61eae5f7a 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -43,7 +43,9 @@ use clap::Arg; use tracing::{error, info}; use url::Url; -use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; +use compute_api::responses::ComputeStatus; + +use compute_tools::compute::{ComputeNode, ComputeState}; use compute_tools::http::api::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; @@ -116,7 +118,6 @@ fn main() -> Result<()> { pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), live_config_allowed, - metrics: ComputeMetrics::default(), state: Mutex::new(new_state), state_changed: Condvar::new(), }; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 3e92ec57dc..689aa6ef43 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -19,16 +19,17 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Condvar, Mutex}; use anyhow::{Context, Result}; 
use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use serde::Serialize; use tokio_postgres; use tracing::{info, instrument, warn}; +use compute_api::responses::{ComputeMetrics, ComputeStatus}; +use compute_api::spec::ComputeSpec; + use crate::checker::create_writability_check_data; use crate::config; use crate::pg_helpers::*; @@ -41,7 +42,6 @@ pub struct ComputeNode { pub connstr: url::Url, pub pgdata: String, pub pgbin: String, - pub metrics: ComputeMetrics, /// We should only allow live re- / configuration of the compute node if /// it uses 'pull model', i.e. it can go to control-plane and fetch /// the latest configuration. Otherwise, there could be a case: @@ -74,6 +74,8 @@ pub struct ComputeState { pub timeline: String, pub pageserver_connstr: String, pub storage_auth_token: Option, + + pub metrics: ComputeMetrics, } impl ComputeState { @@ -87,6 +89,7 @@ impl ComputeState { timeline: String::new(), pageserver_connstr: String::new(), storage_auth_token: None, + metrics: ComputeMetrics::default(), } } } @@ -97,33 +100,6 @@ impl Default for ComputeState { } } -#[derive(Serialize, Clone, Copy, PartialEq, Eq, Debug)] -#[serde(rename_all = "snake_case")] -pub enum ComputeStatus { - // Spec wasn't provided at start, waiting for it to be - // provided by control-plane. - Empty, - // Compute configuration was requested. - ConfigurationPending, - // Compute node has spec and initial startup and - // configuration is in progress. - Init, - // Compute is configured and running. - Running, - // Either startup or configuration failed, - // compute will exit soon or is waiting for - // control-plane to terminate it. 
- Failed, -} - -#[derive(Default, Serialize)] -pub struct ComputeMetrics { - pub sync_safekeepers_ms: AtomicU64, - pub basebackup_ms: AtomicU64, - pub config_ms: AtomicU64, - pub total_startup_ms: AtomicU64, -} - impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { let mut state = self.state.lock().unwrap(); @@ -185,15 +161,11 @@ impl ComputeNode { ar.set_ignore_zeros(true); ar.unpack(&self.pgdata)?; - self.metrics.basebackup_ms.store( - Utc::now() - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - + self.state.lock().unwrap().metrics.basebackup_ms = Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; Ok(()) } @@ -231,14 +203,11 @@ impl ComputeNode { ); } - self.metrics.sync_safekeepers_ms.store( - Utc::now() - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); + self.state.lock().unwrap().metrics.sync_safekeepers_ms = Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); @@ -375,23 +344,19 @@ impl ComputeNode { self.apply_config(&compute_state)?; let startup_end_time = Utc::now(); - self.metrics.config_ms.store( - startup_end_time + { + let mut state = self.state.lock().unwrap(); + state.metrics.config_ms = startup_end_time .signed_duration_since(start_time) .to_std() .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - self.metrics.total_startup_ms.store( - startup_end_time + .as_millis() as u64; + state.metrics.total_startup_ms = startup_end_time .signed_duration_since(self.start_time) .to_std() .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - + .as_millis() as u64; + } self.set_status(ComputeStatus::Running); Ok(pg) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 6cbd0e3d4c..d25eb9b2fc 100644 --- a/compute_tools/src/config.rs +++ 
b/compute_tools/src/config.rs @@ -6,7 +6,7 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::PgOptionsSerialize; -use crate::spec::ComputeSpec; +use compute_api::spec::ComputeSpec; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 8620b10636..cea45dc596 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,9 +3,9 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; -use crate::compute::{ComputeNode, ComputeStatus}; -use crate::http::requests::ConfigurationRequest; -use crate::http::responses::{ComputeStatusResponse, GenericAPIError}; +use crate::compute::{ComputeNode, ComputeState}; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; @@ -16,6 +16,16 @@ use tokio::task; use tracing::{error, info}; use tracing_utils::http::OtelName; +fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { + ComputeStatusResponse { + tenant: state.tenant.clone(), + timeline: state.timeline.clone(), + status: state.status, + last_active: state.last_active, + error: state.error.clone(), + } +} + // Service function to handle all available routes. 
async fn routes(req: Request, compute: &Arc) -> Response { // @@ -28,8 +38,7 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /status GET request"); let state = compute.state.lock().unwrap(); - let status_response = ComputeStatusResponse::from(state.clone()); - + let status_response = status_response_from_state(&state); Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) } @@ -37,7 +46,8 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /metrics.json GET request"); - Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) + let metrics = compute.state.lock().unwrap().metrics.clone(); + Response::new(Body::from(serde_json::to_string(&metrics).unwrap())) } // Collect Postgres current usage insights @@ -162,7 +172,7 @@ async fn handle_configure_request( ); if state.status == ComputeStatus::Failed { - let err = state.error.clone().unwrap_or("unknown error".to_string()); + let err = state.error.as_ref().map_or("unknown error", |x| x); let msg = format!("compute configuration failed: {:?}", err); return Err((msg, StatusCode::INTERNAL_SERVER_ERROR)); } @@ -175,7 +185,7 @@ async fn handle_configure_request( // Return current compute state if everything went well. 
let state = compute.state.lock().unwrap().clone(); - let status_response = ComputeStatusResponse::from(state); + let status_response = status_response_from_state(&state); Ok(serde_json::to_string(&status_response).unwrap()) } else { Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST)) diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index e54b4e3341..e5fdf85eed 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1,3 +1 @@ pub mod api; -pub mod requests; -pub mod responses; diff --git a/compute_tools/src/http/responses.rs b/compute_tools/src/http/responses.rs deleted file mode 100644 index 1ef4b380a9..0000000000 --- a/compute_tools/src/http/responses.rs +++ /dev/null @@ -1,40 +0,0 @@ -use serde::{Serialize, Serializer}; - -use chrono::{DateTime, Utc}; - -use crate::compute::{ComputeState, ComputeStatus}; - -#[derive(Serialize, Debug)] -pub struct GenericAPIError { - pub error: String, -} - -#[derive(Serialize, Debug)] -#[serde(rename_all = "snake_case")] -pub struct ComputeStatusResponse { - pub tenant: String, - pub timeline: String, - pub status: ComputeStatus, - #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, - pub error: Option, -} - -impl From for ComputeStatusResponse { - fn from(state: ComputeState) -> Self { - ComputeStatusResponse { - tenant: state.tenant, - timeline: state.timeline, - status: state.status, - last_active: state.last_active, - error: state.error, - } - } -} - -fn rfc3339_serialize(x: &DateTime, s: S) -> Result -where - S: Serializer, -{ - x.to_rfc3339().serialize(s) -} diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 38d1a6d777..bb787d0506 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -10,43 +10,12 @@ use std::time::{Duration, Instant}; use anyhow::{bail, Result}; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; -use serde::Deserialize; use 
tracing::{debug, instrument}; +use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; + const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds -/// Rust representation of Postgres role info with only those fields -/// that matter for us. -#[derive(Clone, Deserialize, Debug)] -pub struct Role { - pub name: PgIdent, - pub encrypted_password: Option, - pub options: GenericOptions, -} - -/// Rust representation of Postgres database info with only those fields -/// that matter for us. -#[derive(Clone, Deserialize, Debug)] -pub struct Database { - pub name: PgIdent, - pub owner: PgIdent, - pub options: GenericOptions, -} - -/// Common type representing both SQL statement params with or without value, -/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config -/// options like `wal_level = logical`. -#[derive(Clone, Deserialize, Debug)] -pub struct GenericOption { - pub name: String, - pub value: Option, - pub vartype: String, -} - -/// Optional collection of `GenericOption`'s. Type alias allows us to -/// declare a `trait` on it. -pub type GenericOptions = Option>; - /// Escape a string for including it in a SQL literal fn escape_literal(s: &str) -> String { s.replace('\'', "''").replace('\\', "\\\\") @@ -58,9 +27,14 @@ fn escape_conf_value(s: &str) -> String { s.replace('\'', "''").replace('\\', "\\\\") } -impl GenericOption { +trait GenericOptionExt { + fn to_pg_option(&self) -> String; + fn to_pg_setting(&self) -> String; +} + +impl GenericOptionExt for GenericOption { /// Represent `GenericOption` as SQL statement parameter. - pub fn to_pg_option(&self) -> String { + fn to_pg_option(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { "string" => format!("{} '{}'", self.name, escape_literal(val)), @@ -72,7 +46,7 @@ impl GenericOption { } /// Represent `GenericOption` as configuration option. 
- pub fn to_pg_setting(&self) -> String { + fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { "string" => format!("{} = '{}'", self.name, escape_conf_value(val)), @@ -131,10 +105,14 @@ impl GenericOptionsSearch for GenericOptions { } } -impl Role { +pub trait RoleExt { + fn to_pg_options(&self) -> String; +} + +impl RoleExt for Role { /// Serialize a list of role parameters into a Postgres-acceptable /// string of arguments. - pub fn to_pg_options(&self) -> String { + fn to_pg_options(&self) -> String { // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. // For now, we do not use generic `options` for roles. Once used, add // `self.options.as_pg_options()` somewhere here. @@ -159,21 +137,17 @@ impl Role { } } -impl Database { - pub fn new(name: PgIdent, owner: PgIdent) -> Self { - Self { - name, - owner, - options: None, - } - } +pub trait DatabaseExt { + fn to_pg_options(&self) -> String; +} +impl DatabaseExt for Database { /// Serialize a list of database parameters into a Postgres-acceptable /// string of arguments. /// NB: `TEMPLATE` is actually also an identifier, but so far we only need /// to use `template0` and `template1`, so it is not a problem. Yet in the future /// it may require a proper quoting too. - pub fn to_pg_options(&self) -> String { + fn to_pg_options(&self) -> String { let mut params: String = self.options.as_pg_options(); write!(params, " OWNER {}", &self.owner.pg_quote()) .expect("String is documented to not to error during write operations"); @@ -182,10 +156,6 @@ impl Database { } } -/// String type alias representing Postgres identifier and -/// intended to be used for DB / role names. -pub type PgIdent = String; - /// Generic trait used to provide quoting / encoding for strings used in the /// Postgres SQL queries and DATABASE_URL. pub trait Escaping { @@ -226,7 +196,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { &[], )? 
.iter() - .map(|row| Database::new(row.get("datname"), row.get("owner"))) + .map(|row| Database { + name: row.get("datname"), + owner: row.get("owner"), + options: None, + }) .collect(); Ok(postgres_dbs) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index b7f15a99d1..2350113c39 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,57 +1,16 @@ -use std::collections::HashMap; use std::path::Path; use std::str::FromStr; use anyhow::Result; use postgres::config::Config; use postgres::{Client, NoTls}; -use serde::Deserialize; use tracing::{info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -/// Cluster spec or configuration represented as an optional number of -/// delta operations + final cluster state description. -#[derive(Clone, Deserialize, Debug, Default)] -pub struct ComputeSpec { - pub format_version: f32, - pub timestamp: String, - pub operation_uuid: Option, - /// Expected cluster state at the end of transition process. - pub cluster: Cluster, - pub delta_operations: Option>, - - pub storage_auth_token: Option, - - pub startup_tracing_context: Option>, -} - -/// Cluster state seen from the perspective of the external tools -/// like Rails web console. -#[derive(Clone, Deserialize, Debug, Default)] -pub struct Cluster { - pub cluster_id: String, - pub name: String, - pub state: Option, - pub roles: Vec, - pub databases: Vec, - pub settings: GenericOptions, -} - -/// Single cluster state changing operation that could not be represented as -/// a static `Cluster` structure. 
For example: -/// - DROP DATABASE -/// - DROP ROLE -/// - ALTER ROLE name RENAME TO new_name -/// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Deserialize, Debug)] -pub struct DeltaOp { - pub action: String, - pub name: PgIdent, - pub new_name: Option, -} +use compute_api::spec::{ComputeSpec, Database, PgIdent, Role}; /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` /// env variable is set, it will be used for authorization. diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index f48211f7ed..a63ee038c7 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -1,14 +1,13 @@ #[cfg(test)] mod pg_helpers_tests { - use std::fs::File; + use compute_api::spec::{ComputeSpec, GenericOption, GenericOptions, PgIdent}; use compute_tools::pg_helpers::*; - use compute_tools::spec::ComputeSpec; #[test] fn params_serialize() { - let file = File::open("tests/cluster_spec.json").unwrap(); + let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( @@ -23,7 +22,7 @@ mod pg_helpers_tests { #[test] fn settings_serialize() { - let file = File::open("tests/cluster_spec.json").unwrap(); + let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml new file mode 100644 index 0000000000..533a091207 --- /dev/null +++ b/libs/compute_api/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "compute_api" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +serde.workspace = true +serde_with.workspace = true +serde_json.workspace = true + +workspace_hack.workspace = true diff --git a/libs/compute_api/src/lib.rs 
b/libs/compute_api/src/lib.rs new file mode 100644 index 0000000000..b660799ec0 --- /dev/null +++ b/libs/compute_api/src/lib.rs @@ -0,0 +1,3 @@ +pub mod requests; +pub mod responses; +pub mod spec; diff --git a/compute_tools/src/http/requests.rs b/libs/compute_api/src/requests.rs similarity index 76% rename from compute_tools/src/http/requests.rs rename to libs/compute_api/src/requests.rs index 2e41c7aea4..5896c7dc65 100644 --- a/compute_tools/src/http/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,7 +1,10 @@ -use serde::Deserialize; +//! Structs representing the JSON formats used in the compute_ctl's HTTP API. use crate::spec::ComputeSpec; +use serde::Deserialize; +/// Request of the /configure API +/// /// We now pass only `spec` in the configuration request, but later we can /// extend it and something like `restart: bool` or something else. So put /// `spec` into a struct initially to be more flexible in the future. diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs new file mode 100644 index 0000000000..43289a5e3e --- /dev/null +++ b/libs/compute_api/src/responses.rs @@ -0,0 +1,66 @@ +//! Structs representing the JSON formats used in the compute_ctl's HTTP API. 
+ +use chrono::{DateTime, Utc}; +use serde::{Serialize, Serializer}; + +#[derive(Serialize, Debug)] +pub struct GenericAPIError { + pub error: String, +} + +/// Response of the /status API +#[derive(Serialize, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ComputeStatusResponse { + pub tenant: String, + pub timeline: String, + pub status: ComputeStatus, + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub struct ComputeState { + pub status: ComputeStatus, + /// Timestamp of the last Postgres activity + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputeStatus { + // Spec wasn't provided at start, waiting for it to be + // provided by control-plane. + Empty, + // Compute configuration was requested. + ConfigurationPending, + // Compute node has spec and initial startup and + // configuration is in progress. + Init, + // Compute is configured and running. + Running, + // Either startup or configuration failed, + // compute will exit soon or is waiting for + // control-plane to terminate it. + Failed, +} + +fn rfc3339_serialize(x: &DateTime, s: S) -> Result +where + S: Serializer, +{ + x.to_rfc3339().serialize(s) +} + +/// Response of the /metrics.json API +#[derive(Clone, Debug, Default, Serialize)] +pub struct ComputeMetrics { + pub sync_safekeepers_ms: u64, + pub basebackup_ms: u64, + pub config_ms: u64, + pub total_startup_ms: u64, +} diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs new file mode 100644 index 0000000000..37fe133b68 --- /dev/null +++ b/libs/compute_api/src/spec.rs @@ -0,0 +1,94 @@ +//! `ComputeSpec` represents the contents of the spec.json file. +//! +//! The spec.json file is used to pass information to 'compute_ctl'. It contains +//! 
all the information needed to start up the right version of PostgreSQL, +//! and connect it to the storage nodes. +use serde::Deserialize; +use std::collections::HashMap; + +/// String type alias representing Postgres identifier and +/// intended to be used for DB / role names. +pub type PgIdent = String; + +/// Cluster spec or configuration represented as an optional number of +/// delta operations + final cluster state description. +#[derive(Clone, Debug, Default, Deserialize)] +pub struct ComputeSpec { + pub format_version: f32, + pub timestamp: String, + pub operation_uuid: Option, + /// Expected cluster state at the end of transition process. + pub cluster: Cluster, + pub delta_operations: Option>, + + pub storage_auth_token: Option, + + pub startup_tracing_context: Option>, +} + +#[derive(Clone, Debug, Default, Deserialize)] +pub struct Cluster { + pub cluster_id: String, + pub name: String, + pub state: Option, + pub roles: Vec, + pub databases: Vec, + pub settings: GenericOptions, +} + +/// Single cluster state changing operation that could not be represented as +/// a static `Cluster` structure. For example: +/// - DROP DATABASE +/// - DROP ROLE +/// - ALTER ROLE name RENAME TO new_name +/// - ALTER DATABASE name RENAME TO new_name +#[derive(Clone, Debug, Deserialize)] +pub struct DeltaOp { + pub action: String, + pub name: PgIdent, + pub new_name: Option, +} + +/// Rust representation of Postgres role info with only those fields +/// that matter for us. +#[derive(Clone, Debug, Deserialize)] +pub struct Role { + pub name: PgIdent, + pub encrypted_password: Option, + pub options: GenericOptions, +} + +/// Rust representation of Postgres database info with only those fields +/// that matter for us. 
+#[derive(Clone, Debug, Deserialize)] +pub struct Database { + pub name: PgIdent, + pub owner: PgIdent, + pub options: GenericOptions, +} + +/// Common type representing both SQL statement params with or without value, +/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config +/// options like `wal_level = logical`. +#[derive(Clone, Debug, Deserialize)] +pub struct GenericOption { + pub name: String, + pub value: Option, + pub vartype: String, +} + +/// Optional collection of `GenericOption`'s. Type alias allows us to +/// declare a `trait` on it. +pub type GenericOptions = Option>; + +#[cfg(test)] +mod tests { + use super::*; + use std::fs::File; + + #[test] + fn parse_spec_file() { + let file = File::open("tests/cluster_spec.json").unwrap(); + let _spec: ComputeSpec = serde_json::from_reader(file).unwrap(); + } +} diff --git a/compute_tools/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json similarity index 100% rename from compute_tools/tests/cluster_spec.json rename to libs/compute_api/tests/cluster_spec.json From 98df7db09483d954792644756d804a300210d41e Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 10 Apr 2023 23:41:16 +0300 Subject: [PATCH 253/426] Support aarch64 in walredo seccomp code Aarch64 doesn't implement some old syscalls like open and select. Use openat instead of open to check if seccomp is supported. Leave both select and pselect6 in the allowlist since we don't call select syscall directly and may hope that libc will call pselect6 on aarch64. To check whether some syscall is supported it is possible to use `scmp_sys_resolver` from seccopm package: ``` > apt install seccopm > scmp_sys_resolver -a x86_64 select 23 > scmp_sys_resolver -a aarch64 select -10101 > scmp_sys_resolver -a aarch64 pselect6 72 ``` Negative value means that syscall is not supported. Another cross-check is to look up for the actuall syscall table in `unistd.h`. 
To resolve all the macroses one can use `gcc -E` as it is done in `dump_sys_aarch64()` function in libseccomp/src/arch-syscall-validate. --- pgxn/neon_walredo/seccomp.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c index 5d5ba549ef..1a9121734e 100644 --- a/pgxn/neon_walredo/seccomp.c +++ b/pgxn/neon_walredo/seccomp.c @@ -9,6 +9,13 @@ * To prevent this, it has been decided to limit possible interactions * with the outside world using the Secure Computing BPF mode. * + * This code is intended to support both x86_64 and aarch64. The latter + * doesn't implement some syscalls like open and select. So instead of + * open we use openat. We add both select (absend on aarch64) and + * pselect6 (present on both architectures) to the allowlist. Since we + * don't call select syscall directly we may expect that libc will call + * pselect6 on aarch64. + * * We use this mode to disable all syscalls not in the allowlist. This * approach has its pros & cons: * @@ -122,9 +129,10 @@ seccomp_load_rules(PgSeccompRule *rules, int count) /* * First, check that open of a well-known file works. - * XXX: We use raw syscall() to call the very open(). + * XXX: We use raw syscall() to call the very openat() which is + * present both on x86_64 and on aarch64. 
*/ - fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + fd = syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), @@ -142,8 +150,8 @@ seccomp_load_rules(PgSeccompRule *rules, int count) (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not load test trap"))); - /* Finally, check that open() now raises SIGSYS */ - (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + /* Finally, check that openat() now raises SIGSYS */ + (void) syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), From 83549a8d409a35a8b6f0233c867c2088bb5e665c Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 11 Apr 2023 00:08:01 +0300 Subject: [PATCH 254/426] Revert "Support aarch64 in walredo seccomp code" This reverts commit 98df7db09483d954792644756d804a300210d41e. --- pgxn/neon_walredo/seccomp.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c index 1a9121734e..5d5ba549ef 100644 --- a/pgxn/neon_walredo/seccomp.c +++ b/pgxn/neon_walredo/seccomp.c @@ -9,13 +9,6 @@ * To prevent this, it has been decided to limit possible interactions * with the outside world using the Secure Computing BPF mode. * - * This code is intended to support both x86_64 and aarch64. The latter - * doesn't implement some syscalls like open and select. So instead of - * open we use openat. We add both select (absend on aarch64) and - * pselect6 (present on both architectures) to the allowlist. Since we - * don't call select syscall directly we may expect that libc will call - * pselect6 on aarch64. - * * We use this mode to disable all syscalls not in the allowlist. This * approach has its pros & cons: * @@ -129,10 +122,9 @@ seccomp_load_rules(PgSeccompRule *rules, int count) /* * First, check that open of a well-known file works. 
- * XXX: We use raw syscall() to call the very openat() which is - * present both on x86_64 and on aarch64. + * XXX: We use raw syscall() to call the very open(). */ - fd = syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); + fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); if (seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), @@ -150,8 +142,8 @@ seccomp_load_rules(PgSeccompRule *rules, int count) (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not load test trap"))); - /* Finally, check that openat() now raises SIGSYS */ - (void) syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); + /* Finally, check that open() now raises SIGSYS */ + (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), From 22c890b71c2a0547a48f1056f7d17acf432dd9a1 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 10 Apr 2023 20:16:12 +0300 Subject: [PATCH 255/426] Add more cnames to proxies --- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 1 + .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 1 + .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 1 + .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 1 + 4 files changed, 4 insertions(+) diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index aa5be89101..36dac8309d 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" + extraDomains: ["*.ap-southeast-1.retooldb.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: 
"http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 083af6aa2d..f5b2f31cb9 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" + extraDomains: ["*.eu-central-1.retooldb.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 40fbc52b39..0be78d868a 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" + extraDomains: ["*.us-east-2.retooldb.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 810a6a5f78..79115be0e2 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" + extraDomains: ["*.us-west-2.retooldb.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: 
"http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" From 7ad5a5e847ff9f007fe3dcaffe9f8a403297a995 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 10 Apr 2023 22:13:53 +0400 Subject: [PATCH 256/426] Enable timeout on reading from socket in safekeeper WAL service. TCP_KEEPALIVE is not enabled by default, so this prevents hanged up connections in case of abrupt client termination. Add 'closed' flag to PostgresBackendReader and pass it during handles join to prevent attempts to read from socket if we errored out previously -- now with timeouts this is a common situation. It looks like 2023-04-10T18:08:37.493448Z INFO {cid=68}:WAL receiver{ttid=59f91ad4e821ab374f9ccdf918da3a85/16438f99d61572c72f0c7b0ed772785d}: terminated: timed out Presumably fixes https://github.com/neondatabase/neon/issues/3971 --- Cargo.lock | 1 + Cargo.toml | 1 + libs/postgres_backend/src/lib.rs | 47 ++++++++++++++++----- safekeeper/Cargo.toml | 1 + safekeeper/src/wal_service.rs | 70 +++++++++++++++++++------------- 5 files changed, 80 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c5b64b235a..5b99e93e76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3367,6 +3367,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", + "tokio-io-timeout", "tokio-postgres", "toml_edit", "tracing", diff --git a/Cargo.toml b/Cargo.toml index d563324c86..679605dc1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,6 +101,7 @@ test-context = "0.1" thiserror = "1.0" tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } +tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.9.0" tokio-rustls = "0.23" tokio-stream = "0.1" diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 60932a5950..f6bf7c6fc2 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -54,7 +54,7 @@ pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; 
matches!( e.kind(), - ConnectionRefused | ConnectionAborted | ConnectionReset + ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut ) } @@ -320,9 +320,17 @@ impl PostgresBackend { if let ProtoState::Closed = self.state { Ok(None) } else { - let m = self.framed.read_message().await?; - trace!("read msg {:?}", m); - Ok(m) + match self.framed.read_message().await { + Ok(m) => { + trace!("read msg {:?}", m); + Ok(m) + } + Err(e) => { + // remember not to try to read anymore + self.state = ProtoState::Closed; + Err(e) + } + } } } @@ -493,7 +501,10 @@ impl PostgresBackend { MaybeWriteOnly::Full(framed) => { let (reader, writer) = framed.split(); self.framed = MaybeWriteOnly::WriteOnly(writer); - Ok(PostgresBackendReader(reader)) + Ok(PostgresBackendReader { + reader, + closed: false, + }) } MaybeWriteOnly::WriteOnly(_) => { anyhow::bail!("PostgresBackend is already split") @@ -510,8 +521,12 @@ impl PostgresBackend { anyhow::bail!("PostgresBackend is not split") } MaybeWriteOnly::WriteOnly(writer) => { - let joined = Framed::unsplit(reader.0, writer); + let joined = Framed::unsplit(reader.reader, writer); self.framed = MaybeWriteOnly::Full(joined); + // if reader encountered connection error, do not attempt reading anymore + if reader.closed { + self.state = ProtoState::Closed; + } Ok(()) } MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"), @@ -797,15 +812,25 @@ impl PostgresBackend { } } -pub struct PostgresBackendReader(FramedReader>); +pub struct PostgresBackendReader { + reader: FramedReader>, + closed: bool, // true if received error closing the connection +} impl PostgresBackendReader { /// Read full message or return None if connection is cleanly closed with no /// unprocessed data. 
pub async fn read_message(&mut self) -> Result, ConnectionError> { - let m = self.0.read_message().await?; - trace!("read msg {:?}", m); - Ok(m) + match self.reader.read_message().await { + Ok(m) => { + trace!("read msg {:?}", m); + Ok(m) + } + Err(e) => { + self.closed = true; + Err(e) + } + } } /// Get CopyData contents of the next message in COPY stream or error @@ -923,7 +948,7 @@ pub enum CopyStreamHandlerEnd { #[error("EOF on COPY stream")] EOF, /// The connection was lost - #[error(transparent)] + #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), /// Some other error #[error(transparent)] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 8b0733832a..00cd111da5 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,7 @@ serde_with.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } +tokio-io-timeout.workspace = true tokio-postgres.workspace = true toml_edit.workspace = true tracing.workspace = true diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 22f50c3428..fb0d77a9f2 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,8 +4,9 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, thread}; +use std::{future, thread, time::Duration}; use tokio::net::TcpStream; +use tokio_io_timeout::TimeoutReader; use tracing::*; use utils::measured_stream::MeasuredStream; @@ -67,41 +68,52 @@ fn handle_socket( let runtime = tokio::runtime::Builder::new_current_thread() .enable_all() .build()?; - let local = tokio::task::LocalSet::new(); socket.set_nodelay(true)?; let peer_addr = socket.peer_addr()?; - let traffic_metrics = TrafficMetrics::new(); - if let Some(current_az) = conf.availability_zone.as_deref() { - traffic_metrics.set_sk_az(current_az); - } + // TimeoutReader wants async runtime during creation. 
+ runtime.block_on(async move { + // Set timeout on reading from the socket. It prevents hanged up connection + // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by + // default, and tokio doesn't provide ability to set it out of the box. + let mut socket = TimeoutReader::new(socket); + let wal_service_timeout = Duration::from_secs(60 * 10); + socket.set_timeout(Some(wal_service_timeout)); + // pin! is here because TimeoutReader (due to storing sleep future inside) + // is not Unpin, and all pgbackend/framed/tokio dependencies require stream + // to be Unpin. Which is reasonable, as indeed something like TimeoutReader + // shouldn't be moved. + tokio::pin!(socket); - let socket = MeasuredStream::new( - socket, - |cnt| { - traffic_metrics.observe_read(cnt); - }, - |cnt| { - traffic_metrics.observe_write(cnt); - }, - ); + let traffic_metrics = TrafficMetrics::new(); + if let Some(current_az) = conf.availability_zone.as_deref() { + traffic_metrics.set_sk_az(current_az); + } - let auth_type = match conf.auth { - None => AuthType::Trust, - Some(_) => AuthType::NeonJWT, - }; - let mut conn_handler = - SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - // libpq protocol between safekeeper and walproposer / pageserver - // We don't use shutdown. 
- local.block_on( - &runtime, - pgbackend.run(&mut conn_handler, future::pending::<()>), - )?; + let socket = MeasuredStream::new( + socket, + |cnt| { + traffic_metrics.observe_read(cnt); + }, + |cnt| { + traffic_metrics.observe_write(cnt); + }, + ); - Ok(()) + let auth_type = match conf.auth { + None => AuthType::Trust, + Some(_) => AuthType::NeonJWT, + }; + let mut conn_handler = + SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); + let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + // libpq protocol between safekeeper and walproposer / pageserver + // We don't use shutdown. + pgbackend + .run(&mut conn_handler, future::pending::<()>) + .await + }) } /// Unique WAL service connection ids are logged in spans for observability. From c79d5a947cde6f0d6e0781dca6cd20480a3d7762 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 11 Apr 2023 10:58:04 +0100 Subject: [PATCH 257/426] Nightly Benchmarks: run third-party benchmarks once a week (#3987) --- .github/workflows/benchmarking.yml | 101 ++++++++++++++++++----------- 1 file changed, 63 insertions(+), 38 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2aeea6eca4..028fe8d8ad 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -107,28 +107,65 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + generate-matrices: + # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) + # + # Available platforms: + # - neon-captest-new: Freshly created project (1 CU) + # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neon-captest-reuse: Reusing existing project + # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs + # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage + runs-on: ubuntu-latest + outputs: + pgbench-compare-matrix: 
${{ steps.pgbench-compare-matrix.outputs.matrix }} + olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }} + + steps: + - name: Generate matrix for pgbench benchmark + id: pgbench-compare-matrix + run: | + matrix='{ + "platform": [ + "neon-captest-new", + "neon-captest-reuse" + ], + "db_size": [ "10gb" ], + "include": [ + { "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "platform": "neon-captest-new", "db_size": "50gb" } + ] + }' + + if [ "$(date +%A)" = "Saturday" ]; then + matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, + { "platform": "rds-aurora", "db_size": "50gb"}]') + fi + + echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT + + - name: Generate matrix for OLAP benchmarks + id: olap-compare-matrix + run: | + matrix='{ + "platform": [ + "neon-captest-reuse" + ] + }' + + if [ "$(date +%A)" = "Saturday" ]; then + matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" }, + { "platform": "rds-aurora" }]') + fi + + echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT + pgbench-compare: + needs: [ generate-matrices ] + strategy: fail-fast: false - matrix: - # neon-captest-freetier: Run pgbench with freetier-limited compute - # neon-captest-new: Run pgbench in a freshly created project - # neon-captest-reuse: Same, but reusing existing project - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, neon-captest-new, rds-postgres ] - db_size: [ 10gb ] - runner: [ us-east-2 ] - include: - - platform: neon-captest-freetier - db_size: 3gb - runner: us-east-2 - - platform: neon-captest-new - db_size: 50gb - runner: us-east-2 - - platform: rds-aurora - db_size: 50gb - runner: us-east-2 + matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}} env: 
TEST_PG_BENCH_DURATIONS_MATRIX: "60m" @@ -140,7 +177,7 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} PLATFORM: ${{ matrix.platform }} - runs-on: [ self-hosted, "${{ matrix.runner }}", x64 ] + runs-on: [ self-hosted, us-east-2, x64 ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -269,15 +306,11 @@ jobs: # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: success() || failure() - needs: [ pgbench-compare ] + needs: [ generate-matrices, pgbench-compare ] strategy: fail-fast: false - matrix: - # neon-captest-reuse: We have pre-created projects 1 CU - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -369,15 +402,11 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) if: success() || failure() - needs: [ clickbench-compare ] + needs: [ generate-matrices, clickbench-compare ] strategy: fail-fast: false - matrix: - # neon-captest-reuse: We have pre-created projects 1 CU - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -463,15 +492,11 @@ jobs: user-examples-compare: if: success() || failure() - needs: [ tpch-compare ] + needs: [ generate-matrices, tpch-compare ] strategy: fail-fast: false - matrix: - # neon-captest-reuse: We have pre-created 
projects with 1 CU - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install From de99ee2c0d4c2640309eb3e160e1a7f50141abf9 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 11 Apr 2023 12:50:10 +0300 Subject: [PATCH 258/426] Add more proxy cnames --- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 2 +- .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 36dac8309d..5a98217bae 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" - extraDomains: ["*.ap-southeast-1.retooldb.com"] + extraDomains: ["*.ap-southeast-1.retooldb.com", "*.ap-southeast-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index f5b2f31cb9..a9ee49d82f 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml 
@@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" - extraDomains: ["*.eu-central-1.retooldb.com"] + extraDomains: ["*.eu-central-1.retooldb.com", "*.eu-central-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 0be78d868a..239a9911c7 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" - extraDomains: ["*.us-east-2.retooldb.com"] + extraDomains: ["*.us-east-2.retooldb.com", "*.us-east-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 79115be0e2..c987ae236a 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" - extraDomains: ["*.us-west-2.retooldb.com"] + extraDomains: ["*.us-west-2.retooldb.com", "*.us-west-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" From 40a68e907753b6813d00d8fd1266601c7e929132 Mon Sep 17 
00:00:00 2001 From: Alexey Kondratov Date: Tue, 11 Apr 2023 15:05:35 +0200 Subject: [PATCH 259/426] [compute_ctl] Add timeout for `tracing_utils::shutdown_tracing()` (#3982) Shutting down OTEL tracing provider may hang for quite some time, see, for example: - https://github.com/open-telemetry/opentelemetry-rust/issues/868 - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636 Yet, we want computes to shut down fast enough, as we may need a new one for the same timeline ASAP. So wait no longer than 2s for the shutdown to complete, then just error out and exit the main thread. Related to neondatabase/cloud#3707 --- compute_tools/src/bin/compute_ctl.rs | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index d61eae5f7a..bce860b56b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,7 +34,7 @@ use std::fs::File; use std::panic; use std::path::Path; use std::process::exit; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::{mpsc, Arc, Condvar, Mutex}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; @@ -239,10 +239,25 @@ fn main() -> Result<()> { thread::sleep(Duration::from_secs(30)); } - info!("shutting down tracing"); // Shutdown trace pipeline gracefully, so that it has a chance to send any - // pending traces before we exit. - tracing_utils::shutdown_tracing(); + // pending traces before we exit. Shutting down OTEL tracing provider may + // hang for quite some time, see, for example: + // - https://github.com/open-telemetry/opentelemetry-rust/issues/868 + // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636 + // + // Yet, we want computes to shut down fast enough, as we may need a new one + // for the same timeline ASAP. 
So wait no longer than 2s for the shutdown to + // complete, then just error out and exit the main thread. + info!("shutting down tracing"); + let (sender, receiver) = mpsc::channel(); + let _ = thread::spawn(move || { + tracing_utils::shutdown_tracing(); + sender.send(()).ok() + }); + let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000)); + if shutdown_res.is_err() { + error!("timed out while shutting down tracing, exiting anyway"); + } info!("shutting down"); exit(exit_code.unwrap_or(1)) From 3c9f42a2e21a4bbfd4c669c5b7b52fad454ffae4 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 11 Apr 2023 22:28:18 +0300 Subject: [PATCH 260/426] Support aarch64 in walredo seccomp code (#3996) Aarch64 doesn't implement some old syscalls like open and select. Use openat instead of open to check if seccomp is supported. Leave both select and pselect6 in the allowlist since we don't call select syscall directly and may hope that libc will call pselect6 on aarch64. To check whether some syscall is supported it is possible to use `scmp_sys_resolver` from seccomp package: ``` > apt install seccomp > scmp_sys_resolver -a x86_64 select 23 > scmp_sys_resolver -a aarch64 select -10101 > scmp_sys_resolver -a aarch64 pselect6 72 ``` Negative value means that syscall is not supported. Another cross-check is to look up the actual syscall table in `unistd.h`. To resolve all the macros one can use `gcc -E` as it is done in `dump_sys_aarch64()` function in libseccomp/src/arch-syscall-validate. --------- Co-authored-by: Heikki Linnakangas --- pgxn/neon_walredo/seccomp.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c index 5d5ba549ef..1e8f6682a2 100644 --- a/pgxn/neon_walredo/seccomp.c +++ b/pgxn/neon_walredo/seccomp.c @@ -9,6 +9,14 @@ * To prevent this, it has been decided to limit possible interactions * with the outside world using the Secure Computing BPF mode.
* + * This code is intended to support both x86_64 and aarch64. The latter + * doesn't implement some syscalls like open and select. We allow both + * select (absent on aarch64) and pselect6 (present on both architectures) + * We call select(2) through libc, and the libc wrapper calls select or pselect6 + * depending on the architecture. You can check which syscalls are present on + * different architectures with the `scmp_sys_resolver` tool from the + * seccomp package. + * * We use this mode to disable all syscalls not in the allowlist. This * approach has its pros & cons: * @@ -73,8 +81,6 @@ * I suspect that certain libc functions might involve slightly * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. * - * - Test on any arch other than amd64 to see if it works there. - * *------------------------------------------------------------------------- */ @@ -122,9 +128,10 @@ seccomp_load_rules(PgSeccompRule *rules, int count) /* * First, check that open of a well-known file works. - * XXX: We use raw syscall() to call the very open(). + * XXX: We use raw syscall() to call the very openat() which is + * present both on x86_64 and on aarch64. 
*/ - fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + fd = syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), @@ -135,15 +142,15 @@ seccomp_load_rules(PgSeccompRule *rules, int count) errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); close((int) fd); - /* Set a trap on open() to test seccomp bpf */ - rule = PG_SCMP(open, SCMP_ACT_TRAP); + /* Set a trap on openat() to test seccomp bpf */ + rule = PG_SCMP(openat, SCMP_ACT_TRAP); if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not load test trap"))); - /* Finally, check that open() now raises SIGSYS */ - (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + /* Finally, check that openat() now raises SIGSYS */ + (void) syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), @@ -224,7 +231,7 @@ seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unus die(1, DIE_PREFIX "bad signal number\n"); /* TODO: maybe somehow extract the hardcoded syscall number */ - if (info->si_syscall != SCMP_SYS(open)) + if (info->si_syscall != SCMP_SYS(openat)) die(1, DIE_PREFIX "bad syscall number\n"); #undef DIE_PREFIX From 6064a26963ec7811989da0f944904622997d964f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Apr 2023 01:13:17 +0300 Subject: [PATCH 261/426] Refactor 'spec' in ComputeState. Sometimes, it contained real values, sometimes just defaults if the spec was not received yet. Make the state more clear by making it an Option instead. One consequence is that if some of the required settings like neon.tenant_id are missing from the spec file sent to the /configure endpoint, it is spotted earlier and you get an immediate HTTP error response. Not that it matters very much, but it's nicer nevertheless. 
--- compute_tools/src/bin/compute_ctl.rs | 53 ++++---------- compute_tools/src/compute.rs | 101 +++++++++++++++++---------- compute_tools/src/http/api.rs | 14 ++-- libs/compute_api/src/responses.rs | 4 +- 4 files changed, 91 insertions(+), 81 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index bce860b56b..633e603f6b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -45,12 +45,11 @@ use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState}; +use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_tools::http::api::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; -use compute_tools::pg_helpers::*; use compute_tools::spec::*; fn main() -> Result<()> { @@ -73,28 +72,24 @@ fn main() -> Result<()> { // Try to use just 'postgres' if no path is provided let pgbin = matches.get_one::("pgbin").unwrap(); - let mut spec = Default::default(); - let mut spec_set = false; + let mut spec = None; let mut live_config_allowed = false; match spec_json { // First, try to get cluster spec from the cli argument Some(json) => { - spec = serde_json::from_str(json)?; - spec_set = true; + spec = Some(serde_json::from_str(json)?); } None => { // Second, try to read it from the file if path is provided if let Some(sp) = spec_path { let path = Path::new(sp); let file = File::open(path)?; - spec = serde_json::from_reader(file)?; - spec_set = true; + spec = Some(serde_json::from_reader(file)?); } else if let Some(id) = compute_id { if let Some(cp_base) = control_plane_uri { live_config_allowed = true; if let Ok(s) = get_spec_from_control_plane(cp_base, id) { - spec = s; - spec_set = true; + spec = Some(s); } } else { panic!("must specify both --control-plane-uri and --compute-id or none"); @@ -109,8 +104,13 @@ fn main() -> Result<()> { }; let 
mut new_state = ComputeState::new(); - if spec_set { - new_state.spec = spec; + let spec_set; + if let Some(spec) = spec { + let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; + new_state.pspec = Some(pspec); + spec_set = true; + } else { + spec_set = false; } let compute_node = ComputeNode { start_time: Utc::now(), @@ -142,33 +142,10 @@ fn main() -> Result<()> { } } - // We got all we need, fill in the state. + // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - let pageserver_connstr = state - .spec - .cluster - .settings - .find("neon.pageserver_connstring") - .expect("pageserver connstr should be provided"); - let storage_auth_token = state.spec.storage_auth_token.clone(); - let tenant = state - .spec - .cluster - .settings - .find("neon.tenant_id") - .expect("tenant id should be provided"); - let timeline = state - .spec - .cluster - .settings - .find("neon.timeline_id") - .expect("tenant id should be provided"); - let startup_tracing_context = state.spec.startup_tracing_context.clone(); - - state.pageserver_connstr = pageserver_connstr; - state.storage_auth_token = storage_auth_token; - state.tenant = tenant; - state.timeline = timeline; + let pspec = state.pspec.as_ref().expect("spec must be set"); + let startup_tracing_context = pspec.spec.startup_tracing_context.clone(); state.status = ComputeStatus::Init; compute.state_changed.notify_all(); drop(state); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 689aa6ef43..94ec2a785c 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -69,12 +69,7 @@ pub struct ComputeState { /// Timestamp of the last Postgres activity pub last_active: DateTime, pub error: Option, - pub spec: ComputeSpec, - pub tenant: String, - pub timeline: String, - pub pageserver_connstr: String, - pub storage_auth_token: Option, - + pub pspec: Option, pub metrics: ComputeMetrics, } @@ -84,11 +79,7 @@ impl ComputeState { 
status: ComputeStatus::Empty, last_active: Utc::now(), error: None, - spec: ComputeSpec::default(), - tenant: String::new(), - timeline: String::new(), - pageserver_connstr: String::new(), - storage_auth_token: None, + pspec: None, metrics: ComputeMetrics::default(), } } @@ -100,6 +91,45 @@ impl Default for ComputeState { } } +#[derive(Clone, Debug)] +pub struct ParsedSpec { + pub spec: ComputeSpec, + pub tenant: String, + pub timeline: String, + pub pageserver_connstr: String, + pub storage_auth_token: Option, +} + +impl TryFrom for ParsedSpec { + type Error = String; + fn try_from(spec: ComputeSpec) -> Result { + let pageserver_connstr = spec + .cluster + .settings + .find("neon.pageserver_connstring") + .ok_or("pageserver connstr should be provided")?; + let storage_auth_token = spec.storage_auth_token.clone(); + let tenant = spec + .cluster + .settings + .find("neon.tenant_id") + .ok_or("tenant id should be provided")?; + let timeline = spec + .cluster + .settings + .find("neon.timeline_id") + .ok_or("tenant id should be provided")?; + + Ok(ParsedSpec { + spec, + pageserver_connstr, + storage_auth_token, + tenant, + timeline, + }) + } +} + impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { let mut state = self.state.lock().unwrap(); @@ -126,13 +156,14 @@ impl ComputeNode { // unarchive it to `pgdata` directory overriding all its previous content. #[instrument(skip(self, compute_state))] fn get_basebackup(&self, compute_state: &ComputeState, lsn: &str) -> Result<()> { + let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Utc::now(); - let mut config = postgres::Config::from_str(&compute_state.pageserver_connstr)?; + let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?; // Use the storage auth token from the config file, if given. // Note: this overrides any password set in the connection string. 
- if let Some(storage_auth_token) = &compute_state.storage_auth_token { + if let Some(storage_auth_token) = &spec.storage_auth_token { info!("Got storage auth token from spec file"); config.password(storage_auth_token); } else { @@ -141,14 +172,8 @@ impl ComputeNode { let mut client = config.connect(NoTls)?; let basebackup_cmd = match lsn { - "0/0" => format!( - "basebackup {} {}", - &compute_state.tenant, &compute_state.timeline - ), // First start of the compute - _ => format!( - "basebackup {} {} {}", - &compute_state.tenant, &compute_state.timeline, lsn - ), + "0/0" => format!("basebackup {} {}", &spec.tenant, &spec.timeline), // First start of the compute + _ => format!("basebackup {} {} {}", &spec.tenant, &spec.timeline, lsn), }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; @@ -218,27 +243,27 @@ impl ComputeNode { /// safekeepers sync, basebackup, etc. #[instrument(skip(self, compute_state))] pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { - let spec = &compute_state.spec; + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let pgdata_path = Path::new(&self.pgdata); // Remove/create an empty pgdata directory and put configuration there. 
self.create_pgdata()?; - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?; info!("starting safekeepers syncing"); let lsn = self - .sync_safekeepers(compute_state.storage_auth_token.clone()) + .sync_safekeepers(pspec.storage_auth_token.clone()) .with_context(|| "failed to sync safekeepers")?; info!("safekeepers synced at LSN {}", lsn); info!( "getting basebackup@{} from pageserver {}", - lsn, &compute_state.pageserver_connstr + lsn, &pspec.pageserver_connstr ); self.get_basebackup(compute_state, &lsn).with_context(|| { format!( "failed to get basebackup@{} from pageserver {}", - lsn, &compute_state.pageserver_connstr + lsn, &pspec.pageserver_connstr ) })?; @@ -306,19 +331,20 @@ impl ComputeNode { }; // Proceed with post-startup configuration. Note, that order of operations is important. - handle_roles(&compute_state.spec, &mut client)?; - handle_databases(&compute_state.spec, &mut client)?; - handle_role_deletions(&compute_state.spec, self.connstr.as_str(), &mut client)?; - handle_grants(&compute_state.spec, self.connstr.as_str(), &mut client)?; + let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; + handle_roles(spec, &mut client)?; + handle_databases(spec, &mut client)?; + handle_role_deletions(spec, self.connstr.as_str(), &mut client)?; + handle_grants(spec, self.connstr.as_str(), &mut client)?; create_writability_check_data(&mut client)?; - handle_extensions(&compute_state.spec, &mut client)?; + handle_extensions(spec, &mut client)?; // 'Close' connection drop(client); info!( "finished configuration of compute for project {}", - compute_state.spec.cluster.cluster_id + spec.cluster.cluster_id ); Ok(()) @@ -327,19 +353,20 @@ impl ComputeNode { #[instrument(skip(self))] pub fn start_compute(&self) -> Result { let compute_state = self.state.lock().unwrap().clone(); + let spec = compute_state.pspec.as_ref().expect("spec must be set"); 
info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - compute_state.spec.cluster.cluster_id, - compute_state.spec.operation_uuid.as_ref().unwrap(), - compute_state.tenant, - compute_state.timeline, + spec.spec.cluster.cluster_id, + spec.spec.operation_uuid.as_ref().unwrap(), + spec.tenant, + spec.timeline, ); self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres(compute_state.storage_auth_token.clone())?; + let pg = self.start_postgres(spec.storage_auth_token.clone())?; self.apply_config(&compute_state)?; diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index cea45dc596..2ef2d898e1 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,7 +3,7 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; -use crate::compute::{ComputeNode, ComputeState}; +use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; @@ -18,8 +18,8 @@ use tracing_utils::http::OtelName; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { ComputeStatusResponse { - tenant: state.tenant.clone(), - timeline: state.timeline.clone(), + tenant: state.pspec.as_ref().map(|pspec| pspec.tenant.clone()), + timeline: state.pspec.as_ref().map(|pspec| pspec.timeline.clone()), status: state.status, last_active: state.last_active, error: state.error.clone(), @@ -135,6 +135,12 @@ async fn handle_configure_request( let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap(); if let Ok(request) = serde_json::from_str::(&spec_raw) { let spec = request.spec; + + let parsed_spec = match ParsedSpec::try_from(spec) { + Ok(ps) => ps, + Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)), + }; + // XXX: wrap state update under lock in code blocks. 
Otherwise, // we will try to `Send` `mut state` into the spawned thread // bellow, which will cause error: @@ -150,7 +156,7 @@ async fn handle_configure_request( ); return Err((msg, StatusCode::PRECONDITION_FAILED)); } - state.spec = spec; + state.pspec = Some(parsed_spec); state.status = ComputeStatus::ConfigurationPending; compute.state_changed.notify_all(); drop(state); diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 43289a5e3e..a28c6e8996 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -12,8 +12,8 @@ pub struct GenericAPIError { #[derive(Serialize, Debug)] #[serde(rename_all = "snake_case")] pub struct ComputeStatusResponse { - pub tenant: String, - pub timeline: String, + pub tenant: Option, + pub timeline: Option, pub status: ComputeStatus, #[serde(serialize_with = "rfc3339_serialize")] pub last_active: DateTime, From ef68321b315fdf4433b6d4850d807326f40c458d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Apr 2023 01:57:27 +0300 Subject: [PATCH 262/426] Use Lsn, TenantId, TimelineId types in compute_ctl. Stronger types are generally nicer. 
--- Cargo.lock | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 38 ++++++++++++++++++++--------------- compute_tools/src/http/api.rs | 10 +++++++-- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b99e93e76..668487a9bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -879,6 +879,7 @@ dependencies = [ "tracing-subscriber", "tracing-utils", "url", + "utils", "workspace_hack", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index f315d2b7d9..21226249cf 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -28,4 +28,5 @@ tracing-utils.workspace = true url.workspace = true compute_api.workspace = true +utils.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 94ec2a785c..426e2845ee 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -26,6 +26,8 @@ use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; use tokio_postgres; use tracing::{info, instrument, warn}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::ComputeSpec; @@ -94,8 +96,8 @@ impl Default for ComputeState { #[derive(Clone, Debug)] pub struct ParsedSpec { pub spec: ComputeSpec, - pub tenant: String, - pub timeline: String, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, pub pageserver_connstr: String, pub storage_auth_token: Option, } @@ -109,23 +111,27 @@ impl TryFrom for ParsedSpec { .find("neon.pageserver_connstring") .ok_or("pageserver connstr should be provided")?; let storage_auth_token = spec.storage_auth_token.clone(); - let tenant = spec + let tenant_id: TenantId = spec .cluster .settings .find("neon.tenant_id") - .ok_or("tenant id should be provided")?; - let timeline = spec + .ok_or("tenant id should be provided") + .map(|s| TenantId::from_str(&s))? 
+ .or(Err("invalid tenant id"))?; + let timeline_id: TimelineId = spec .cluster .settings .find("neon.timeline_id") - .ok_or("tenant id should be provided")?; + .ok_or("timeline id should be provided") + .map(|s| TimelineId::from_str(&s))? + .or(Err("invalid timeline id"))?; Ok(ParsedSpec { spec, pageserver_connstr, storage_auth_token, - tenant, - timeline, + tenant_id, + timeline_id, }) } } @@ -155,7 +161,7 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. #[instrument(skip(self, compute_state))] - fn get_basebackup(&self, compute_state: &ComputeState, lsn: &str) -> Result<()> { + fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Utc::now(); @@ -172,8 +178,8 @@ impl ComputeNode { let mut client = config.connect(NoTls)?; let basebackup_cmd = match lsn { - "0/0" => format!("basebackup {} {}", &spec.tenant, &spec.timeline), // First start of the compute - _ => format!("basebackup {} {} {}", &spec.tenant, &spec.timeline, lsn), + Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute + _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn), }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; @@ -197,7 +203,7 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. 
#[instrument(skip(self, storage_auth_token))] - fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { + fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) @@ -234,7 +240,7 @@ impl ComputeNode { .unwrap() .as_millis() as u64; - let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); + let lsn = Lsn::from_str(String::from_utf8(sync_output.stdout)?.trim())?; Ok(lsn) } @@ -260,7 +266,7 @@ impl ComputeNode { "getting basebackup@{} from pageserver {}", lsn, &pspec.pageserver_connstr ); - self.get_basebackup(compute_state, &lsn).with_context(|| { + self.get_basebackup(compute_state, lsn).with_context(|| { format!( "failed to get basebackup@{} from pageserver {}", lsn, &pspec.pageserver_connstr @@ -358,8 +364,8 @@ impl ComputeNode { "starting compute for project {}, operation {}, tenant {}, timeline {}", spec.spec.cluster.cluster_id, spec.spec.operation_uuid.as_ref().unwrap(), - spec.tenant, - spec.timeline, + spec.tenant_id, + spec.timeline_id, ); self.prepare_pgdata(&compute_state)?; diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 2ef2d898e1..81d4953345 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -18,8 +18,14 @@ use tracing_utils::http::OtelName; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { ComputeStatusResponse { - tenant: state.pspec.as_ref().map(|pspec| pspec.tenant.clone()), - timeline: state.pspec.as_ref().map(|pspec| pspec.timeline.clone()), + tenant: state + .pspec + .as_ref() + .map(|pspec| pspec.tenant_id.to_string()), + timeline: state + .pspec + .as_ref() + .map(|pspec| pspec.timeline_id.to_string()), status: state.status, last_active: state.last_active, error: state.error.clone(), From 8ace7a7515bab697aec906d469d2a663315a1740 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Apr 2023 01:57:43 +0300 Subject: [PATCH 263/426] 
Remove unused 'timestamp' field from ComputeSpec struct. --- libs/compute_api/src/spec.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 37fe133b68..f771910329 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -15,7 +15,10 @@ pub type PgIdent = String; #[derive(Clone, Debug, Default, Deserialize)] pub struct ComputeSpec { pub format_version: f32, - pub timestamp: String, + + // The control plane also includes a 'timestamp' field in the JSON document, + // but we don't use it for anything. Serde will ignore missing fields when + // deserializing it. pub operation_uuid: Option, /// Expected cluster state at the end of transition process. pub cluster: Cluster, From 06ce83c9125110361a78e15b01b66d36040a87b0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Apr 2023 01:57:48 +0300 Subject: [PATCH 264/426] Tolerate missing 'operation_uuid' field in spec file. 'compute_ctl' doesn't use the operation_uuid for anything, it just prints it to the log. 
--- compute_tools/src/compute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 426e2845ee..07ede44c9b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -363,7 +363,7 @@ impl ComputeNode { info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", spec.spec.cluster.cluster_id, - spec.spec.operation_uuid.as_ref().unwrap(), + spec.spec.operation_uuid.as_deref().unwrap_or("None"), spec.tenant_id, spec.timeline_id, ); From d8939d4162816d0576e2a7fb4ecc0b3479c4f28b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 12 Apr 2023 12:39:02 +0300 Subject: [PATCH 265/426] Move walreceiver start and stop behind a struct (#3973) The PR replaces the module-level, function-based walreceiver interface with a `WalReceiver` struct that exposes a few public methods: `new`, `start` and `stop`. Later, the same struct is planned to be used for getting walreceiver stats (and, maybe, other extra data) to display during missing WAL errors for https://github.com/neondatabase/neon/issues/2106 For now, though, the change required some extra logic changes: * with the `WalReceiver` struct added, it became easier to pass `ctx` and later do a `detached_child` instead of https://github.com/neondatabase/neon/blob/bfee4127014022a43bd85bccb562ed4bc62dc075/pageserver/src/tenant/timeline.rs#L1379-L1381 * `WalReceiver::start`, which is now the public API to start the walreceiver, can return an `Err`, which may now turn a tenant into `Broken`, same as the timeline that it tries to load during startup.
* `WalReceiverConf` was added to group walreceiver parameters from pageserver's tenant config --- pageserver/src/tenant.rs | 49 +++-- pageserver/src/tenant/timeline.rs | 84 ++++---- pageserver/src/tenant/timeline/walreceiver.rs | 133 +++++++++++- .../walreceiver/connection_manager.rs | 201 ++++++------------ .../walreceiver/walreceiver_connection.rs | 4 +- 5 files changed, 268 insertions(+), 203 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7fac7d2ac0..03a4ff8c8e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -177,9 +177,9 @@ impl UninitializedTimeline<'_> { /// /// The new timeline is initialized in Active state, and its background jobs are /// started - pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> { + pub fn initialize(self, ctx: &RequestContext) -> anyhow::Result> { let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(&mut timelines, true, true) + self.initialize_with_lock(ctx, &mut timelines, true, true) } /// Like `initialize`, but the caller is already holding lock on Tenant::timelines. @@ -189,6 +189,7 @@ impl UninitializedTimeline<'_> { /// been initialized. fn initialize_with_lock( mut self, + ctx: &RequestContext, timelines: &mut HashMap>, load_layer_map: bool, activate: bool, @@ -229,7 +230,9 @@ impl UninitializedTimeline<'_> { new_timeline.maybe_spawn_flush_loop(); if activate { - new_timeline.activate(); + new_timeline + .activate(ctx) + .context("initializing timeline activation")?; } } } @@ -469,7 +472,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -504,7 +507,7 @@ impl Tenant { // Do not start walreceiver here. 
We do need loaded layer map for reconcile_with_remote // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver // will ingest data which may require looking at the layers which are not yet available locally - match timeline.initialize_with_lock(&mut timelines_accessor, true, false) { + match timeline.initialize_with_lock(ctx, &mut timelines_accessor, true, false) { Ok(new_timeline) => new_timeline, Err(e) => { error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}"); @@ -629,7 +632,7 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// - #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { // Create directory with marker file to indicate attaching state. // The load_local_tenants() function in tenant::mgr relies on the marker file @@ -750,7 +753,7 @@ impl Tenant { // Start background operations and open the tenant for business. // The loops will shut themselves down when they notice that the tenant is inactive. - self.activate()?; + self.activate(&ctx)?; info!("Done"); @@ -1022,7 +1025,7 @@ impl Tenant { // Start background operations and open the tenant for business. // The loops will shut themselves down when they notice that the tenant is inactive. - self.activate()?; + self.activate(ctx)?; info!("Done"); @@ -1358,12 +1361,7 @@ impl Tenant { // Stop the walreceiver first. debug!("waiting for wal receiver to shutdown"); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_id), - Some(timeline_id), - ) - .await; + timeline.walreceiver.stop().await; debug!("wal receiver shutdown confirmed"); info!("waiting for timeline tasks to shutdown"); @@ -1450,7 +1448,7 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. 
- fn activate(&self) -> anyhow::Result<()> { + fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> { let mut result = Ok(()); self.state.send_modify(|current_state| { match *current_state { @@ -1484,7 +1482,20 @@ impl Tenant { tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { - timeline.activate(); + match timeline + .activate(ctx) + .context("timeline activation for activating tenant") + { + Ok(()) => {} + Err(e) => { + error!( + "Failed to activate timeline {}: {:#}", + timeline.timeline_id, e + ); + timeline.set_state(TimelineState::Broken); + *current_state = TenantState::Broken; + } + } } } } @@ -2093,7 +2104,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result> { let src_id = src_timeline.timeline_id; @@ -2186,7 +2197,7 @@ impl Tenant { false, Some(Arc::clone(src_timeline)), )? - .initialize_with_lock(&mut timelines, true, true)?; + .initialize_with_lock(ctx, &mut timelines, true, true)?; drop(timelines); // Root timeline gets its layers during creation and uploads them along with the metadata. @@ -2299,7 +2310,7 @@ impl Tenant { let timeline = { let mut timelines = self.timelines.lock().unwrap(); - raw_timeline.initialize_with_lock(&mut timelines, false, true)? + raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? 
}; info!( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e80e32644b..4b0d7a6994 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ use pageserver_api::models::{ DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState, }; use remote_storage::GenericRemoteStorage; +use storage_broker::BrokerClientChannel; use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -30,7 +31,7 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::broker_client::is_broker_client_initialized; +use crate::broker_client::{get_broker_client, is_broker_client_initialized}; use crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ @@ -71,10 +72,10 @@ use crate::walredo::WalRedoManager; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; -use walreceiver::spawn_connection_manager_task; pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; +use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::layer_map::BatchedUpdates; use super::remote_timeline_client::index::IndexPart; @@ -214,6 +215,7 @@ pub struct Timeline { /// or None if WAL receiver has not received anything for this timeline /// yet. 
pub last_received_wal: Mutex>, + pub walreceiver: WalReceiver, /// Relation size cache pub rel_size_cache: RwLock>, @@ -866,10 +868,18 @@ impl Timeline { Ok(()) } - pub fn activate(self: &Arc) { + pub fn activate(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { + if is_broker_client_initialized() { + self.launch_wal_receiver(ctx, get_broker_client().clone())?; + } else if cfg!(test) { + info!("not launching WAL receiver because broker client hasn't been initialized"); + } else { + anyhow::bail!("broker client not initialized"); + } + self.set_state(TimelineState::Active); - self.launch_wal_receiver(); self.launch_eviction_task(); + Ok(()) } pub fn set_state(&self, new_state: TimelineState) { @@ -1220,7 +1230,31 @@ impl Timeline { let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); + let tenant_conf_guard = tenant_conf.read().unwrap(); + let wal_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(conf.default_tenant_conf.lagging_wal_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag); + drop(tenant_conf_guard); + Arc::new_cyclic(|myself| { + let walreceiver = WalReceiver::new( + TenantTimelineId::new(tenant_id, timeline_id), + Weak::clone(myself), + WalReceiverConf { + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + availability_zone: conf.availability_zone.clone(), + }, + ); + let mut result = Timeline { conf, tenant_conf, @@ -1231,6 +1265,7 @@ impl Timeline { layers: RwLock::new(LayerMap::default()), walredo_mgr, + walreceiver, remote_client: remote_client.map(Arc::new), @@ -1350,44 +1385,17 @@ impl Timeline { *flush_loop_state = FlushLoopState::Running; 
} - pub(super) fn launch_wal_receiver(self: &Arc) { - if !is_broker_client_initialized() { - if cfg!(test) { - info!("not launching WAL receiver because broker client hasn't been initialized"); - return; - } else { - panic!("broker client not initialized"); - } - } - + pub(super) fn launch_wal_receiver( + &self, + ctx: &RequestContext, + broker_client: BrokerClientChannel, + ) -> anyhow::Result<()> { info!( "launching WAL receiver for timeline {} of tenant {}", self.timeline_id, self.tenant_id ); - let tenant_conf_guard = self.tenant_conf.read().unwrap(); - let lagging_wal_timeout = tenant_conf_guard - .lagging_wal_timeout - .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); - let walreceiver_connect_timeout = tenant_conf_guard - .walreceiver_connect_timeout - .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); - let max_lsn_wal_lag = tenant_conf_guard - .max_lsn_wal_lag - .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); - drop(tenant_conf_guard); - let self_clone = Arc::clone(self); - let background_ctx = - // XXX: this is a detached_child. Plumb through the ctx from call sites. 
- RequestContext::todo_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - spawn_connection_manager_task( - self_clone, - walreceiver_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), - self.conf.availability_zone.clone(), - background_ctx, - ); + self.walreceiver.start(ctx, broker_client)?; + Ok(()) } /// diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f33a12c5cc..00f446af38 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,14 +23,133 @@ mod connection_manager; mod walreceiver_connection; -use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::timeline::walreceiver::connection_manager::{ + connection_manager_loop_step, ConnectionManagerState, +}; +use anyhow::Context; use std::future::Future; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::atomic::{self, AtomicBool}; +use std::sync::{Arc, Weak}; +use std::time::Duration; +use storage_broker::BrokerClientChannel; +use tokio::select; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -pub use connection_manager::spawn_connection_manager_task; +use utils::id::TenantTimelineId; + +use super::Timeline; + +#[derive(Clone)] +pub struct WalReceiverConf { + /// The timeout on the connection to safekeeper for WAL streaming. + pub wal_connect_timeout: Duration, + /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. + pub lagging_wal_timeout: Duration, + /// The Lsn lag to use to determine when the current connection is lagging to much behind and reconnect to the other one. 
+ pub max_lsn_wal_lag: NonZeroU64, + pub auth_token: Option>, + pub availability_zone: Option, +} + +pub struct WalReceiver { + timeline: TenantTimelineId, + timeline_ref: Weak, + conf: WalReceiverConf, + started: AtomicBool, +} + +impl WalReceiver { + pub fn new( + timeline: TenantTimelineId, + timeline_ref: Weak, + conf: WalReceiverConf, + ) -> Self { + Self { + timeline, + timeline_ref, + conf, + started: AtomicBool::new(false), + } + } + + pub fn start( + &self, + ctx: &RequestContext, + mut broker_client: BrokerClientChannel, + ) -> anyhow::Result<()> { + if self.started.load(atomic::Ordering::Acquire) { + anyhow::bail!("Wal receiver is already started"); + } + + let timeline = self.timeline_ref.upgrade().with_context(|| { + format!("walreceiver start on a dropped timeline {}", self.timeline) + })?; + + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let walreceiver_ctx = + ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); + + let wal_receiver_conf = self.conf.clone(); + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverManager, + Some(tenant_id), + Some(timeline_id), + &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), + false, + async move { + info!("WAL receiver manager started, connecting to broker"); + let mut connection_manager_state = ConnectionManagerState::new( + timeline, + wal_receiver_conf, + ); + loop { + select! 
{ + _ = task_mgr::shutdown_watcher() => { + info!("WAL receiver shutdown requested, shutting down"); + connection_manager_state.shutdown().await; + return Ok(()); + }, + loop_step_result = connection_manager_loop_step( + &mut broker_client, + &mut connection_manager_state, + &walreceiver_ctx, + ) => match loop_step_result { + ControlFlow::Continue(()) => continue, + ControlFlow::Break(()) => { + info!("Connection manager loop ended, shutting down"); + connection_manager_state.shutdown().await; + return Ok(()); + } + }, + } + } + }.instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) + ); + + self.started.store(true, atomic::Ordering::Release); + + Ok(()) + } + + pub async fn stop(&self) { + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.timeline.tenant_id), + Some(self.timeline.timeline_id), + ) + .await; + self.started.store(false, atomic::Ordering::Release); + } +} /// A handle of an asynchronous task. /// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] @@ -39,26 +158,26 @@ pub use connection_manager::spawn_connection_manager_task; /// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission. /// That may lead to certain events not being observed by the listener. #[derive(Debug)] -pub struct TaskHandle { +struct TaskHandle { join_handle: Option>>, events_receiver: watch::Receiver>, cancellation: CancellationToken, } -pub enum TaskEvent { +enum TaskEvent { Update(TaskStateUpdate), End(anyhow::Result<()>), } #[derive(Debug, Clone)] -pub enum TaskStateUpdate { +enum TaskStateUpdate { Started, Progress(E), } impl TaskHandle { /// Initializes the task, starting it immediately after the creation. 
- pub fn spawn( + fn spawn( task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where @@ -131,7 +250,7 @@ impl TaskHandle { } /// Aborts current task, waiting for it to finish. - pub async fn shutdown(self) { + async fn shutdown(self) { if let Some(jh) = self.join_handle { self.cancellation.cancel(); match jh.await { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index de07676ffe..efcbfbce3d 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -11,11 +11,9 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; -use super::TaskStateUpdate; -use crate::broker_client::get_broker_client; +use super::{TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::WALRECEIVER_RUNTIME; -use crate::task_mgr::{self, TaskKind}; +use crate::task_mgr::TaskKind; use crate::tenant::Timeline; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; @@ -38,75 +36,17 @@ use utils::{ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; -/// Spawns the loop to take care of the timeline's WAL streaming connection. 
-pub fn spawn_connection_manager_task( - timeline: Arc, - wal_connect_timeout: Duration, - lagging_wal_timeout: Duration, - max_lsn_wal_lag: NonZeroU64, - auth_token: Option>, - availability_zone: Option, - ctx: RequestContext, -) { - let mut broker_client = get_broker_client().clone(); - - let tenant_id = timeline.tenant_id; - let timeline_id = timeline.timeline_id; - - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverManager, - Some(tenant_id), - Some(timeline_id), - &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), - false, - async move { - info!("WAL receiver manager started, connecting to broker"); - let mut walreceiver_state = WalreceiverState::new( - timeline, - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - auth_token, - availability_zone, - ); - loop { - select! { - _ = task_mgr::shutdown_watcher() => { - info!("WAL receiver shutdown requested, shutting down"); - walreceiver_state.shutdown().await; - return Ok(()); - }, - loop_step_result = connection_manager_loop_step( - &mut broker_client, - &mut walreceiver_state, - &ctx, - ) => match loop_step_result { - ControlFlow::Continue(()) => continue, - ControlFlow::Break(()) => { - info!("Connection manager loop ended, shutting down"); - walreceiver_state.shutdown().await; - return Ok(()); - } - }, - } - } - } - .instrument( - info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), - ), - ); -} - /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task. /// If storage broker subscription is cancelled, exits. 
-async fn connection_manager_loop_step( +pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, - walreceiver_state: &mut WalreceiverState, + connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, ) -> ControlFlow<(), ()> { - let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); + let mut timeline_state_updates = connection_manager_state + .timeline + .subscribe_for_state_updates(); match wait_for_active_timeline(&mut timeline_state_updates).await { ControlFlow::Continue(()) => {} @@ -117,8 +57,8 @@ async fn connection_manager_loop_step( } let id = TenantTimelineId { - tenant_id: walreceiver_state.timeline.tenant_id, - timeline_id: walreceiver_state.timeline.timeline_id, + tenant_id: connection_manager_state.timeline.tenant_id, + timeline_id: connection_manager_state.timeline.timeline_id, }; // Subscribe to the broker updates. Stream shares underlying TCP connection @@ -128,7 +68,7 @@ async fn connection_manager_loop_step( info!("Subscribed for broker timeline updates"); loop { - let time_until_next_retry = walreceiver_state.time_until_next_retry(); + let time_until_next_retry = connection_manager_state.time_until_next_retry(); // These things are happening concurrently: // @@ -141,12 +81,12 @@ async fn connection_manager_loop_step( // - timeline state changes to something that does not allow walreceiver to run concurrently select! { Some(wal_connection_update) = async { - match walreceiver_state.wal_connection.as_mut() { + match connection_manager_state.wal_connection.as_mut() { Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), None => None, } } => { - let wal_connection = walreceiver_state.wal_connection.as_mut() + let wal_connection = connection_manager_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { TaskEvent::Update(TaskStateUpdate::Started) => {}, @@ -156,7 +96,7 @@ async fn connection_manager_loop_step( // from this safekeeper. This is good enough to clean unsuccessful // retries history and allow reconnecting to this safekeeper without // sleeping for a long time. - walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + connection_manager_state.wal_connection_retries.remove(&wal_connection.sk_id); } wal_connection.status = new_status; } @@ -165,7 +105,7 @@ async fn connection_manager_loop_step( Ok(()) => debug!("WAL receiving task finished"), Err(e) => error!("wal receiver task finished with an error: {e:?}"), } - walreceiver_state.drop_old_connection(false).await; + connection_manager_state.drop_old_connection(false).await; }, } }, @@ -173,7 +113,7 @@ async fn connection_manager_loop_step( // Got a new update from the broker broker_update = broker_subscription.message() => { match broker_update { - Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update), + Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(e) => { error!("broker subscription failed: {e}"); return ControlFlow::Continue(()); @@ -187,12 +127,12 @@ async fn connection_manager_loop_step( new_event = async { loop { - if walreceiver_state.timeline.current_state() == TimelineState::Loading { + if connection_manager_state.timeline.current_state() == TimelineState::Loading { warn!("wal connection manager should only be launched after timeline has become active"); } match timeline_state_updates.changed().await { Ok(()) => { - let new_state = walreceiver_state.timeline.current_state(); + let new_state = connection_manager_state.timeline.current_state(); match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, @@ -234,9 +174,9 @@ async fn connection_manager_loop_step( } => debug!("Waking up for the next retry 
after waiting for {time_until_next_retry:?}"), } - if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { + if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { info!("Switching to new connection candidate: {new_candidate:?}"); - walreceiver_state + connection_manager_state .change_connection(new_candidate, ctx) .await } @@ -314,25 +254,17 @@ const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0; const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. -struct WalreceiverState { +pub(super) struct ConnectionManagerState { id: TenantTimelineId, - /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, - /// The timeout on the connection to safekeeper for WAL streaming. - wal_connect_timeout: Duration, - /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. - lagging_wal_timeout: Duration, - /// The Lsn lag to use to determine when the current connection is lagging to much behind and reconnect to the other one. - max_lsn_wal_lag: NonZeroU64, + conf: WalReceiverConf, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, /// Info about retries and unsuccessful attempts to connect to safekeepers. wal_connection_retries: HashMap, /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id. wal_stream_candidates: HashMap, - auth_token: Option>, - availability_zone: Option, } /// Current connection data. 
@@ -375,15 +307,8 @@ struct BrokerSkTimeline { latest_update: NaiveDateTime, } -impl WalreceiverState { - fn new( - timeline: Arc, - wal_connect_timeout: Duration, - lagging_wal_timeout: Duration, - max_lsn_wal_lag: NonZeroU64, - auth_token: Option>, - availability_zone: Option, - ) -> Self { +impl ConnectionManagerState { + pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_id, timeline_id: timeline.timeline_id, @@ -391,14 +316,10 @@ impl WalreceiverState { Self { id, timeline, - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, + conf, wal_connection: None, wal_stream_candidates: HashMap::new(), wal_connection_retries: HashMap::new(), - auth_token, - availability_zone, } } @@ -407,7 +328,7 @@ impl WalreceiverState { self.drop_old_connection(true).await; let id = self.id; - let connect_timeout = self.wal_connect_timeout; + let connect_timeout = self.conf.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -563,7 +484,7 @@ impl WalreceiverState { (now - existing_wal_connection.status.latest_connection_update).to_std() { // Drop connection if we haven't received keepalive message for a while. - if latest_interaciton > self.wal_connect_timeout { + if latest_interaciton > self.conf.wal_connect_timeout { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, @@ -573,7 +494,7 @@ impl WalreceiverState { existing_wal_connection.status.latest_connection_update, ), check_time: now, - threshold: self.wal_connect_timeout, + threshold: self.conf.wal_connect_timeout, }, }); } @@ -589,7 +510,7 @@ impl WalreceiverState { // Check if the new candidate has much more WAL than the current one. 
match new_commit_lsn.0.checked_sub(current_commit_lsn.0) { Some(new_sk_lsn_advantage) => { - if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() { + if new_sk_lsn_advantage >= self.conf.max_lsn_wal_lag.get() { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, @@ -597,16 +518,16 @@ impl WalreceiverState { reason: ReconnectReason::LaggingWal { current_commit_lsn, new_commit_lsn, - threshold: self.max_lsn_wal_lag, + threshold: self.conf.max_lsn_wal_lag, }, }); } // If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver, // and the current one is not, switch to the new one. - if self.availability_zone.is_some() + if self.conf.availability_zone.is_some() && existing_wal_connection.availability_zone - != self.availability_zone - && self.availability_zone == new_availability_zone + != self.conf.availability_zone + && self.conf.availability_zone == new_availability_zone { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, @@ -677,7 +598,7 @@ impl WalreceiverState { if let Some(waiting_for_new_lsn_since) = waiting_for_new_lsn_since { if let Ok(waiting_for_new_wal) = (now - waiting_for_new_lsn_since).to_std() { if candidate_commit_lsn > current_commit_lsn - && waiting_for_new_wal > self.lagging_wal_timeout + && waiting_for_new_wal > self.conf.lagging_wal_timeout { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, @@ -691,7 +612,7 @@ impl WalreceiverState { existing_wal_connection.status.latest_wal_update, ), check_time: now, - threshold: self.lagging_wal_timeout, + threshold: self.conf.lagging_wal_timeout, }, }); } @@ -757,11 +678,11 @@ impl WalreceiverState { match wal_stream_connection_config( self.id, info.safekeeper_connstr.as_ref(), - match &self.auth_token { + match &self.conf.auth_token { None => None, Some(x) => Some(x), }, - self.availability_zone.as_deref(), + self.conf.availability_zone.as_deref(), ) { Ok(connstr) => 
Some((*sk_id, info, connstr)), Err(e) => { @@ -775,7 +696,7 @@ impl WalreceiverState { /// Remove candidates which haven't sent broker updates for a while. fn cleanup_old_candidates(&mut self) { let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); - let lagging_wal_timeout = self.lagging_wal_timeout; + let lagging_wal_timeout = self.conf.lagging_wal_timeout; self.wal_stream_candidates.retain(|node_id, broker_info| { if let Ok(time_since_latest_broker_update) = @@ -799,7 +720,7 @@ impl WalreceiverState { } } - async fn shutdown(mut self) { + pub(super) async fn shutdown(mut self) { if let Some(wal_connection) = self.wal_connection.take() { wal_connection.connection_task.shutdown().await; } @@ -903,7 +824,7 @@ mod tests { let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); - let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let lagging_wal_timeout = chrono::Duration::from_std(state.conf.lagging_wal_timeout)?; let delay_over_threshold = now - lagging_wal_timeout - lagging_wal_timeout; state.wal_connection = None; @@ -914,7 +835,7 @@ mod tests { ( NodeId(3), dummy_broker_sk_timeline( - 1 + state.max_lsn_wal_lag.get(), + 1 + state.conf.max_lsn_wal_lag.get(), "delay_over_threshold", delay_over_threshold, ), @@ -948,7 +869,7 @@ mod tests { streaming_lsn: Some(Lsn(current_lsn)), }; - state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); + state.conf.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, @@ -966,7 +887,7 @@ mod tests { ( connected_sk_id, dummy_broker_sk_timeline( - current_lsn + state.max_lsn_wal_lag.get() * 2, + current_lsn + state.conf.max_lsn_wal_lag.get() * 2, DUMMY_SAFEKEEPER_HOST, now, ), @@ -978,7 +899,7 @@ mod tests { ( NodeId(2), dummy_broker_sk_timeline( - current_lsn + state.max_lsn_wal_lag.get() / 2, + current_lsn + state.conf.max_lsn_wal_lag.get() / 2, 
"not_enough_advanced_lsn", now, ), @@ -1003,7 +924,11 @@ mod tests { state.wal_connection = None; state.wal_stream_candidates = HashMap::from([( NodeId(0), - dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now), + dummy_broker_sk_timeline( + 1 + state.conf.max_lsn_wal_lag.get(), + DUMMY_SAFEKEEPER_HOST, + now, + ), )]); let only_candidate = state @@ -1101,7 +1026,7 @@ mod tests { let now = Utc::now().naive_utc(); let connected_sk_id = NodeId(0); - let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1); + let new_lsn = Lsn(current_lsn.0 + state.conf.max_lsn_wal_lag.get() + 1); let connection_status = WalConnectionStatus { is_connected: true, @@ -1146,7 +1071,7 @@ mod tests { ReconnectReason::LaggingWal { current_commit_lsn: current_lsn, new_commit_lsn: new_lsn, - threshold: state.max_lsn_wal_lag + threshold: state.conf.max_lsn_wal_lag }, "Should select bigger WAL safekeeper if it starts to lag enough" ); @@ -1165,7 +1090,7 @@ mod tests { let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); - let wal_connect_timeout = chrono::Duration::from_std(state.wal_connect_timeout)?; + let wal_connect_timeout = chrono::Duration::from_std(state.conf.wal_connect_timeout)?; let time_over_threshold = Utc::now().naive_utc() - wal_connect_timeout - wal_connect_timeout; @@ -1208,7 +1133,7 @@ mod tests { .. 
} => { assert_eq!(last_keep_alive, Some(time_over_threshold)); - assert_eq!(threshold, state.lagging_wal_timeout); + assert_eq!(threshold, state.conf.lagging_wal_timeout); } unexpected => panic!("Unexpected reason: {unexpected:?}"), } @@ -1228,7 +1153,7 @@ mod tests { let new_lsn = Lsn(100_100).align(); let now = Utc::now().naive_utc(); - let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let lagging_wal_timeout = chrono::Duration::from_std(state.conf.lagging_wal_timeout)?; let time_over_threshold = Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; @@ -1275,7 +1200,7 @@ mod tests { assert_eq!(current_commit_lsn, current_lsn); assert_eq!(candidate_commit_lsn, new_lsn); assert_eq!(last_wal_interaction, Some(time_over_threshold)); - assert_eq!(threshold, state.lagging_wal_timeout); + assert_eq!(threshold, state.conf.lagging_wal_timeout); } unexpected => panic!("Unexpected reason: {unexpected:?}"), } @@ -1289,27 +1214,29 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState { let (tenant, ctx) = harness.load().await; let timeline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) .expect("Failed to create an empty timeline for dummy wal connection manager"); let timeline = timeline.initialize(&ctx).unwrap(); - WalreceiverState { + ConnectionManagerState { id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, timeline, - wal_connect_timeout: Duration::from_secs(1), - lagging_wal_timeout: Duration::from_secs(1), - max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), + conf: WalReceiverConf { + wal_connect_timeout: Duration::from_secs(1), + lagging_wal_timeout: Duration::from_secs(1), + max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), + auth_token: None, + availability_zone: None, + 
}, wal_connection: None, wal_stream_candidates: HashMap::new(), wal_connection_retries: HashMap::new(), - auth_token: None, - availability_zone: None, } } @@ -1321,7 +1248,7 @@ mod tests { let harness = TenantHarness::create("switch_to_same_availability_zone")?; let mut state = dummy_state(&harness).await; - state.availability_zone = test_az.clone(); + state.conf.availability_zone = test_az.clone(); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index ea2f2392ea..d5099dc2a5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -42,7 +42,7 @@ use utils::lsn::Lsn; /// Status of the connection. #[derive(Debug, Clone, Copy)] -pub struct WalConnectionStatus { +pub(super) struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. pub is_connected: bool, /// Defines a healthy connection as one on which pageserver received WAL from safekeeper @@ -60,7 +60,7 @@ pub struct WalConnectionStatus { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. -pub async fn handle_walreceiver_connection( +pub(super) async fn handle_walreceiver_connection( timeline: Arc, wal_source_connconf: PgConnectionConfig, events_sender: watch::Sender>, From a64044a7a9a9f2b32a73b97da1fd230f9b510064 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 12 Apr 2023 15:32:38 +0300 Subject: [PATCH 266/426] Update most of the dependencies to their latest versions (#3991) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All non-trivial updates extracted into separate commits, also `cargo hakari` data and its manifest format were updated.
3 sets of crates remain unupdated: * `base64` — touches proxy in a lot of places and changed its api (by 0.21 version) quite strongly since our version (0.13). * `opentelemetry` and `opentelemetry-*` crates ``` error[E0308]: mismatched types --> libs/tracing-utils/src/http.rs:65:21 | 65 | span.set_parent(parent_ctx); | ---------- ^^^^^^^^^^ expected struct `opentelemetry_api::context::Context`, found struct `opentelemetry::Context` | | | arguments to this method are incorrect | = note: struct `opentelemetry::Context` and struct `opentelemetry_api::context::Context` have similar names, but are actually distinct types note: struct `opentelemetry::Context` is defined in crate `opentelemetry_api` --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/opentelemetry_api-0.19.0/src/context.rs:77:1 | 77 | pub struct Context { | ^^^^^^^^^^^^^^^^^^ note: struct `opentelemetry_api::context::Context` is defined in crate `opentelemetry_api` --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/opentelemetry_api-0.18.0/src/context.rs:77:1 | 77 | pub struct Context { | ^^^^^^^^^^^^^^^^^^ = note: perhaps two different versions of crate `opentelemetry_api` are being used? note: associated function defined here --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-opentelemetry-0.18.0/src/span_ext.rs:43:8 | 43 | fn set_parent(&self, cx: Context); | ^^^^^^^^^^ For more information about this error, try `rustc --explain E0308`. error: could not compile `tracing-utils` due to previous error warning: build failed, waiting for other jobs to finish... error: could not compile `tracing-utils` due to previous error ``` `tracing-opentelemetry` of version `0.19` is not yet released, that is supposed to have the update we need. 
* similarly, `rustls`, `tokio-rustls`, `rustls-*` and `tls-listener` crates have similar issue: ``` error[E0308]: mismatched types --> libs/postgres_backend/tests/simple_select.rs:112:78 | 112 | let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); | --------------------------------------------- ^^^^^^^^^^ expected struct `rustls::client::client_conn::ClientConfig`, found struct `ClientConfig` | | | arguments to this function are incorrect | = note: struct `ClientConfig` and struct `rustls::client::client_conn::ClientConfig` have similar names, but are actually distinct types note: struct `ClientConfig` is defined in crate `rustls` --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/rustls-0.21.0/src/client/client_conn.rs:125:1 | 125 | pub struct ClientConfig { | ^^^^^^^^^^^^^^^^^^^^^^^ note: struct `rustls::client::client_conn::ClientConfig` is defined in crate `rustls` --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/rustls-0.20.8/src/client/client_conn.rs:91:1 | 91 | pub struct ClientConfig { | ^^^^^^^^^^^^^^^^^^^^^^^ = note: perhaps two different versions of crate `rustls` are being used? note: associated function defined here --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-postgres-rustls-0.9.0/src/lib.rs:23:12 | 23 | pub fn new(config: ClientConfig) -> Self { | ^^^ For more information about this error, try `rustc --explain E0308`. error: could not compile `postgres_backend` due to previous error warning: build failed, waiting for other jobs to finish... ``` * aws crates: I could not make new API to work with bucket endpoint overload, and console e2e tests failed. 
Our other tests passed; further investigation is worth doing in https://github.com/neondatabase/neon/issues/4008 --- .config/hakari.toml | 2 +- Cargo.lock | 1410 +++++++++++------ Cargo.toml | 26 +- libs/consumption_metrics/Cargo.toml | 17 +- libs/postgres_ffi/build.rs | 6 +- libs/remote_storage/tests/pagination_tests.rs | 7 +- libs/tracing-utils/Cargo.toml | 3 +- libs/utils/Cargo.toml | 2 +- pageserver/src/config.rs | 22 +- pageserver/src/page_service.rs | 2 +- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/config.rs | 4 +- .../tenant/remote_timeline_client/upload.rs | 2 +- storage_broker/src/bin/storage_broker.rs | 3 +- trace/Cargo.toml | 2 - workspace_hack/Cargo.toml | 9 +- 16 files changed, 958 insertions(+), 561 deletions(-) diff --git a/.config/hakari.toml b/.config/hakari.toml index 12d2d1bf9c..15b939e86f 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -4,7 +4,7 @@ hakari-package = "workspace_hack" # Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above. -dep-format-version = "3" +dep-format-version = "4" # Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended. # Hakari works much better with the new feature resolver.
diff --git a/Cargo.lock b/Cargo.lock index 668487a9bd..8dde4ebb57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -64,28 +64,68 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] -name = "anyhow" -version = "1.0.68" +name = "anstream" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61" +checksum = "342258dd14006105c2b75ab1bd7543a03bdf0cfc94383303ac212a04939dff6f" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-wincon", + "concolor-override", + "concolor-query", + "is-terminal", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23ea9e81bd02e310c216d080f6223c179012256e5151c41db88d12c88a1684d2" + +[[package]] +name = "anstyle-parse" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7d1bb534e9efed14f3e5f44e7dd1a4f709384023a4165199a4241e18dff0116" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-wincon" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3127af6145b149f3287bb9a0d10ad9c5692dba8c53ad48285e5bec4063834fa" +dependencies = [ + "anstyle", + "windows-sys 0.45.0", +] + +[[package]] +name = "anyhow" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" dependencies = [ "backtrace", ] [[package]] name = "archery" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +checksum = "b6cd774058b1b415c4855d8b86436c04bf050c003156fe24bc326fb3fe75c343" dependencies = [ "static_assertions", ] [[package]] 
name = "asn1-rs" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf6690c370453db30743b373a60ba498fc0d6d83b11f4abfd87a84a075db5dd4" +checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -105,7 +145,7 @@ checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "synstructure", ] @@ -117,46 +157,47 @@ checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] name = "async-stream" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" dependencies = [ "async-stream-impl", "futures-core", + "pin-project-lite", ] [[package]] name = "async-stream-impl" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.14", ] [[package]] name = "async-trait" -version = "0.1.64" +version = "0.1.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2" +checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.14", ] [[package]] name = "atomic-polyfill" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d299f547288d6db8d5c3a2916f7b2f66134b15b8c1ac1c4357dd3b8752af7bb2" +checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" dependencies = [ "critical-section", ] @@ -187,13 +228,13 @@ dependencies = [ "aws-http", "aws-sdk-sso", "aws-sdk-sts", - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-http-tower", + "aws-smithy-async 0.51.0", + "aws-smithy-client 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", "aws-smithy-json", - "aws-smithy-types", - "aws-types", + "aws-smithy-types 0.51.0", + "aws-types 0.51.0", "bytes", "hex", "http", @@ -206,15 +247,29 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-credential-types" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77e37e62f59cf3284067337da7467d842df8cfe3f5e5c06487ac7521819cf16d" +dependencies = [ + "aws-smithy-async 0.55.1", + "aws-smithy-types 0.55.1", + "fastrand", + "tokio", + "tracing", + "zeroize", +] + [[package]] name = "aws-endpoint" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ca8f374874f6459aaa88dc861d7f5d834ca1ff97668eae190e97266b5f6c3fb" dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "aws-types", + "aws-smithy-http 0.51.0", + "aws-smithy-types 0.51.0", + "aws-types 0.51.0", "http", "regex", "tracing", @@ -226,9 +281,9 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78d41e19e779b73463f5f0c21b3aacc995f4ba783ab13a7ae9f5dfb159a551b4" dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "aws-types", + "aws-smithy-http 0.51.0", + "aws-smithy-types 0.51.0", + "aws-types 0.51.0", "bytes", "http", "http-body", @@ -248,15 +303,15 @@ dependencies = [ "aws-http", "aws-sig-auth", "aws-sigv4", - "aws-smithy-async", + "aws-smithy-async 0.51.0", "aws-smithy-checksums", - "aws-smithy-client", + "aws-smithy-client 0.51.0", "aws-smithy-eventstream", - 
"aws-smithy-http", - "aws-smithy-http-tower", - "aws-smithy-types", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", + "aws-smithy-types 0.51.0", "aws-smithy-xml", - "aws-types", + "aws-types 0.51.0", "bytes", "bytes-utils", "http", @@ -275,13 +330,13 @@ dependencies = [ "aws-endpoint", "aws-http", "aws-sig-auth", - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-http-tower", + "aws-smithy-async 0.51.0", + "aws-smithy-client 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", "aws-smithy-json", - "aws-smithy-types", - "aws-types", + "aws-smithy-types 0.51.0", + "aws-types 0.51.0", "bytes", "http", "tokio-stream", @@ -297,14 +352,14 @@ dependencies = [ "aws-endpoint", "aws-http", "aws-sig-auth", - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-http-tower", + "aws-smithy-async 0.51.0", + "aws-smithy-client 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", "aws-smithy-query", - "aws-smithy-types", + "aws-smithy-types 0.51.0", "aws-smithy-xml", - "aws-types", + "aws-types 0.51.0", "bytes", "http", "tower", @@ -318,20 +373,20 @@ checksum = "12cbe7b2be9e185c1fbce27fc9c41c66b195b32d89aa099f98768d9544221308" dependencies = [ "aws-sigv4", "aws-smithy-eventstream", - "aws-smithy-http", - "aws-types", + "aws-smithy-http 0.51.0", + "aws-types 0.51.0", "http", "tracing", ] [[package]] name = "aws-sigv4" -version = "0.51.0" +version = "0.51.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ff4cff8c4a101962d593ba94e72cd83891aecd423f0c6e3146bff6fb92c9e3" +checksum = "5c0b2658d2cb66dbf02f0e8dee80810ef1e0ca3530ede463e0ef994c301087d1" dependencies = [ "aws-smithy-eventstream", - "aws-smithy-http", + "aws-smithy-http 0.51.0", "bytes", "form_urlencoded", "hex", @@ -356,14 +411,26 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "aws-smithy-async" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "88573bcfbe1dcfd54d4912846df028b42d6255cbf9ce07be216b1bbfd11fc4b9" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", + "tokio-stream", +] + [[package]] name = "aws-smithy-checksums" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc227e36e346f45298288359f37123e1a92628d1cec6b11b5eb335553278bd9e" dependencies = [ - "aws-smithy-http", - "aws-smithy-types", + "aws-smithy-http 0.51.0", + "aws-smithy-types 0.51.0", "bytes", "crc32c", "crc32fast", @@ -383,10 +450,10 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff28d553714f8f54cd921227934fc13a536a1c03f106e56b362fd57e16d450ad" dependencies = [ - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-http-tower", - "aws-smithy-types", + "aws-smithy-async 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", + "aws-smithy-types 0.51.0", "bytes", "fastrand", "http", @@ -400,13 +467,33 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-smithy-client" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2f52352bae50d3337d5d6151b695d31a8c10ebea113eca5bead531f8301b067" +dependencies = [ + "aws-smithy-async 0.55.1", + "aws-smithy-http 0.55.1", + "aws-smithy-http-tower 0.55.1", + "aws-smithy-types 0.55.1", + "bytes", + "fastrand", + "http", + "http-body", + "pin-project-lite", + "tokio", + "tower", + "tracing", +] + [[package]] name = "aws-smithy-eventstream" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7ea0df7161ce65b5c8ca6eb709a1a907376fa18226976e41c748ce02ccccf24" dependencies = [ - "aws-smithy-types", + "aws-smithy-types 0.51.0", "bytes", "crc32fast", ] @@ -418,7 +505,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf58ed4fefa61dbf038e5421a521cbc2c448ef69deff0ab1d915d8a10eda5664" dependencies = [ "aws-smithy-eventstream", - "aws-smithy-types", 
+ "aws-smithy-types 0.51.0", "bytes", "bytes-utils", "futures-core", @@ -434,13 +521,49 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-smithy-http" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03bcc02d7ed9649d855c8ce4a735e9848d7b8f7568aad0504c158e3baa955df8" +dependencies = [ + "aws-smithy-types 0.55.1", + "bytes", + "bytes-utils", + "futures-core", + "http", + "http-body", + "hyper", + "once_cell", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + [[package]] name = "aws-smithy-http-tower" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20c96d7bd35e7cf96aca1134b2f81b1b59ffe493f7c6539c051791cbbf7a42d3" dependencies = [ - "aws-smithy-http", + "aws-smithy-http 0.51.0", + "bytes", + "http", + "http-body", + "pin-project-lite", + "tower", + "tracing", +] + +[[package]] +name = "aws-smithy-http-tower" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da88b3a860f65505996c29192d800f1aeb9480440f56d63aad33a3c12045017a" +dependencies = [ + "aws-smithy-http 0.55.1", + "aws-smithy-types 0.55.1", "bytes", "http", "http-body", @@ -455,7 +578,7 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8324ba98c8a94187723cc16c37aefa09504646ee65c3d2c3af495bab5ea701b" dependencies = [ - "aws-smithy-types", + "aws-smithy-types 0.51.0", ] [[package]] @@ -464,7 +587,7 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83834ed2ff69ea6f6657baf205267dc2c0abe940703503a3e5d60ce23be3d306" dependencies = [ - "aws-smithy-types", + "aws-smithy-types 0.51.0", "urlencoding", ] @@ -480,6 +603,19 @@ dependencies = [ "time", ] +[[package]] +name = "aws-smithy-types" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cd0afc731fd1417d791f9145a1e0c30e23ae0beaab9b4814017708ead2fc20f1" +dependencies = [ + "base64-simd", + "itoa", + "num-integer", + "ryu", + "time", +] + [[package]] name = "aws-smithy-xml" version = "0.51.0" @@ -495,10 +631,10 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05701d32da168b44f7ee63147781aed8723e792cc131cb9b18363b5393f17f70" dependencies = [ - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-types", + "aws-smithy-async 0.51.0", + "aws-smithy-client 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-types 0.51.0", "http", "rustc_version", "tracing", @@ -506,10 +642,26 @@ dependencies = [ ] [[package]] -name = "axum" -version = "0.6.4" +name = "aws-types" +version = "0.55.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5694b64066a2459918d8074c2ce0d5a88f409431994c2356617c8ae0c4721fc" +checksum = "81fb02591b5075d318e0083dcb76df0e151db4ce48f987ecd00e5b53c7a6ba59" +dependencies = [ + "aws-credential-types", + "aws-smithy-async 0.55.1", + "aws-smithy-client 0.55.1", + "aws-smithy-http 0.55.1", + "aws-smithy-types 0.55.1", + "http", + "rustc_version", + "tracing", +] + +[[package]] +name = "axum" +version = "0.6.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62" dependencies = [ "async-trait", "axum-core", @@ -529,16 +681,15 @@ dependencies = [ "serde", "sync_wrapper", "tower", - "tower-http", "tower-layer", "tower-service", ] [[package]] name = "axum-core" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cae3e661676ffbacb30f1a824089a8c9150e71017f7e1e38f2aa32009188d34" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" dependencies = [ "async-trait", "bytes", @@ -584,6 +735,16 @@ version = "0.21.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + [[package]] name = "bincode" version = "1.3.3" @@ -595,9 +756,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.61.0" +version = "0.65.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a" +checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" dependencies = [ "bitflags", "cexpr", @@ -606,12 +767,13 @@ dependencies = [ "lazycell", "log", "peeking_take_while", + "prettyplease 0.2.4", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn", + "syn 2.0.14", "which", ] @@ -623,18 +785,18 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "block-buffer" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] [[package]] name = "bstr" -version = "1.2.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f0778972c64420fdedc63f09919c8a88bda7b25135357fd25a5d9f3257e832" +checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" dependencies = [ "memchr", "once_cell", @@ -702,9 +864,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.23" +version = "0.4.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" dependencies = [ "iana-time-zone", "num-integer", @@ -742,9 +904,9 @@ dependencies = [ [[package]] name = "clang-sys" -version = "1.4.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" dependencies = [ "glob", "libc", @@ -765,30 +927,38 @@ dependencies = [ [[package]] name = "clap" -version = "4.1.4" +version = "4.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76" +checksum = "046ae530c528f252094e4a77886ee1374437744b2bff1497aa898bbddbbb29b3" dependencies = [ - "bitflags", + "clap_builder", "clap_derive", - "clap_lex 0.3.1", - "is-terminal", "once_cell", +] + +[[package]] +name = "clap_builder" +version = "4.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "223163f58c9a40c3b0a43e1c4b50a9ce09f007ea2cb1ec258a687945b4b7929f" +dependencies = [ + "anstream", + "anstyle", + "bitflags", + "clap_lex 0.4.1", "strsim", - "termcolor", ] [[package]] name = "clap_derive" -version = "4.1.0" +version = "4.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" +checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" dependencies = [ "heck", - "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 2.0.14", ] [[package]] @@ -802,12 +972,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.1" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" -dependencies = [ - "os_str_bytes", 
-] +checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" [[package]] name = "close_fds" @@ -859,7 +1026,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.1.4", + "clap 4.2.1", "compute_api", "futures", "hyper", @@ -883,6 +1050,21 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "concolor-override" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a855d4a1978dc52fb0536a04d384c2c0c1aa273597f08b77c8c4d3b2eec6037f" + +[[package]] +name = "concolor-query" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d11d52c3d7ca2e6d0040212be9e4dbbcd78b6447f535b6b561f449427944cf" +dependencies = [ + "windows-sys 0.45.0", +] + [[package]] name = "const_format" version = "0.2.30" @@ -921,7 +1103,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.1", "comfy-table", "git-version", "nix", @@ -957,15 +1139,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" dependencies = [ "libc", ] @@ -1032,9 +1214,9 @@ checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" [[package]] name = "crossbeam-channel" -version = "0.5.6" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +checksum = 
"a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1042,9 +1224,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -1053,22 +1235,22 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.13" +version = "0.9.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" +checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.7.1", + "memoffset 0.8.0", "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.14" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" +checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" dependencies = [ "cfg-if", ] @@ -1110,9 +1292,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9" +checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" dependencies = [ "cc", "cxxbridge-flags", @@ -1122,9 +1304,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d" +checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" dependencies = [ 
"cc", "codespan-reporting", @@ -1132,31 +1314,31 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn", + "syn 2.0.14", ] [[package]] name = "cxxbridge-flags" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a" +checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" [[package]] name = "cxxbridge-macro" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2" +checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.14", ] [[package]] name = "darling" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" dependencies = [ "darling_core", "darling_macro", @@ -1164,27 +1346,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn", + "syn 1.0.109", ] [[package]] name = "darling_macro" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" dependencies = [ "darling_core", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1218,9 +1400,9 @@ dependencies = [ 
[[package]] name = "der-parser" -version = "8.1.0" +version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d4bc9b0db0a0df9ae64634ac5bdefb7afcb534e182275ca0beadbe486701c1" +checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e" dependencies = [ "asn1-rs", "displaydoc", @@ -1249,7 +1431,7 @@ checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1269,9 +1451,9 @@ dependencies = [ [[package]] name = "enum-map" -version = "2.4.2" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c25992259941eb7e57b936157961b217a4fc8597829ddef0596d6c3cd86e1a" +checksum = "988f0d17a0fa38291e5f41f71ea8d46a5d5497b9054d5a759fae2cbb819f2356" dependencies = [ "enum-map-derive", ] @@ -1284,7 +1466,7 @@ checksum = "2a4da76b3b6116d758c7ba93f7ec6a35d2e2cf24feda76c6e38a375f4d5c59f2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1305,7 +1487,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1323,13 +1505,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.2.8" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -1361,23 +1543,23 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" 
dependencies = [ "instant", ] [[package]] name = "filetime" -version = "0.2.19" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e884668cd0c7480504233e951174ddc3b382f7c2666e3b7310b5c4e7b0c37f9" +checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" dependencies = [ "cfg-if", "libc", - "redox_syscall", - "windows-sys 0.42.0", + "redox_syscall 0.2.16", + "windows-sys 0.48.0", ] [[package]] @@ -1422,9 +1604,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" dependencies = [ "futures-channel", "futures-core", @@ -1437,9 +1619,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", "futures-sink", @@ -1447,15 +1629,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" [[package]] name = "futures-executor" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" dependencies = [ "futures-core", "futures-task", @@ -1464,32 +1646,32 @@ dependencies = [ [[package]] name = "futures-io" 
-version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" [[package]] name = "futures-macro" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.14", ] [[package]] name = "futures-sink" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" [[package]] name = "futures-task" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" [[package]] name = "futures-timer" @@ -1499,9 +1681,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-channel", "futures-core", @@ -1517,9 +1699,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.6" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +checksum = 
"85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -1527,20 +1709,22 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] name = "gimli" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "221996f774192f0f718773def8201c4ae31f02616a54ccfc2d358bb0e5cefdec" +checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" [[package]] name = "git-version" @@ -1561,7 +1745,7 @@ dependencies = [ "proc-macro-hack", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1572,9 +1756,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.15" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" +checksum = "5be7b54589b581f624f566bf5d8eb2bab1db736c51528720b6bd36b96b55924d" dependencies = [ "bytes", "fnv", @@ -1639,7 +1823,7 @@ dependencies = [ "atomic-polyfill", "hash32", "rustc_version", - "spin 0.9.4", + "spin 0.9.8", "stable_deref_trait", ] @@ -1667,6 +1851,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + [[package]] name = "hex" version = "0.4.3" @@ -1678,9 +1868,9 @@ dependencies = [ [[package]] name = "hex-literal" -version = "0.3.4" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" +checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" [[package]] name = "hmac" @@ -1704,9 +1894,9 @@ dependencies = [ [[package]] name = "http" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" +checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" dependencies = [ "bytes", "fnv", @@ -1724,12 +1914,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "http-range-header" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" - [[package]] name = "httparse" version = "1.8.0" @@ -1760,9 +1944,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.23" +version = "0.14.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" +checksum = "cc5e554ff619822309ffd57d8734d77cd5ce6238bc956f037ea06c58238c9899" dependencies = [ "bytes", "futures-channel", @@ -1775,7 +1959,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2", + "socket2 0.4.9", "tokio", "tower-service", "tracing", @@ -1791,10 +1975,10 @@ dependencies = [ "http", "hyper", "log", - "rustls", + "rustls 0.20.8", "rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] @@ -1824,16 +2008,16 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.53" +version = "0.1.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" dependencies = [ "android_system_properties", "core-foundation-sys", 
"iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "winapi", + "windows", ] [[package]] @@ -1864,9 +2048,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.2" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown 0.12.3", @@ -1904,30 +2088,31 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.4" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" +checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" dependencies = [ + "hermit-abi 0.3.1", "libc", - "windows-sys 0.42.0", + "windows-sys 0.48.0", ] [[package]] name = "ipnet" -version = "2.7.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" +checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" [[package]] name = "is-terminal" -version = "0.4.2" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi 0.3.1", "io-lifetimes", - "rustix", - "windows-sys 0.42.0", + "rustix 0.37.11", + "windows-sys 0.48.0", ] [[package]] @@ -1941,9 +2126,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" +checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = 
"js-sys" @@ -1956,11 +2141,11 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.2.0" +version = "8.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" +checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" dependencies = [ - "base64 0.13.1", + "base64 0.21.0", "pem", "ring", "serde", @@ -2002,9 +2187,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" [[package]] name = "libloading" @@ -2031,6 +2216,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +[[package]] +name = "linux-raw-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f" + [[package]] name = "lock_api" version = "0.4.9" @@ -2123,9 +2314,9 @@ dependencies = [ [[package]] name = "mime" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "mime_guess" @@ -2145,23 +2336,23 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.6.4" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2e212582ede878b109755efd0773a4f0f4ec851584cf0aefbeb4d9ecc114822" +checksum = 
"b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" +checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" dependencies = [ "libc", "log", "wasi", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -2194,15 +2385,6 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "nom8" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae01545c9c7fc4486ab7debaf2aad7003ac19431791868fb2e8066df97fad2f8" -dependencies = [ - "memchr", -] - [[package]] name = "notify" version = "5.1.0" @@ -2291,9 +2473,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.0" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "oorandom" @@ -2358,8 +2540,8 @@ dependencies = [ "futures-util", "opentelemetry", "prost", - "tonic", - "tonic-build", + "tonic 0.8.3", + "tonic-build 0.8.4", ] [[package]] @@ -2411,9 +2593,9 @@ dependencies = [ [[package]] name = "os_info" -version = "3.6.0" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c424bc68d15e0778838ac013b5b3449544d8133633d8016319e7e05a820b8c0" +checksum = "006e42d5b888366f1880eda20371fedde764ed2213dc8496f49622fa0c99cd5e" dependencies = [ "log", "serde", @@ -2422,9 +2604,15 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.4.1" +version = "6.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +checksum = 
"ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" + +[[package]] +name = "outref" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" [[package]] name = "overload" @@ -2442,7 +2630,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.1", "close_fds", "const_format", "consumption_metrics", @@ -2539,7 +2727,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", "windows-sys 0.45.0", ] @@ -2567,9 +2755,9 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "petgraph" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" +checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" dependencies = [ "fixedbitset", "indexmap", @@ -2610,7 +2798,7 @@ checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -2704,14 +2892,14 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls", + "rustls 0.20.8", "rustls-pemfile", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.23.4", "tracing", "workspace_hack", ] @@ -2777,36 +2965,22 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.23" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e97e3215779627f01ee256d2fad52f3d95e8e1c11e9fc6fd08f7cd455d5d5c78" +checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" dependencies = [ "proc-macro2", - "syn", + "syn 1.0.109", ] [[package]] -name = "proc-macro-error" -version 
= "1.0.4" +name = "prettyplease" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" dependencies = [ "proc-macro2", - "quote", - "version_check", + "syn 2.0.14", ] [[package]] @@ -2817,9 +2991,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.50" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" +checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" dependencies = [ "unicode-ident", ] @@ -2834,7 +3008,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix", + "rustix 0.36.12", ] [[package]] @@ -2855,9 +3029,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21dc42e00223fc37204bd4aa177e69420c604ca4a183209a8f9de30c6d934698" +checksum = "e48e50df39172a3e7eb17e14642445da64996989bc212b583015435d39a58537" dependencies = [ "bytes", "prost-derive", @@ -2865,9 +3039,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f8ad728fb08fe212df3c05169e940fbb6d9d16a877ddde14644a983ba2012e" +checksum = "2c828f93f5ca4826f97fedcbd3f9a536c16b12cff3dbbb4a007f932bbad95b12" dependencies = [ "bytes", "heck", @@ -2876,35 +3050,34 @@ 
dependencies = [ "log", "multimap", "petgraph", - "prettyplease", + "prettyplease 0.1.25", "prost", "prost-types", "regex", - "syn", + "syn 1.0.109", "tempfile", "which", ] [[package]] name = "prost-derive" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bda8c0881ea9f722eb9629376db3d0b903b462477c1aafcb0566610ac28ac5d" +checksum = "4ea9b0f8cbe5e15a8a042d030bd96668db28ecb567ec37d691971ff5731d2b1b" dependencies = [ "anyhow", "itertools", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] name = "prost-types" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e0526209433e96d83d750dd81a99118edbc55739e7e61a46764fd2ad537788" +checksum = "379119666929a1afd7a043aa6cf96fa67a6dce9af60c88095a4686dbce4c9c88" dependencies = [ - "bytes", "prost", ] @@ -2919,7 +3092,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.1", "consumption_metrics", "futures", "git-version", @@ -2949,20 +3122,20 @@ dependencies = [ "reqwest-tracing", "routerify", "rstest", - "rustls", + "rustls 0.20.8", "rustls-pemfile", "scopeguard", "serde", "serde_json", "sha2", - "socket2", + "socket2 0.5.2", "sync_wrapper", "thiserror", "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.23.4", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -2970,16 +3143,16 @@ dependencies = [ "url", "utils", "uuid", - "webpki-roots", + "webpki-roots 0.23.0", "workspace_hack", "x509-parser", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" dependencies = [ "proc-macro2", ] @@ -3016,9 +3189,9 @@ dependencies = [ [[package]] name = "rayon" -version = 
"1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" dependencies = [ "either", "rayon-core", @@ -3026,9 +3199,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.10.2" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ -3058,10 +3231,19 @@ dependencies = [ ] [[package]] -name = "regex" -version = "1.7.1" +name = "redox_syscall" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" dependencies = [ "aho-corasick", "memchr", @@ -3079,9 +3261,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "remote_storage" @@ -3091,8 +3273,8 @@ dependencies = [ "async-trait", "aws-config", "aws-sdk-s3", - "aws-smithy-http", - "aws-types", + "aws-smithy-http 0.51.0", + "aws-types 0.55.0", "hyper", "metrics", "once_cell", @@ -3111,9 +3293,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.14" +version = "0.11.16" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21eed90ec8570952d53b772ecf8f206aa1ec9a3d76b2521c56c42973f2d91ee9" +checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" dependencies = [ "base64 0.21.0", "bytes", @@ -3133,27 +3315,27 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls", + "rustls 0.20.8", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "webpki-roots", + "webpki-roots 0.22.6", "winreg", ] [[package]] name = "reqwest-middleware" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894" +checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b" dependencies = [ "anyhow", "async-trait", @@ -3166,11 +3348,12 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce" +checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8" dependencies = [ "async-trait", + "getrandom", "opentelemetry", "reqwest", "reqwest-middleware", @@ -3209,18 +3392,18 @@ dependencies = [ [[package]] name = "rpds" -version = "0.12.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" +checksum = "9bd6ce569b15c331b1e5fd8cf6adb0bf240678b5f0cdc4d0f41e11683f6feba9" dependencies = [ "archery", ] [[package]] name = "rstest" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07f2d176c472198ec1e6551dc7da28f1c089652f66a7b722676c2238ebc0edf" +checksum = 
"de1bb486a691878cd320c2f0d319ba91eeaa2e894066d8b5f8f117c000e9d962" dependencies = [ "futures", "futures-timer", @@ -3230,23 +3413,23 @@ dependencies = [ [[package]] name = "rstest_macros" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7229b505ae0706e64f37ffc54a9c163e11022a6636d58fe1f3f52018257ff9f7" +checksum = "290ca1a1c8ca7edb7c3283bd44dc35dd54fdec6253a3912e201ba1072018fca8" dependencies = [ "cfg-if", "proc-macro2", "quote", "rustc_version", - "syn", + "syn 1.0.109", "unicode-ident", ] [[package]] name = "rustc-demangle" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" +checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b" [[package]] name = "rustc-hash" @@ -3274,16 +3457,30 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.7" +version = "0.36.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" +checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", - "linux-raw-sys", - "windows-sys 0.42.0", + "linux-raw-sys 0.1.4", + "windows-sys 0.45.0", +] + +[[package]] +name = "rustix" +version = "0.37.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys 0.3.1", + "windows-sys 0.48.0", ] [[package]] @@ -3298,6 +3495,18 @@ dependencies = [ "webpki", ] +[[package]] +name = "rustls" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d" +dependencies = [ + "log", + 
"ring", + "rustls-webpki", + "sct", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -3320,16 +3529,26 @@ dependencies = [ ] [[package]] -name = "rustversion" -version = "1.0.11" +name = "rustls-webpki" +version = "0.100.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70" +checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" [[package]] name = "ryu" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" +checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" [[package]] name = "safekeeper" @@ -3341,7 +3560,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.1", "const_format", "crc32c", "fs2", @@ -3414,9 +3633,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "scratch" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" +checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" [[package]] name = "sct" @@ -3453,33 +3672,33 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" +checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.29.2" +version = "0.30.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6097dc270a9c4555c5d6222ed243eaa97ff38e29299ed7c5cb36099033c604e" +checksum = "b5ce6d3512e2617c209ec1e86b0ca2fea06454cd34653c91092bf0f3ec41f8e3" dependencies = [ "httpdate", "reqwest", - "rustls", + "rustls 0.20.8", "sentry-backtrace", "sentry-contexts", "sentry-core", "sentry-panic", "tokio", "ureq", - "webpki-roots", + "webpki-roots 0.22.6", ] [[package]] name = "sentry-backtrace" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d92d1e4d591534ae4f872d6142f3b500f4ffc179a6aed8a3e86c7cc96d10a6a" +checksum = "0e7fe408d4d1f8de188a9309916e02e129cbe51ca19e55badea5a64899399b1a" dependencies = [ "backtrace", "once_cell", @@ -3489,9 +3708,9 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3afa877b1898ff67dd9878cf4bec4e53cef7d3be9f14b1fc9e4fcdf36f8e4259" +checksum = "5695096a059a89973ec541062d331ff4c9aeef9c2951416c894f0fff76340e7d" dependencies = [ "hostname", "libc", @@ -3503,9 +3722,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc43eb7e4e3a444151a0fe8a0e9ce60eabd905dae33d66e257fa26f1b509c1bd" +checksum = "5b22828bfd118a7b660cf7a155002a494755c0424cebb7061e4743ecde9c7dbc" dependencies = [ "once_cell", "rand", @@ -3516,9 +3735,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccab4fab11e3e63c45f4524bee2e75cde39cdf164cb0b0cbe6ccd1948ceddf66" +checksum = "1f4ced2a7a8c14899d58eec402d946f69d5ed26a3fc363a7e8b1e5cb88473a01" dependencies = [ "sentry-backtrace", "sentry-core", @@ -3526,9 +3745,9 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.29.2" +version 
= "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63708ec450b6bdcb657af760c447416d69c38ce421f34e5e2e9ce8118410bc7" +checksum = "360ee3270f7a4a1eee6c667f7d38360b995431598a73b740dfe420da548d9cc9" dependencies = [ "debugid", "getrandom", @@ -3543,35 +3762,44 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.152" +version = "1.0.160" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" +checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.152" +version = "1.0.160" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" +checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.14", ] [[package]] name = "serde_json" -version = "1.0.91" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" +checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744" dependencies = [ "itoa", "ryu", "serde", ] +[[package]] +name = "serde_spanned" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -3586,9 +3814,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.2.0" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30d904179146de381af4c93d3af6ca4984b3152db687dacb9c3c35e86f39809c" +checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0" dependencies = [ "base64 
0.13.1", "chrono", @@ -3602,14 +3830,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.2.0" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1966009f3c05f095697c537312f5415d1e3ed31ce0a56942bac4c771c5c335e" +checksum = "859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c" dependencies = [ "darling", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -3651,9 +3879,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "signal-hook" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" +checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" dependencies = [ "libc", "signal-hook-registry", @@ -3672,9 +3900,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" dependencies = [ "libc", ] @@ -3699,9 +3927,9 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" dependencies = [ "autocfg", ] @@ -3714,14 +3942,24 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" 
+checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" dependencies = [ "libc", "winapi", ] +[[package]] +name = "socket2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + [[package]] name = "spin" version = "0.5.2" @@ -3730,9 +3968,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "spin" -version = "0.9.4" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" dependencies = [ "lock_api", ] @@ -3756,7 +3994,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.1.4", + "clap 4.2.1", "const_format", "futures", "futures-core", @@ -3770,8 +4008,8 @@ dependencies = [ "prost", "tokio", "tokio-stream", - "tonic", - "tonic-build", + "tonic 0.9.1", + "tonic-build 0.9.1", "tracing", "utils", "workspace_hack", @@ -3809,7 +4047,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 1.0.109", ] [[package]] @@ -3826,9 +4064,20 @@ checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" [[package]] name = "syn" -version = "1.0.107" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf316d5356ed6847742d036f8a39c3b8435cac10bd528a4bd461928a6ab34d5" dependencies = [ "proc-macro2", "quote", @@ -3849,7 +4098,7 @@ 
checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "unicode-xid", ] @@ -3866,24 +4115,24 @@ dependencies = [ [[package]] name = "task-local-extensions" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4" +checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8" dependencies = [ - "tokio", + "pin-utils", ] [[package]] name = "tempfile" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", - "rustix", - "windows-sys 0.42.0", + "redox_syscall 0.3.5", + "rustix 0.37.11", + "windows-sys 0.45.0", ] [[package]] @@ -3923,7 +4172,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" dependencies = [ "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -3934,38 +4183,39 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ 
"proc-macro2", "quote", - "syn", + "syn 2.0.14", ] [[package]] name = "thread_local" -version = "1.1.4" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ + "cfg-if", "once_cell", ] [[package]] name = "time" -version = "0.3.17" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" +checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" dependencies = [ "itoa", "serde", @@ -3981,9 +4231,9 @@ checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" [[package]] name = "time-macros" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" +checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" dependencies = [ "time-core", ] @@ -4009,9 +4259,9 @@ dependencies = [ [[package]] name = "tinyvec_macros" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tls-listener" @@ -4024,26 +4274,25 @@ dependencies = [ "pin-project-lite", "thiserror", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] name = "tokio" -version = "1.25.0" +version = "1.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e00990ebabbe4c14c08aca901caed183ecd5c09562a12c824bb53d3c3fd3af" +checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001" dependencies = [ "autocfg", "bytes", "libc", - "memchr", "mio", 
"num_cpus", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.4.9", "tokio-macros", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -4058,13 +4307,13 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "1.8.2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.14", ] [[package]] @@ -4085,7 +4334,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "socket2", + "socket2 0.4.9", "tokio", "tokio-util", ] @@ -4098,10 +4347,10 @@ checksum = "606f2b73660439474394432239c82249c0d45eb5f23d91f401be1e33590444a7" dependencies = [ "futures", "ring", - "rustls", + "rustls 0.20.8", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] @@ -4110,16 +4359,26 @@ version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" dependencies = [ - "rustls", + "rustls 0.20.8", "tokio", "webpki", ] [[package]] -name = "tokio-stream" -version = "0.1.11" +name = "tokio-rustls" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" +checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" +dependencies = [ + "rustls 0.21.0", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" dependencies = [ "futures-core", "pin-project-lite", @@ -4134,7 +4393,7 @@ dependencies = [ "filetime", "futures-core", "libc", - "redox_syscall", + "redox_syscall 0.2.16", 
"tokio", "tokio-stream", "xattr", @@ -4154,9 +4413,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.4" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740" +checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" dependencies = [ "bytes", "futures-core", @@ -4168,33 +4427,36 @@ dependencies = [ [[package]] name = "toml" -version = "0.5.11" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" dependencies = [ "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", ] [[package]] name = "toml_datetime" -version = "0.5.1" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4553f467ac8e3d374bc9a177a26801e5d0f9b211aa1673fb137a403afd1c9cf5" +checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.17.1" +version = "0.19.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34cc558345efd7e88b9eda9626df2138b80bb46a7606f695e751c892bc7dac6" +checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13" dependencies = [ "indexmap", - "itertools", - "nom8", "serde", + "serde_spanned", "toml_datetime", + "winnow", ] [[package]] @@ -4219,10 +4481,7 @@ dependencies = [ "pin-project", "prost", "prost-derive", - "rustls-native-certs", - "rustls-pemfile", "tokio", - "tokio-rustls", "tokio-stream", "tokio-util", "tower", @@ -4232,17 +4491,62 @@ dependencies = [ "tracing-futures", ] +[[package]] +name = "tonic" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64 0.21.0", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "rustls-native-certs", + "rustls-pemfile", + "tokio", + "tokio-rustls 0.24.0", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic-build" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" dependencies = [ - "prettyplease", + "prettyplease 0.1.25", "proc-macro2", "prost-build", "quote", - "syn", + "syn 1.0.109", +] + +[[package]] +name = "tonic-build" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7" +dependencies = [ + "prettyplease 0.1.25", + "proc-macro2", + "prost-build", + "quote", + "syn 1.0.109", ] [[package]] @@ -4265,25 +4569,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "tower-http" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858" -dependencies = [ - "bitflags", - "bytes", - "futures-core", - "futures-util", - "http", - "http-body", - "http-range-header", - "pin-project-lite", - "tower", - "tower-layer", - "tower-service", -] - [[package]] name = "tower-layer" version = "0.3.2" @@ -4301,7 +4586,7 @@ name = "trace" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.1", "pageserver_api", "utils", "workspace_hack", @@ -4328,7 +4613,7 @@ checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -4474,15 +4759,15 @@ dependencies = [ 
[[package]] name = "unicode-bidi" -version = "0.3.10" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.6" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" [[package]] name = "unicode-normalization" @@ -4520,10 +4805,10 @@ dependencies = [ "base64 0.13.1", "log", "once_cell", - "rustls", + "rustls 0.20.8", "url", "webpki", - "webpki-roots", + "webpki-roots 0.22.6", ] [[package]] @@ -4550,6 +4835,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + [[package]] name = "utils" version = "0.1.0" @@ -4593,9 +4884,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" +checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb" dependencies = [ "getrandom", "serde", @@ -4613,12 +4904,18 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + 
[[package]] name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.1", "env_logger", "log", "once_cell", @@ -4630,12 +4927,11 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" dependencies = [ "same-file", - "winapi", "winapi-util", ] @@ -4676,7 +4972,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "wasm-bindgen-shared", ] @@ -4710,7 +5006,7 @@ checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4750,6 +5046,15 @@ dependencies = [ "webpki", ] +[[package]] +name = "webpki-roots" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa54963694b65584e170cf5dc46aeb4dcaa5584e652ff5f3952e56d66aff0125" +dependencies = [ + "rustls-webpki", +] + [[package]] name = "which" version = "4.4.0" @@ -4792,19 +5097,28 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.0", +] + [[package]] name = "windows-sys" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - 
"windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", ] [[package]] @@ -4813,65 +5127,140 @@ version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows-targets", + "windows-targets 0.42.2", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.0", ] [[package]] name = "windows-targets" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", ] [[package]] name = 
"windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "winnow" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28" +dependencies = [ + "memchr", +] [[package]] name = "winreg" @@ -4890,7 +5279,8 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.1", + "clap_builder", 
"crossbeam-utils", "digest", "either", @@ -4902,7 +5292,6 @@ dependencies = [ "futures-sink", "futures-util", "hashbrown 0.12.3", - "indexmap", "itertools", "libc", "log", @@ -4917,16 +5306,18 @@ dependencies = [ "regex-syntax", "reqwest", "ring", - "rustls", + "rustls 0.20.8", "scopeguard", "serde", "serde_json", - "socket2", - "syn", + "socket2 0.4.9", + "syn 1.0.109", + "syn 2.0.14", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", "tokio-util", - "tonic", + "toml_datetime", + "toml_edit", "tower", "tracing", "tracing-core", @@ -4936,12 +5327,11 @@ dependencies = [ [[package]] name = "x509-parser" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" +checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634" dependencies = [ "asn1-rs", - "base64 0.13.1", "data-encoding", "der-parser", "lazy_static", @@ -4969,15 +5359,15 @@ checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" [[package]] name = "yasna" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed2e7a52e3744ab4d0c05c20aa065258e84c49fd4226f5191b2ed29712710b4" +checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd" dependencies = [ "time", ] [[package]] name = "zeroize" -version = "1.5.7" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f" +checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" diff --git a/Cargo.toml b/Cargo.toml index 679605dc1d..0b545e6190 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ atty = "0.2.14" aws-config = { version = "0.51.0", default-features = false, features=["rustls"] } aws-sdk-s3 = "0.21.0" aws-smithy-http = "0.51.0" -aws-types = "0.51.0" +aws-types = "0.55" base64 = 
"0.13.0" bincode = "1.3" -bindgen = "0.61" +bindgen = "0.65" bstr = "1.0" byteorder = "1.4" bytes = "1.0" @@ -50,7 +50,7 @@ git-version = "0.3" hashbrown = "0.13" hashlink = "0.8.1" hex = "0.4" -hex-literal = "0.3" +hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" humantime = "2.1" @@ -80,18 +80,18 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls" reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } reqwest-middleware = "0.2.0" routerify = "3" -rpds = "0.12.0" +rpds = "0.13" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" scopeguard = "1.1" -sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "2.0" sha2 = "0.10.2" signal-hook = "0.3" -socket2 = "0.4.4" +socket2 = "0.5" strum = "0.24" strum_macros = "0.24" svg_fmt = "0.4.1" @@ -106,17 +106,17 @@ tokio-postgres-rustls = "0.9.0" tokio-rustls = "0.23" tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["io"] } -toml = "0.5" -toml_edit = { version = "0.17", features = ["easy"] } -tonic = {version = "0.8", features = ["tls", "tls-roots"]} +toml = "0.7" +toml_edit = "0.19" +tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2" uuid = { version = "1.2", features = ["v4", "serde"] } walkdir = "2.3.2" -webpki-roots = "0.22.5" -x509-parser = "0.14" +webpki-roots = "0.23" +x509-parser = "0.15" ## TODO replace this with tracing env_logger = "0.10" @@ -154,9 +154,9 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.4" rcgen = "0.10" -rstest = "0.16" +rstest = "0.17" tempfile = "3.4" 
-tonic-build = "0.8" +tonic-build = "0.9" # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index f26aa2fbc5..3f290821c2 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -4,13 +4,12 @@ version = "0.1.0" edition = "2021" license = "Apache-2.0" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] -anyhow = "1.0.68" -chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } -rand = "0.8.3" -serde = "1.0.152" -serde_with = "2.1.0" -utils = { version = "0.1.0", path = "../utils" } -workspace_hack = { version = "0.1.0", path = "../../workspace_hack" } +anyhow.workspace = true +chrono.workspace = true +rand.workspace = true +serde.workspace = true +serde_with.workspace = true +utils.workspace = true + +workspace_hack.workspace = true diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 66221af522..f7e39751ef 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use std::process::Command; use anyhow::{anyhow, Context}; -use bindgen::callbacks::ParseCallbacks; +use bindgen::callbacks::{DeriveInfo, ParseCallbacks}; #[derive(Debug)] struct PostgresFfiCallbacks; @@ -20,7 +20,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { // Add any custom #[derive] attributes to the data structures that bindgen // creates. - fn add_derives(&self, name: &str) -> Vec { + fn add_derives(&self, derive_info: &DeriveInfo) -> Vec { // This is the list of data structures that we want to serialize/deserialize. 
let serde_list = [ "XLogRecord", @@ -31,7 +31,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { "ControlFileData", ]; - if serde_list.contains(&name) { + if serde_list.contains(&derive_info.name) { vec![ "Default".into(), // Default allows us to easily fill the padding fields with 0. "Serialize".into(), diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs index eb52409c44..048e99d841 100644 --- a/libs/remote_storage/tests/pagination_tests.rs +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -204,12 +204,7 @@ async fn upload_s3_data( let data = format!("remote blob data {i}").into_bytes(); let data_len = data.len(); task_client - .upload( - Box::new(std::io::Cursor::new(data)), - data_len, - &blob_path, - None, - ) + .upload(std::io::Cursor::new(data), data_len, &blob_path, None) .await?; Ok::<_, anyhow::Error>((blob_prefix, blob_path)) diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 8c3d3f9063..b285c9b5b0 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -14,4 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true -workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +workspace_hack.workspace = true diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 391bc52a80..dc6326e73e 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -33,7 +33,7 @@ serde_with.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true -uuid = { version = "1.2", features = ["v4", "serde"] } +uuid.workspace = true metrics.workspace = true workspace_hack.workspace = true diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 19f0f22815..0c87e208c8 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -725,8 +725,9 @@ impl PageServerConf { 
"disk_usage_based_eviction" => { tracing::info!("disk_usage_based_eviction: {:#?}", &item); builder.disk_usage_based_eviction( - toml_edit::de::from_item(item.clone()) - .context("parse disk_usage_based_eviction")?) + deserialize_from_item_string("disk_usage_based_eviction", item) + .context("parse disk_usage_based_eviction")? + ) }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), @@ -827,14 +828,14 @@ impl PageServerConf { if let Some(eviction_policy) = item.get("eviction_policy") { t_conf.eviction_policy = Some( - toml_edit::de::from_item(eviction_policy.clone()) + deserialize_from_item_string("eviction_policy", eviction_policy) .context("parse eviction_policy")?, ); } if let Some(item) = item.get("min_resident_size_override") { t_conf.min_resident_size_override = Some( - toml_edit::de::from_item(item.clone()) + deserialize_from_item_string("min_resident_size_override", item) .context("parse min_resident_size_override")?, ); } @@ -938,6 +939,19 @@ where }) } +fn deserialize_from_item_string(name: &str, item: &Item) -> anyhow::Result +where + T: serde::de::DeserializeOwned, +{ + // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way + let item_string = item.to_string(); + let deserializer = item_string + .trim() + .parse::() + .with_context(|| format!("parsing item for node {name} as ValueDeserializer"))?; + T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) +} + /// Configurable semaphore permits setting. 
/// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c0e4a2a9cf..bd38a7a2f3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -65,7 +65,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream { // We were requested to shut down. - let msg = format!("pageserver is shutting down"); + let msg = "pageserver is shutting down".to_string(); let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)); Err(QueryError::Other(anyhow::anyhow!(msg))) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 03a4ff8c8e..67bc1b36b0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1876,7 +1876,7 @@ impl Tenant { .to_string(); // Convert the config to a toml file. - conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + conf_content += &toml_edit::ser::to_string(&tenant_conf)?; let mut target_config_file = VirtualFile::open_with_options( target_config_path, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index cdabb23a7b..9b719db180 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -275,9 +275,9 @@ mod tests { ..TenantConfOpt::default() }; - let toml_form = toml_edit::easy::to_string(&small_conf).unwrap(); + let toml_form = toml_edit::ser::to_string(&small_conf).unwrap(); assert_eq!(toml_form, "gc_horizon = 42\n"); - assert_eq!(small_conf, toml_edit::easy::from_str(&toml_form).unwrap()); + assert_eq!(small_conf, toml_edit::de::from_str(&toml_form).unwrap()); let json_form = serde_json::to_string(&small_conf).unwrap(); assert_eq!(json_form, "{\"gc_horizon\":42}"); diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index ce9f4d9bf8..699121ccd9 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs 
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -74,7 +74,7 @@ pub(super) async fn upload_timeline_layer<'a>( })?; storage - .upload(Box::new(source_file), fs_size, &storage_path, None) + .upload(source_file, fs_size, &storage_path, None) .await .with_context(|| { format!( diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index d7ace28426..de7b634ba0 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -23,7 +23,6 @@ use std::convert::Infallible; use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; -use std::task::Poll; use std::time::Duration; use tokio::sync::broadcast; use tokio::sync::broadcast::error::RecvError; @@ -374,7 +373,7 @@ impl BrokerService for Broker { Ok(info) => yield info, Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += skipped_msg; - if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { + if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); missed_msgs = 0; diff --git a/trace/Cargo.toml b/trace/Cargo.toml index 6ced992d4c..d6eed3f49c 100644 --- a/trace/Cargo.toml +++ b/trace/Cargo.toml @@ -4,8 +4,6 @@ version = "0.1.0" edition.workspace = true license.workspace = true -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] clap.workspace = true anyhow.workspace = true diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f885f4a94d..f735ffed4c 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -18,6 +18,7 @@ byteorder = { version = "1" } bytes = { version = "1", features = ["serde"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } clap = { version = "4", features = ["derive", "string"] } 
+clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } digest = { version = "0.10", features = ["mac", "std"] } either = { version = "1" } @@ -29,7 +30,6 @@ futures-executor = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } hashbrown = { version = "0.12", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -52,7 +52,8 @@ socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] } tokio-rustls = { version = "0.23" } tokio-util = { version = "0.7", features = ["codec", "io"] } -tonic = { version = "0.8", features = ["tls-roots"] } +toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } +toml_edit = { version = "0.19", features = ["serde"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } @@ -64,7 +65,6 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } either = { version = "1" } hashbrown = { version = "0.12", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -74,6 +74,7 @@ prost = { version = "0.11" } regex = { version = "1" } regex-syntax = { version = "0.6" } serde = { version = "1", features = 
["alloc", "derive"] } -syn = { version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] } ### END HAKARI SECTION From 8d295780cb833848f1d4b97bee166cbf80b7d9bd Mon Sep 17 00:00:00 2001 From: Sam Gaw Date: Mon, 10 Apr 2023 17:07:43 +0100 Subject: [PATCH 267/426] Add support for ip4r extension --- Dockerfile.compute-node | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 3473487444..7c64951fa5 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -256,6 +256,21 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control +######################################################################################### +# +# Layer "ip4r-pg-build" +# compile ip4r extension +# +######################################################################################### +FROM build-deps AS ip4r-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \ + mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control + ######################################################################################### # # Layer "prefix-pg-build" @@ -423,6 +438,7 @@ COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ From 218062cebade6dbb68c44a06e022769bf301289c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 7 Apr 2023 12:04:06 +0100 Subject: [PATCH 268/426] GitHub Workflows: use ref_name instead of ref --- .github/workflows/benchmarking.yml | 12 ++++++------ .github/workflows/build_and_test.yml | 4 ++-- .github/workflows/neon_extra_builds.yml | 2 +- .github/workflows/pg_clients.yml | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 028fe8d8ad..4f3ff15364 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -30,7 +30,7 @@ defaults: concurrency: # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true jobs: @@ -42,7 +42,7 @@ jobs: DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: "neon-staging" runs-on: [ self-hosted, us-east-2, x64 ] @@ -174,7 +174,7 @@ jobs: DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] @@ -317,7 +317,7 @@ jobs: DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] @@ -413,7 +413,7 @@ jobs: DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] @@ -503,7 +503,7 @@ jobs: DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report 
|| ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} runs-on: [ self-hosted, us-east-2, x64 ] diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c096aef4a9..691320324e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -13,7 +13,7 @@ defaults: concurrency: # Allow only one workflow per any non-`main` branch. - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true env: @@ -368,7 +368,7 @@ jobs: build_type: ${{ matrix.build_type }} test_selection: performance run_in_parallel: false - save_perf_report: ${{ github.ref == 'refs/heads/main' }} + save_perf_report: ${{ github.ref_name == 'main' }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index ef4c293e31..1196881541 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -12,7 +12,7 @@ defaults: concurrency: # Allow only one workflow per any non-`main` branch. - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true env: diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 9f57519589..224b7b4a6d 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -14,7 +14,7 @@ on: concurrency: # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} cancel-in-progress: true jobs: From c94b8998bedb61a7fda5c910412f067afb0d4e57 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 7 Apr 2023 12:23:08 +0100 Subject: [PATCH 269/426] GitHub Workflows: print error messages to stderr --- .github/actions/allure-report/action.yml | 4 ++-- .github/actions/download/action.yml | 2 +- .github/actions/neon-branch-create/action.yml | 4 ++-- .github/actions/neon-branch-delete/action.yml | 2 +- .github/actions/upload/action.yml | 6 +++--- .github/workflows/benchmarking.yml | 8 ++++---- .github/workflows/build_and_test.yml | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index e35cbb20fd..9a1037064a 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -45,12 +45,12 @@ runs: shell: bash -euxo pipefail {0} run: | if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then - echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only" + echo >&2 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only" exit 1 fi if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then - echo 2>&1 "inputs.test_selection must be set for 'store' action" + echo >&2 "inputs.test_selection must be set for 'store' action" exit 2 fi diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index eb34d4206a..d3f9bc0414 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -37,7 +37,7 @@ runs: echo 'SKIPPED=true' >> $GITHUB_OUTPUT exit 0 else - echo 2>&1 "Neither 
s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" + echo >&2 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi fi diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index 7ee43a3587..f1eea34ab9 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -58,7 +58,7 @@ runs: done if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then - echo 2>&1 "Failed to create branch after 10 attempts, the latest response was: ${branch}" + echo >&2 "Failed to create branch after 10 attempts, the latest response was: ${branch}" exit 1 fi @@ -122,7 +122,7 @@ runs: done if [ -z "${password}" ] || [ "${password}" == "null" ]; then - echo 2>&1 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}" + echo >&2 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}" exit 1 fi diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index 5689093e2e..f8cd351dd9 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -48,7 +48,7 @@ runs: done if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then - echo 2>&1 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}" + echo >&2 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}" exit 1 fi env: diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 291a2cf3b0..63973dfbe7 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -23,7 +23,7 @@ runs: mkdir -p $(dirname $ARCHIVE) if [ -f ${ARCHIVE} ]; then - echo 2>&1 "File ${ARCHIVE} already exist. Something went wrong before" + echo >&2 "File ${ARCHIVE} already exist. 
Something went wrong before" exit 1 fi @@ -33,10 +33,10 @@ runs: elif [ -f ${SOURCE} ]; then time tar -cf ${ARCHIVE} --zstd ${SOURCE} elif ! ls ${SOURCE} > /dev/null 2>&1; then - echo 2>&1 "${SOURCE} does not exist" + echo >&2 "${SOURCE} does not exist" exit 2 else - echo 2>&1 "${SOURCE} is neither a directory nor a file, do not know how to handle it" + echo >&2 "${SOURCE} is neither a directory nor a file, do not know how to handle it" exit 3 fi diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 4f3ff15364..8471d802bb 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -226,7 +226,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -356,7 +356,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -452,7 +452,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -542,7 +542,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" + echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 691320324e..3212b76731 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1007,7 +1007,7 @@ jobs: S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then - echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist" + echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi From 13e53e5dc8012bf5c2f84d9b7737f1722c5e8f5e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 12 Apr 2023 13:26:03 +0100 Subject: [PATCH 270/426] GitHub Workflows: use '!cancelled' instead of 'success or failure' --- .github/actions/run-python-test-set/action.yml | 2 +- .github/workflows/benchmarking.yml | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 11f5c78f19..115f555913 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -202,7 +202,7 @@ runs: prefix: latest - name: Create Allure report - if: success() || failure() + if: ${{ !cancelled() }} uses: ./.github/actions/allure-report with: action: store diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 8471d802bb..a5a27e59a8 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -92,7 +92,7 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report - if: success() || failure() + if: ${{ !cancelled() }} uses: ./.github/actions/allure-report with: action: generate @@ -282,7 +282,7 @@ 
jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report - if: success() || failure() + if: ${{ !cancelled() }} uses: ./.github/actions/allure-report with: action: generate @@ -305,7 +305,7 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: success() || failure() + if: ${{ !cancelled() }} needs: [ generate-matrices, pgbench-compare ] strategy: @@ -379,7 +379,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report - if: success() || failure() + if: ${{ !cancelled() }} uses: ./.github/actions/allure-report with: action: generate @@ -401,7 +401,7 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: success() || failure() + if: ${{ !cancelled() }} needs: [ generate-matrices, clickbench-compare ] strategy: @@ -475,7 +475,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report - if: success() || failure() + if: ${{ !cancelled() }} uses: ./.github/actions/allure-report with: action: generate @@ -491,7 +491,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: success() || failure() + if: ${{ !cancelled() }} needs: [ generate-matrices, tpch-compare ] strategy: @@ -565,7 +565,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report - if: success() || failure() + if: ${{ !cancelled() }} uses: ./.github/actions/allure-report with: action: generate From f7995b3c7054cdbd32ced709ab1f8bbf4a20fce7 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 12 Apr 2023 17:51:59 +0300 Subject: [PATCH 271/426] Revert "Update most of the dependencies to their latest versions (#3991)" (#4013) This reverts commit a64044a7a9a9f2b32a73b97da1fd230f9b510064. 
See https://neondb.slack.com/archives/C03H1K0PGKH/p1681306682795559 --- .config/hakari.toml | 2 +- Cargo.lock | 1402 ++++++----------- Cargo.toml | 26 +- libs/consumption_metrics/Cargo.toml | 17 +- libs/postgres_ffi/build.rs | 6 +- libs/remote_storage/tests/pagination_tests.rs | 7 +- libs/tracing-utils/Cargo.toml | 3 +- libs/utils/Cargo.toml | 2 +- pageserver/src/config.rs | 22 +- pageserver/src/page_service.rs | 2 +- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/config.rs | 4 +- .../tenant/remote_timeline_client/upload.rs | 2 +- storage_broker/src/bin/storage_broker.rs | 3 +- trace/Cargo.toml | 2 + workspace_hack/Cargo.toml | 9 +- 16 files changed, 557 insertions(+), 954 deletions(-) diff --git a/.config/hakari.toml b/.config/hakari.toml index 15b939e86f..12d2d1bf9c 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -4,7 +4,7 @@ hakari-package = "workspace_hack" # Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above. -dep-format-version = "4" +dep-format-version = "3" # Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended. # Hakari works much better with the new feature resolver. 
diff --git a/Cargo.lock b/Cargo.lock index 8dde4ebb57..668487a9bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -63,69 +63,29 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" -[[package]] -name = "anstream" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "342258dd14006105c2b75ab1bd7543a03bdf0cfc94383303ac212a04939dff6f" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-wincon", - "concolor-override", - "concolor-query", - "is-terminal", - "utf8parse", -] - -[[package]] -name = "anstyle" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23ea9e81bd02e310c216d080f6223c179012256e5151c41db88d12c88a1684d2" - -[[package]] -name = "anstyle-parse" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7d1bb534e9efed14f3e5f44e7dd1a4f709384023a4165199a4241e18dff0116" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-wincon" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3127af6145b149f3287bb9a0d10ad9c5692dba8c53ad48285e5bec4063834fa" -dependencies = [ - "anstyle", - "windows-sys 0.45.0", -] - [[package]] name = "anyhow" -version = "1.0.70" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" +checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61" dependencies = [ "backtrace", ] [[package]] name = "archery" -version = "0.5.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6cd774058b1b415c4855d8b86436c04bf050c003156fe24bc326fb3fe75c343" +checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" dependencies = [ "static_assertions", ] [[package]] 
name = "asn1-rs" -version = "0.5.2" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0" +checksum = "cf6690c370453db30743b373a60ba498fc0d6d83b11f4abfd87a84a075db5dd4" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -145,7 +105,7 @@ checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", "synstructure", ] @@ -157,47 +117,46 @@ checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] name = "async-stream" -version = "0.3.5" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e" dependencies = [ "async-stream-impl", "futures-core", - "pin-project-lite", ] [[package]] name = "async-stream-impl" -version = "0.3.5" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27" dependencies = [ "proc-macro2", "quote", - "syn 2.0.14", + "syn", ] [[package]] name = "async-trait" -version = "0.1.68" +version = "0.1.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" +checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.14", + "syn", ] [[package]] name = "atomic-polyfill" -version = "1.0.2" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" +checksum = "d299f547288d6db8d5c3a2916f7b2f66134b15b8c1ac1c4357dd3b8752af7bb2" dependencies = [ "critical-section", ] @@ -228,13 +187,13 @@ dependencies = [ "aws-http", "aws-sdk-sso", "aws-sdk-sts", - "aws-smithy-async 0.51.0", - "aws-smithy-client 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", + "aws-smithy-async", + "aws-smithy-client", + "aws-smithy-http", + "aws-smithy-http-tower", "aws-smithy-json", - "aws-smithy-types 0.51.0", - "aws-types 0.51.0", + "aws-smithy-types", + "aws-types", "bytes", "hex", "http", @@ -247,29 +206,15 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-credential-types" -version = "0.55.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77e37e62f59cf3284067337da7467d842df8cfe3f5e5c06487ac7521819cf16d" -dependencies = [ - "aws-smithy-async 0.55.1", - "aws-smithy-types 0.55.1", - "fastrand", - "tokio", - "tracing", - "zeroize", -] - [[package]] name = "aws-endpoint" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ca8f374874f6459aaa88dc861d7f5d834ca1ff97668eae190e97266b5f6c3fb" dependencies = [ - "aws-smithy-http 0.51.0", - "aws-smithy-types 0.51.0", - "aws-types 0.51.0", + "aws-smithy-http", + "aws-smithy-types", + "aws-types", "http", "regex", "tracing", @@ -281,9 +226,9 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78d41e19e779b73463f5f0c21b3aacc995f4ba783ab13a7ae9f5dfb159a551b4" dependencies = [ - "aws-smithy-http 0.51.0", - "aws-smithy-types 0.51.0", - "aws-types 0.51.0", + "aws-smithy-http", + "aws-smithy-types", + "aws-types", "bytes", "http", "http-body", @@ -303,15 +248,15 @@ dependencies = [ "aws-http", "aws-sig-auth", "aws-sigv4", - "aws-smithy-async 0.51.0", + "aws-smithy-async", "aws-smithy-checksums", - "aws-smithy-client 0.51.0", + "aws-smithy-client", "aws-smithy-eventstream", - 
"aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", - "aws-smithy-types 0.51.0", + "aws-smithy-http", + "aws-smithy-http-tower", + "aws-smithy-types", "aws-smithy-xml", - "aws-types 0.51.0", + "aws-types", "bytes", "bytes-utils", "http", @@ -330,13 +275,13 @@ dependencies = [ "aws-endpoint", "aws-http", "aws-sig-auth", - "aws-smithy-async 0.51.0", - "aws-smithy-client 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", + "aws-smithy-async", + "aws-smithy-client", + "aws-smithy-http", + "aws-smithy-http-tower", "aws-smithy-json", - "aws-smithy-types 0.51.0", - "aws-types 0.51.0", + "aws-smithy-types", + "aws-types", "bytes", "http", "tokio-stream", @@ -352,14 +297,14 @@ dependencies = [ "aws-endpoint", "aws-http", "aws-sig-auth", - "aws-smithy-async 0.51.0", - "aws-smithy-client 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", + "aws-smithy-async", + "aws-smithy-client", + "aws-smithy-http", + "aws-smithy-http-tower", "aws-smithy-query", - "aws-smithy-types 0.51.0", + "aws-smithy-types", "aws-smithy-xml", - "aws-types 0.51.0", + "aws-types", "bytes", "http", "tower", @@ -373,20 +318,20 @@ checksum = "12cbe7b2be9e185c1fbce27fc9c41c66b195b32d89aa099f98768d9544221308" dependencies = [ "aws-sigv4", "aws-smithy-eventstream", - "aws-smithy-http 0.51.0", - "aws-types 0.51.0", + "aws-smithy-http", + "aws-types", "http", "tracing", ] [[package]] name = "aws-sigv4" -version = "0.51.1" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c0b2658d2cb66dbf02f0e8dee80810ef1e0ca3530ede463e0ef994c301087d1" +checksum = "03ff4cff8c4a101962d593ba94e72cd83891aecd423f0c6e3146bff6fb92c9e3" dependencies = [ "aws-smithy-eventstream", - "aws-smithy-http 0.51.0", + "aws-smithy-http", "bytes", "form_urlencoded", "hex", @@ -411,26 +356,14 @@ dependencies = [ "tokio-stream", ] -[[package]] -name = "aws-smithy-async" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "88573bcfbe1dcfd54d4912846df028b42d6255cbf9ce07be216b1bbfd11fc4b9" -dependencies = [ - "futures-util", - "pin-project-lite", - "tokio", - "tokio-stream", -] - [[package]] name = "aws-smithy-checksums" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc227e36e346f45298288359f37123e1a92628d1cec6b11b5eb335553278bd9e" dependencies = [ - "aws-smithy-http 0.51.0", - "aws-smithy-types 0.51.0", + "aws-smithy-http", + "aws-smithy-types", "bytes", "crc32c", "crc32fast", @@ -450,10 +383,10 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff28d553714f8f54cd921227934fc13a536a1c03f106e56b362fd57e16d450ad" dependencies = [ - "aws-smithy-async 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-http-tower 0.51.0", - "aws-smithy-types 0.51.0", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-tower", + "aws-smithy-types", "bytes", "fastrand", "http", @@ -467,33 +400,13 @@ dependencies = [ "tracing", ] -[[package]] -name = "aws-smithy-client" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2f52352bae50d3337d5d6151b695d31a8c10ebea113eca5bead531f8301b067" -dependencies = [ - "aws-smithy-async 0.55.1", - "aws-smithy-http 0.55.1", - "aws-smithy-http-tower 0.55.1", - "aws-smithy-types 0.55.1", - "bytes", - "fastrand", - "http", - "http-body", - "pin-project-lite", - "tokio", - "tower", - "tracing", -] - [[package]] name = "aws-smithy-eventstream" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7ea0df7161ce65b5c8ca6eb709a1a907376fa18226976e41c748ce02ccccf24" dependencies = [ - "aws-smithy-types 0.51.0", + "aws-smithy-types", "bytes", "crc32fast", ] @@ -505,7 +418,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf58ed4fefa61dbf038e5421a521cbc2c448ef69deff0ab1d915d8a10eda5664" dependencies = [ "aws-smithy-eventstream", - "aws-smithy-types 
0.51.0", + "aws-smithy-types", "bytes", "bytes-utils", "futures-core", @@ -521,49 +434,13 @@ dependencies = [ "tracing", ] -[[package]] -name = "aws-smithy-http" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03bcc02d7ed9649d855c8ce4a735e9848d7b8f7568aad0504c158e3baa955df8" -dependencies = [ - "aws-smithy-types 0.55.1", - "bytes", - "bytes-utils", - "futures-core", - "http", - "http-body", - "hyper", - "once_cell", - "percent-encoding", - "pin-project-lite", - "pin-utils", - "tracing", -] - [[package]] name = "aws-smithy-http-tower" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20c96d7bd35e7cf96aca1134b2f81b1b59ffe493f7c6539c051791cbbf7a42d3" dependencies = [ - "aws-smithy-http 0.51.0", - "bytes", - "http", - "http-body", - "pin-project-lite", - "tower", - "tracing", -] - -[[package]] -name = "aws-smithy-http-tower" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da88b3a860f65505996c29192d800f1aeb9480440f56d63aad33a3c12045017a" -dependencies = [ - "aws-smithy-http 0.55.1", - "aws-smithy-types 0.55.1", + "aws-smithy-http", "bytes", "http", "http-body", @@ -578,7 +455,7 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8324ba98c8a94187723cc16c37aefa09504646ee65c3d2c3af495bab5ea701b" dependencies = [ - "aws-smithy-types 0.51.0", + "aws-smithy-types", ] [[package]] @@ -587,7 +464,7 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83834ed2ff69ea6f6657baf205267dc2c0abe940703503a3e5d60ce23be3d306" dependencies = [ - "aws-smithy-types 0.51.0", + "aws-smithy-types", "urlencoding", ] @@ -603,19 +480,6 @@ dependencies = [ "time", ] -[[package]] -name = "aws-smithy-types" -version = "0.55.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cd0afc731fd1417d791f9145a1e0c30e23ae0beaab9b4814017708ead2fc20f1" -dependencies = [ - "base64-simd", - "itoa", - "num-integer", - "ryu", - "time", -] - [[package]] name = "aws-smithy-xml" version = "0.51.0" @@ -631,37 +495,21 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05701d32da168b44f7ee63147781aed8723e792cc131cb9b18363b5393f17f70" dependencies = [ - "aws-smithy-async 0.51.0", - "aws-smithy-client 0.51.0", - "aws-smithy-http 0.51.0", - "aws-smithy-types 0.51.0", + "aws-smithy-async", + "aws-smithy-client", + "aws-smithy-http", + "aws-smithy-types", "http", "rustc_version", "tracing", "zeroize", ] -[[package]] -name = "aws-types" -version = "0.55.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fb02591b5075d318e0083dcb76df0e151db4ce48f987ecd00e5b53c7a6ba59" -dependencies = [ - "aws-credential-types", - "aws-smithy-async 0.55.1", - "aws-smithy-client 0.55.1", - "aws-smithy-http 0.55.1", - "aws-smithy-types 0.55.1", - "http", - "rustc_version", - "tracing", -] - [[package]] name = "axum" -version = "0.6.15" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62" +checksum = "e5694b64066a2459918d8074c2ce0d5a88f409431994c2356617c8ae0c4721fc" dependencies = [ "async-trait", "axum-core", @@ -681,15 +529,16 @@ dependencies = [ "serde", "sync_wrapper", "tower", + "tower-http", "tower-layer", "tower-service", ] [[package]] name = "axum-core" -version = "0.3.4" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" +checksum = "1cae3e661676ffbacb30f1a824089a8c9150e71017f7e1e38f2aa32009188d34" dependencies = [ "async-trait", "bytes", @@ -735,16 +584,6 @@ version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" -[[package]] -name = "base64-simd" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" -dependencies = [ - "outref", - "vsimd", -] - [[package]] name = "bincode" version = "1.3.3" @@ -756,9 +595,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.65.1" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a" dependencies = [ "bitflags", "cexpr", @@ -767,13 +606,12 @@ dependencies = [ "lazycell", "log", "peeking_take_while", - "prettyplease 0.2.4", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn 2.0.14", + "syn", "which", ] @@ -785,18 +623,18 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "block-buffer" -version = "0.10.4" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" dependencies = [ "generic-array", ] [[package]] name = "bstr" -version = "1.4.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" +checksum = "b7f0778972c64420fdedc63f09919c8a88bda7b25135357fd25a5d9f3257e832" dependencies = [ "memchr", "once_cell", @@ -864,9 +702,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.24" +version = "0.4.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" 
+checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" dependencies = [ "iana-time-zone", "num-integer", @@ -904,9 +742,9 @@ dependencies = [ [[package]] name = "clang-sys" -version = "1.6.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3" dependencies = [ "glob", "libc", @@ -927,38 +765,30 @@ dependencies = [ [[package]] name = "clap" -version = "4.2.1" +version = "4.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046ae530c528f252094e4a77886ee1374437744b2bff1497aa898bbddbbb29b3" +checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76" dependencies = [ - "clap_builder", - "clap_derive", - "once_cell", -] - -[[package]] -name = "clap_builder" -version = "4.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "223163f58c9a40c3b0a43e1c4b50a9ce09f007ea2cb1ec258a687945b4b7929f" -dependencies = [ - "anstream", - "anstyle", "bitflags", - "clap_lex 0.4.1", + "clap_derive", + "clap_lex 0.3.1", + "is-terminal", + "once_cell", "strsim", + "termcolor", ] [[package]] name = "clap_derive" -version = "4.2.0" +version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" +checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" dependencies = [ "heck", + "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.14", + "syn", ] [[package]] @@ -972,9 +802,12 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.4.1" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" +checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" 
+dependencies = [ + "os_str_bytes", +] [[package]] name = "close_fds" @@ -1026,7 +859,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.2.1", + "clap 4.1.4", "compute_api", "futures", "hyper", @@ -1050,21 +883,6 @@ dependencies = [ "workspace_hack", ] -[[package]] -name = "concolor-override" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a855d4a1978dc52fb0536a04d384c2c0c1aa273597f08b77c8c4d3b2eec6037f" - -[[package]] -name = "concolor-query" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d11d52c3d7ca2e6d0040212be9e4dbbcd78b6447f535b6b561f449427944cf" -dependencies = [ - "windows-sys 0.45.0", -] - [[package]] name = "const_format" version = "0.2.30" @@ -1103,7 +921,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.1", + "clap 4.1.4", "comfy-table", "git-version", "nix", @@ -1139,15 +957,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" [[package]] name = "cpufeatures" -version = "0.2.6" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" +checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" dependencies = [ "libc", ] @@ -1214,9 +1032,9 @@ checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" [[package]] name = "crossbeam-channel" -version = "0.5.8" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +checksum = 
"c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1224,9 +1042,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -1235,22 +1053,22 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.14" +version = "0.9.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" +checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.8.0", + "memoffset 0.7.1", "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" dependencies = [ "cfg-if", ] @@ -1292,9 +1110,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.94" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" +checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9" dependencies = [ "cc", "cxxbridge-flags", @@ -1304,9 +1122,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.94" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" +checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d" dependencies = [ 
"cc", "codespan-reporting", @@ -1314,31 +1132,31 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn 2.0.14", + "syn", ] [[package]] name = "cxxbridge-flags" -version = "1.0.94" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" +checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a" [[package]] name = "cxxbridge-macro" -version = "1.0.94" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" +checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.14", + "syn", ] [[package]] name = "darling" -version = "0.14.4" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" dependencies = [ "darling_core", "darling_macro", @@ -1346,27 +1164,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.4" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn 1.0.109", + "syn", ] [[package]] name = "darling_macro" -version = "0.14.4" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" dependencies = [ "darling_core", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -1400,9 +1218,9 @@ dependencies = [ 
[[package]] name = "der-parser" -version = "8.2.0" +version = "8.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e" +checksum = "42d4bc9b0db0a0df9ae64634ac5bdefb7afcb534e182275ca0beadbe486701c1" dependencies = [ "asn1-rs", "displaydoc", @@ -1431,7 +1249,7 @@ checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -1451,9 +1269,9 @@ dependencies = [ [[package]] name = "enum-map" -version = "2.5.0" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "988f0d17a0fa38291e5f41f71ea8d46a5d5497b9054d5a759fae2cbb819f2356" +checksum = "50c25992259941eb7e57b936157961b217a4fc8597829ddef0596d6c3cd86e1a" dependencies = [ "enum-map-derive", ] @@ -1466,7 +1284,7 @@ checksum = "2a4da76b3b6116d758c7ba93f7ec6a35d2e2cf24feda76c6e38a375f4d5c59f2" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -1487,7 +1305,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -1505,13 +1323,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.1" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" dependencies = [ "errno-dragonfly", "libc", - "windows-sys 0.48.0", + "winapi", ] [[package]] @@ -1543,23 +1361,23 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "1.9.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" 
dependencies = [ "instant", ] [[package]] name = "filetime" -version = "0.2.21" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" +checksum = "4e884668cd0c7480504233e951174ddc3b382f7c2666e3b7310b5c4e7b0c37f9" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.2.16", - "windows-sys 0.48.0", + "redox_syscall", + "windows-sys 0.42.0", ] [[package]] @@ -1604,9 +1422,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" dependencies = [ "futures-channel", "futures-core", @@ -1619,9 +1437,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" dependencies = [ "futures-core", "futures-sink", @@ -1629,15 +1447,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" dependencies = [ "futures-core", "futures-task", @@ -1646,32 +1464,32 @@ dependencies = [ [[package]] name = "futures-io" 
-version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" dependencies = [ "proc-macro2", "quote", - "syn 2.0.14", + "syn", ] [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" [[package]] name = "futures-timer" @@ -1681,9 +1499,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" dependencies = [ "futures-channel", "futures-core", @@ -1699,9 +1517,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.7" +version = "0.14.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = 
"bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" dependencies = [ "typenum", "version_check", @@ -1709,22 +1527,20 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.9" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" dependencies = [ "cfg-if", - "js-sys", "libc", "wasi", - "wasm-bindgen", ] [[package]] name = "gimli" -version = "0.27.2" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" +checksum = "221996f774192f0f718773def8201c4ae31f02616a54ccfc2d358bb0e5cefdec" [[package]] name = "git-version" @@ -1745,7 +1561,7 @@ dependencies = [ "proc-macro-hack", "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -1756,9 +1572,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.16" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5be7b54589b581f624f566bf5d8eb2bab1db736c51528720b6bd36b96b55924d" +checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" dependencies = [ "bytes", "fnv", @@ -1823,7 +1639,7 @@ dependencies = [ "atomic-polyfill", "hash32", "rustc_version", - "spin 0.9.8", + "spin 0.9.4", "stable_deref_trait", ] @@ -1851,12 +1667,6 @@ dependencies = [ "libc", ] -[[package]] -name = "hermit-abi" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" - [[package]] name = "hex" version = "0.4.3" @@ -1868,9 +1678,9 @@ dependencies = [ [[package]] name = "hex-literal" -version = "0.4.1" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" [[package]] name = "hmac" @@ -1894,9 +1704,9 @@ dependencies = [ [[package]] name = "http" -version = "0.2.9" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" +checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", @@ -1914,6 +1724,12 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-range-header" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" + [[package]] name = "httparse" version = "1.8.0" @@ -1944,9 +1760,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.25" +version = "0.14.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc5e554ff619822309ffd57d8734d77cd5ce6238bc956f037ea06c58238c9899" +checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" dependencies = [ "bytes", "futures-channel", @@ -1959,7 +1775,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.4.9", + "socket2", "tokio", "tower-service", "tracing", @@ -1975,10 +1791,10 @@ dependencies = [ "http", "hyper", "log", - "rustls 0.20.8", + "rustls", "rustls-native-certs", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls", ] [[package]] @@ -2008,16 +1824,16 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.56" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" +checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" dependencies = [ "android_system_properties", "core-foundation-sys", 
"iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "winapi", ] [[package]] @@ -2048,9 +1864,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.3" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" dependencies = [ "autocfg", "hashbrown 0.12.3", @@ -2088,31 +1904,30 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.10" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" dependencies = [ - "hermit-abi 0.3.1", "libc", - "windows-sys 0.48.0", + "windows-sys 0.42.0", ] [[package]] name = "ipnet" -version = "2.7.2" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" +checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" [[package]] name = "is-terminal" -version = "0.4.7" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" +checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" dependencies = [ - "hermit-abi 0.3.1", + "hermit-abi 0.2.6", "io-lifetimes", - "rustix 0.37.11", - "windows-sys 0.48.0", + "rustix", + "windows-sys 0.42.0", ] [[package]] @@ -2126,9 +1941,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" [[package]] name = 
"js-sys" @@ -2141,11 +1956,11 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.3.0" +version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" +checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" dependencies = [ - "base64 0.21.0", + "base64 0.13.1", "pem", "ring", "serde", @@ -2187,9 +2002,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.141" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libloading" @@ -2216,12 +2031,6 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" -[[package]] -name = "linux-raw-sys" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f" - [[package]] name = "lock_api" version = "0.4.9" @@ -2314,9 +2123,9 @@ dependencies = [ [[package]] name = "mime" -version = "0.3.17" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "mime_guess" @@ -2336,23 +2145,23 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.6.2" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +checksum = 
"f2e212582ede878b109755efd0773a4f0f4ec851584cf0aefbeb4d9ecc114822" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.6" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" +checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ "libc", "log", "wasi", - "windows-sys 0.45.0", + "windows-sys 0.42.0", ] [[package]] @@ -2385,6 +2194,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom8" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae01545c9c7fc4486ab7debaf2aad7003ac19431791868fb2e8066df97fad2f8" +dependencies = [ + "memchr", +] + [[package]] name = "notify" version = "5.1.0" @@ -2473,9 +2291,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.1" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" [[package]] name = "oorandom" @@ -2540,8 +2358,8 @@ dependencies = [ "futures-util", "opentelemetry", "prost", - "tonic 0.8.3", - "tonic-build 0.8.4", + "tonic", + "tonic-build", ] [[package]] @@ -2593,9 +2411,9 @@ dependencies = [ [[package]] name = "os_info" -version = "3.7.0" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "006e42d5b888366f1880eda20371fedde764ed2213dc8496f49622fa0c99cd5e" +checksum = "5c424bc68d15e0778838ac013b5b3449544d8133633d8016319e7e05a820b8c0" dependencies = [ "log", "serde", @@ -2604,15 +2422,9 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.5.0" +version = "6.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" - -[[package]] -name = "outref" 
-version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" [[package]] name = "overload" @@ -2630,7 +2442,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.1", + "clap 4.1.4", "close_fds", "const_format", "consumption_metrics", @@ -2727,7 +2539,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.2.16", + "redox_syscall", "smallvec", "windows-sys 0.45.0", ] @@ -2755,9 +2567,9 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "petgraph" -version = "0.6.3" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" +checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" dependencies = [ "fixedbitset", "indexmap", @@ -2798,7 +2610,7 @@ checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -2892,14 +2704,14 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls 0.20.8", + "rustls", "rustls-pemfile", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.23.4", + "tokio-rustls", "tracing", "workspace_hack", ] @@ -2965,22 +2777,36 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.25" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" +checksum = "e97e3215779627f01ee256d2fad52f3d95e8e1c11e9fc6fd08f7cd455d5d5c78" dependencies = [ "proc-macro2", - "syn 1.0.109", + "syn", ] [[package]] -name = "prettyplease" -version = "0.2.4" +name = 
"proc-macro-error" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ "proc-macro2", - "syn 2.0.14", + "quote", + "version_check", ] [[package]] @@ -2991,9 +2817,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.56" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" dependencies = [ "unicode-ident", ] @@ -3008,7 +2834,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix 0.36.12", + "rustix", ] [[package]] @@ -3029,9 +2855,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.11.8" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e48e50df39172a3e7eb17e14642445da64996989bc212b583015435d39a58537" +checksum = "21dc42e00223fc37204bd4aa177e69420c604ca4a183209a8f9de30c6d934698" dependencies = [ "bytes", "prost-derive", @@ -3039,9 +2865,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.11.8" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c828f93f5ca4826f97fedcbd3f9a536c16b12cff3dbbb4a007f932bbad95b12" +checksum = "a3f8ad728fb08fe212df3c05169e940fbb6d9d16a877ddde14644a983ba2012e" dependencies = [ "bytes", "heck", @@ -3050,34 +2876,35 @@ dependencies = 
[ "log", "multimap", "petgraph", - "prettyplease 0.1.25", + "prettyplease", "prost", "prost-types", "regex", - "syn 1.0.109", + "syn", "tempfile", "which", ] [[package]] name = "prost-derive" -version = "0.11.8" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ea9b0f8cbe5e15a8a042d030bd96668db28ecb567ec37d691971ff5731d2b1b" +checksum = "8bda8c0881ea9f722eb9629376db3d0b903b462477c1aafcb0566610ac28ac5d" dependencies = [ "anyhow", "itertools", "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] name = "prost-types" -version = "0.11.8" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "379119666929a1afd7a043aa6cf96fa67a6dce9af60c88095a4686dbce4c9c88" +checksum = "a5e0526209433e96d83d750dd81a99118edbc55739e7e61a46764fd2ad537788" dependencies = [ + "bytes", "prost", ] @@ -3092,7 +2919,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.2.1", + "clap 4.1.4", "consumption_metrics", "futures", "git-version", @@ -3122,20 +2949,20 @@ dependencies = [ "reqwest-tracing", "routerify", "rstest", - "rustls 0.20.8", + "rustls", "rustls-pemfile", "scopeguard", "serde", "serde_json", "sha2", - "socket2 0.5.2", + "socket2", "sync_wrapper", "thiserror", "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.23.4", + "tokio-rustls", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -3143,16 +2970,16 @@ dependencies = [ "url", "utils", "uuid", - "webpki-roots 0.23.0", + "webpki-roots", "workspace_hack", "x509-parser", ] [[package]] name = "quote" -version = "1.0.26" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" dependencies = [ "proc-macro2", ] @@ -3189,9 +3016,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.7.0" +version = 
"1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" dependencies = [ "either", "rayon-core", @@ -3199,9 +3026,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.11.0" +version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ -3230,20 +3057,11 @@ dependencies = [ "bitflags", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags", -] - [[package]] name = "regex" -version = "1.7.3" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" +checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" dependencies = [ "aho-corasick", "memchr", @@ -3261,9 +3079,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.29" +version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" [[package]] name = "remote_storage" @@ -3273,8 +3091,8 @@ dependencies = [ "async-trait", "aws-config", "aws-sdk-s3", - "aws-smithy-http 0.51.0", - "aws-types 0.55.0", + "aws-smithy-http", + "aws-types", "hyper", "metrics", "once_cell", @@ -3293,9 +3111,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.16" +version = "0.11.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" +checksum = "21eed90ec8570952d53b772ecf8f206aa1ec9a3d76b2521c56c42973f2d91ee9" dependencies = [ "base64 0.21.0", "bytes", @@ -3315,27 +3133,27 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.20.8", + "rustls", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "webpki-roots 0.22.6", + "webpki-roots", "winreg", ] [[package]] name = "reqwest-middleware" -version = "0.2.1" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b" +checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894" dependencies = [ "anyhow", "async-trait", @@ -3348,12 +3166,11 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.1" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8" +checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce" dependencies = [ "async-trait", - "getrandom", "opentelemetry", "reqwest", "reqwest-middleware", @@ -3392,18 +3209,18 @@ dependencies = [ [[package]] name = "rpds" -version = "0.13.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bd6ce569b15c331b1e5fd8cf6adb0bf240678b5f0cdc4d0f41e11683f6feba9" +checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" dependencies = [ "archery", ] [[package]] name = "rstest" -version = "0.17.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de1bb486a691878cd320c2f0d319ba91eeaa2e894066d8b5f8f117c000e9d962" +checksum = 
"b07f2d176c472198ec1e6551dc7da28f1c089652f66a7b722676c2238ebc0edf" dependencies = [ "futures", "futures-timer", @@ -3413,23 +3230,23 @@ dependencies = [ [[package]] name = "rstest_macros" -version = "0.17.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290ca1a1c8ca7edb7c3283bd44dc35dd54fdec6253a3912e201ba1072018fca8" +checksum = "7229b505ae0706e64f37ffc54a9c163e11022a6636d58fe1f3f52018257ff9f7" dependencies = [ "cfg-if", "proc-macro2", "quote", "rustc_version", - "syn 1.0.109", + "syn", "unicode-ident", ] [[package]] name = "rustc-demangle" -version = "0.1.22" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b" +checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" [[package]] name = "rustc-hash" @@ -3457,30 +3274,16 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.12" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", - "linux-raw-sys 0.1.4", - "windows-sys 0.45.0", -] - -[[package]] -name = "rustix" -version = "0.37.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" -dependencies = [ - "bitflags", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.1", - "windows-sys 0.48.0", + "linux-raw-sys", + "windows-sys 0.42.0", ] [[package]] @@ -3495,18 +3298,6 @@ dependencies = [ "webpki", ] -[[package]] -name = "rustls" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d" -dependencies = [ - "log", - 
"ring", - "rustls-webpki", - "sct", -] - [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -3528,27 +3319,17 @@ dependencies = [ "base64 0.21.0", ] -[[package]] -name = "rustls-webpki" -version = "0.100.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "rustversion" -version = "1.0.12" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" +checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70" [[package]] name = "ryu" -version = "1.0.13" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" [[package]] name = "safekeeper" @@ -3560,7 +3341,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.1", + "clap 4.1.4", "const_format", "crc32c", "fs2", @@ -3633,9 +3414,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "scratch" -version = "1.0.5" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" +checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" [[package]] name = "sct" @@ -3672,33 +3453,33 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.17" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" [[package]] name = "sentry" -version = "0.30.0" +version = "0.29.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5ce6d3512e2617c209ec1e86b0ca2fea06454cd34653c91092bf0f3ec41f8e3" +checksum = "a6097dc270a9c4555c5d6222ed243eaa97ff38e29299ed7c5cb36099033c604e" dependencies = [ "httpdate", "reqwest", - "rustls 0.20.8", + "rustls", "sentry-backtrace", "sentry-contexts", "sentry-core", "sentry-panic", "tokio", "ureq", - "webpki-roots 0.22.6", + "webpki-roots", ] [[package]] name = "sentry-backtrace" -version = "0.30.0" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7fe408d4d1f8de188a9309916e02e129cbe51ca19e55badea5a64899399b1a" +checksum = "9d92d1e4d591534ae4f872d6142f3b500f4ffc179a6aed8a3e86c7cc96d10a6a" dependencies = [ "backtrace", "once_cell", @@ -3708,9 +3489,9 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.30.0" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5695096a059a89973ec541062d331ff4c9aeef9c2951416c894f0fff76340e7d" +checksum = "3afa877b1898ff67dd9878cf4bec4e53cef7d3be9f14b1fc9e4fcdf36f8e4259" dependencies = [ "hostname", "libc", @@ -3722,9 +3503,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.30.0" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b22828bfd118a7b660cf7a155002a494755c0424cebb7061e4743ecde9c7dbc" +checksum = "fc43eb7e4e3a444151a0fe8a0e9ce60eabd905dae33d66e257fa26f1b509c1bd" dependencies = [ "once_cell", "rand", @@ -3735,9 +3516,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.30.0" +version = "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4ced2a7a8c14899d58eec402d946f69d5ed26a3fc363a7e8b1e5cb88473a01" +checksum = "ccab4fab11e3e63c45f4524bee2e75cde39cdf164cb0b0cbe6ccd1948ceddf66" dependencies = [ "sentry-backtrace", "sentry-core", @@ -3745,9 +3526,9 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.30.0" +version 
= "0.29.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360ee3270f7a4a1eee6c667f7d38360b995431598a73b740dfe420da548d9cc9" +checksum = "f63708ec450b6bdcb657af760c447416d69c38ce421f34e5e2e9ce8118410bc7" dependencies = [ "debugid", "getrandom", @@ -3762,44 +3543,35 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.160" +version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" +checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.160" +version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" +checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.14", + "syn", ] [[package]] name = "serde_json" -version = "1.0.95" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744" +checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" dependencies = [ "itoa", "ryu", "serde", ] -[[package]] -name = "serde_spanned" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" -dependencies = [ - "serde", -] - [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -3814,9 +3586,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.3.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0" +checksum = "30d904179146de381af4c93d3af6ca4984b3152db687dacb9c3c35e86f39809c" dependencies = [ "base64 
0.13.1", "chrono", @@ -3830,14 +3602,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.3.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c" +checksum = "a1966009f3c05f095697c537312f5415d1e3ed31ce0a56942bac4c771c5c335e" dependencies = [ "darling", "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -3879,9 +3651,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "signal-hook" -version = "0.3.15" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" +checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ "libc", "signal-hook-registry", @@ -3900,9 +3672,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ "libc", ] @@ -3927,9 +3699,9 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.8" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" dependencies = [ "autocfg", ] @@ -3942,24 +3714,14 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" -version = "0.4.9" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" 
+checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ "libc", "winapi", ] -[[package]] -name = "socket2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" -dependencies = [ - "libc", - "windows-sys 0.48.0", -] - [[package]] name = "spin" version = "0.5.2" @@ -3968,9 +3730,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "spin" -version = "0.9.8" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" dependencies = [ "lock_api", ] @@ -3994,7 +3756,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.2.1", + "clap 4.1.4", "const_format", "futures", "futures-core", @@ -4008,8 +3770,8 @@ dependencies = [ "prost", "tokio", "tokio-stream", - "tonic 0.9.1", - "tonic-build 0.9.1", + "tonic", + "tonic-build", "tracing", "utils", "workspace_hack", @@ -4047,7 +3809,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 1.0.109", + "syn", ] [[package]] @@ -4064,20 +3826,9 @@ checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" [[package]] name = "syn" -version = "1.0.109" +version = "1.0.107" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf316d5356ed6847742d036f8a39c3b8435cac10bd528a4bd461928a6ab34d5" +checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" dependencies = [ "proc-macro2", "quote", @@ -4098,7 +3849,7 @@ 
checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", "unicode-xid", ] @@ -4115,24 +3866,24 @@ dependencies = [ [[package]] name = "task-local-extensions" -version = "0.1.4" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8" +checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4" dependencies = [ - "pin-utils", + "tokio", ] [[package]] name = "tempfile" -version = "3.5.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", - "rustix 0.37.11", - "windows-sys 0.45.0", + "redox_syscall", + "rustix", + "windows-sys 0.42.0", ] [[package]] @@ -4172,7 +3923,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" dependencies = [ "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -4183,39 +3934,38 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.14", + "syn", ] [[package]] name = "thread_local" -version = "1.1.7" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" dependencies = [ - "cfg-if", "once_cell", ] [[package]] name = "time" -version = "0.3.20" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" +checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" dependencies = [ "itoa", "serde", @@ -4231,9 +3981,9 @@ checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" [[package]] name = "time-macros" -version = "0.2.8" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" +checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" dependencies = [ "time-core", ] @@ -4259,9 +4009,9 @@ dependencies = [ [[package]] name = "tinyvec_macros" -version = "0.1.1" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tls-listener" @@ -4274,25 +4024,26 @@ dependencies = [ "pin-project-lite", "thiserror", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls", ] [[package]] name = "tokio" -version = "1.27.0" +version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001" +checksum = "c8e00990ebabbe4c14c08aca901caed183ecd5c09562a12c824bb53d3c3fd3af" dependencies = [ "autocfg", "bytes", "libc", + "memchr", "mio", 
"num_cpus", "pin-project-lite", "signal-hook-registry", - "socket2 0.4.9", + "socket2", "tokio-macros", - "windows-sys 0.45.0", + "windows-sys 0.42.0", ] [[package]] @@ -4307,13 +4058,13 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.0.0" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" +checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.14", + "syn", ] [[package]] @@ -4334,7 +4085,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "socket2 0.4.9", + "socket2", "tokio", "tokio-util", ] @@ -4347,10 +4098,10 @@ checksum = "606f2b73660439474394432239c82249c0d45eb5f23d91f401be1e33590444a7" dependencies = [ "futures", "ring", - "rustls 0.20.8", + "rustls", "tokio", "tokio-postgres", - "tokio-rustls 0.23.4", + "tokio-rustls", ] [[package]] @@ -4359,26 +4110,16 @@ version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" dependencies = [ - "rustls 0.20.8", + "rustls", "tokio", "webpki", ] -[[package]] -name = "tokio-rustls" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" -dependencies = [ - "rustls 0.21.0", - "tokio", -] - [[package]] name = "tokio-stream" -version = "0.1.12" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -4393,7 +4134,7 @@ dependencies = [ "filetime", "futures-core", "libc", - "redox_syscall 0.2.16", + "redox_syscall", "tokio", "tokio-stream", 
"xattr", @@ -4413,9 +4154,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.7" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740" dependencies = [ "bytes", "futures-core", @@ -4427,36 +4168,33 @@ dependencies = [ [[package]] name = "toml" -version = "0.7.3" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" dependencies = [ "serde", - "serde_spanned", - "toml_datetime", - "toml_edit", ] [[package]] name = "toml_datetime" -version = "0.6.1" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622" +checksum = "4553f467ac8e3d374bc9a177a26801e5d0f9b211aa1673fb137a403afd1c9cf5" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.19.8" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13" +checksum = "a34cc558345efd7e88b9eda9626df2138b80bb46a7606f695e751c892bc7dac6" dependencies = [ "indexmap", + "itertools", + "nom8", "serde", - "serde_spanned", "toml_datetime", - "winnow", ] [[package]] @@ -4481,7 +4219,10 @@ dependencies = [ "pin-project", "prost", "prost-derive", + "rustls-native-certs", + "rustls-pemfile", "tokio", + "tokio-rustls", "tokio-stream", "tokio-util", "tower", @@ -4491,62 +4232,17 @@ dependencies = [ "tracing-futures", ] -[[package]] -name = "tonic" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3" -dependencies = [ 
- "async-stream", - "async-trait", - "axum", - "base64 0.21.0", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "rustls-native-certs", - "rustls-pemfile", - "tokio", - "tokio-rustls 0.24.0", - "tokio-stream", - "tower", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "tonic-build" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" dependencies = [ - "prettyplease 0.1.25", + "prettyplease", "proc-macro2", "prost-build", "quote", - "syn 1.0.109", -] - -[[package]] -name = "tonic-build" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7" -dependencies = [ - "prettyplease 0.1.25", - "proc-macro2", - "prost-build", - "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -4569,6 +4265,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower-http" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858" +dependencies = [ + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.2" @@ -4586,7 +4301,7 @@ name = "trace" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.1", + "clap 4.1.4", "pageserver_api", "utils", "workspace_hack", @@ -4613,7 +4328,7 @@ checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -4759,15 +4474,15 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" [[package]] name = "unicode-normalization" @@ -4805,10 +4520,10 @@ dependencies = [ "base64 0.13.1", "log", "once_cell", - "rustls 0.20.8", + "rustls", "url", "webpki", - "webpki-roots 0.22.6", + "webpki-roots", ] [[package]] @@ -4835,12 +4550,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" -[[package]] -name = "utf8parse" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" - [[package]] name = "utils" version = "0.1.0" @@ -4884,9 +4593,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb" +checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" dependencies = [ "getrandom", "serde", @@ -4904,18 +4613,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" -[[package]] -name = "vsimd" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" - [[package]] name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 
4.2.1", + "clap 4.1.4", "env_logger", "log", "once_cell", @@ -4927,11 +4630,12 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.3" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" dependencies = [ "same-file", + "winapi", "winapi-util", ] @@ -4972,7 +4676,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn", "wasm-bindgen-shared", ] @@ -5006,7 +4710,7 @@ checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5046,15 +4750,6 @@ dependencies = [ "webpki", ] -[[package]] -name = "webpki-roots" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa54963694b65584e170cf5dc46aeb4dcaa5584e652ff5f3952e56d66aff0125" -dependencies = [ - "rustls-webpki", -] - [[package]] name = "which" version = "4.4.0" @@ -5097,28 +4792,19 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" -dependencies = [ - "windows-targets 0.48.0", -] - [[package]] name = "windows-sys" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 
0.42.2", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] @@ -5127,140 +4813,65 @@ version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows-targets 0.42.2", -] - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.0", + "windows-targets", ] [[package]] name = "windows-targets" -version = "0.42.2" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - -[[package]] -name = "windows-targets" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" -dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.2" +version = "0.42.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" [[package]] name = "windows_aarch64_msvc" -version = "0.42.2" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" [[package]] name = "windows_i686_gnu" -version = "0.42.2" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - -[[package]] -name = "windows_i686_gnu" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" [[package]] name = "windows_i686_msvc" -version = "0.42.2" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - -[[package]] -name = "windows_i686_msvc" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" [[package]] name = 
"windows_x86_64_gnu" -version = "0.42.2" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.2" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" [[package]] name = "windows_x86_64_msvc" -version = "0.42.2" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" - -[[package]] -name = "winnow" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28" -dependencies = [ - "memchr", -] +checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" [[package]] name = "winreg" @@ -5279,8 +4890,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.1", - "clap_builder", + "clap 4.1.4", "crossbeam-utils", "digest", "either", @@ -5292,6 +4902,7 @@ dependencies = [ "futures-sink", 
"futures-util", "hashbrown 0.12.3", + "indexmap", "itertools", "libc", "log", @@ -5306,18 +4917,16 @@ dependencies = [ "regex-syntax", "reqwest", "ring", - "rustls 0.20.8", + "rustls", "scopeguard", "serde", "serde_json", - "socket2 0.4.9", - "syn 1.0.109", - "syn 2.0.14", + "socket2", + "syn", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls", "tokio-util", - "toml_datetime", - "toml_edit", + "tonic", "tower", "tracing", "tracing-core", @@ -5327,11 +4936,12 @@ dependencies = [ [[package]] name = "x509-parser" -version = "0.15.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634" +checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" dependencies = [ "asn1-rs", + "base64 0.13.1", "data-encoding", "der-parser", "lazy_static", @@ -5359,15 +4969,15 @@ checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" [[package]] name = "yasna" -version = "0.5.2" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd" +checksum = "aed2e7a52e3744ab4d0c05c20aa065258e84c49fd4226f5191b2ed29712710b4" dependencies = [ "time", ] [[package]] name = "zeroize" -version = "1.6.0" +version = "1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f" diff --git a/Cargo.toml b/Cargo.toml index 0b545e6190..679605dc1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ atty = "0.2.14" aws-config = { version = "0.51.0", default-features = false, features=["rustls"] } aws-sdk-s3 = "0.21.0" aws-smithy-http = "0.51.0" -aws-types = "0.55" +aws-types = "0.51.0" base64 = "0.13.0" bincode = "1.3" -bindgen = "0.65" +bindgen = "0.61" bstr = "1.0" byteorder = "1.4" 
bytes = "1.0" @@ -50,7 +50,7 @@ git-version = "0.3" hashbrown = "0.13" hashlink = "0.8.1" hex = "0.4" -hex-literal = "0.4" +hex-literal = "0.3" hmac = "0.12.1" hostname = "0.3.1" humantime = "2.1" @@ -80,18 +80,18 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls" reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } reqwest-middleware = "0.2.0" routerify = "3" -rpds = "0.13" +rpds = "0.12.0" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" scopeguard = "1.1" -sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "2.0" sha2 = "0.10.2" signal-hook = "0.3" -socket2 = "0.5" +socket2 = "0.4.4" strum = "0.24" strum_macros = "0.24" svg_fmt = "0.4.1" @@ -106,17 +106,17 @@ tokio-postgres-rustls = "0.9.0" tokio-rustls = "0.23" tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["io"] } -toml = "0.7" -toml_edit = "0.19" -tonic = {version = "0.9", features = ["tls", "tls-roots"]} +toml = "0.5" +toml_edit = { version = "0.17", features = ["easy"] } +tonic = {version = "0.8", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2" uuid = { version = "1.2", features = ["v4", "serde"] } walkdir = "2.3.2" -webpki-roots = "0.23" -x509-parser = "0.15" +webpki-roots = "0.22.5" +x509-parser = "0.14" ## TODO replace this with tracing env_logger = "0.10" @@ -154,9 +154,9 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.4" rcgen = "0.10" -rstest = "0.17" +rstest = "0.16" tempfile = "3.4" -tonic-build = "0.9" +tonic-build = "0.8" # This is only needed for proxy's tests. 
# TODO: we should probably fork `tokio-postgres-rustls` instead. diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index 3f290821c2..f26aa2fbc5 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -4,12 +4,13 @@ version = "0.1.0" edition = "2021" license = "Apache-2.0" -[dependencies] -anyhow.workspace = true -chrono.workspace = true -rand.workspace = true -serde.workspace = true -serde_with.workspace = true -utils.workspace = true +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html -workspace_hack.workspace = true +[dependencies] +anyhow = "1.0.68" +chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } +rand = "0.8.3" +serde = "1.0.152" +serde_with = "2.1.0" +utils = { version = "0.1.0", path = "../utils" } +workspace_hack = { version = "0.1.0", path = "../../workspace_hack" } diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index f7e39751ef..66221af522 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use std::process::Command; use anyhow::{anyhow, Context}; -use bindgen::callbacks::{DeriveInfo, ParseCallbacks}; +use bindgen::callbacks::ParseCallbacks; #[derive(Debug)] struct PostgresFfiCallbacks; @@ -20,7 +20,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { // Add any custom #[derive] attributes to the data structures that bindgen // creates. - fn add_derives(&self, derive_info: &DeriveInfo) -> Vec { + fn add_derives(&self, name: &str) -> Vec { // This is the list of data structures that we want to serialize/deserialize. let serde_list = [ "XLogRecord", @@ -31,7 +31,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { "ControlFileData", ]; - if serde_list.contains(&derive_info.name) { + if serde_list.contains(&name) { vec![ "Default".into(), // Default allows us to easily fill the padding fields with 0. 
"Serialize".into(), diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs index 048e99d841..eb52409c44 100644 --- a/libs/remote_storage/tests/pagination_tests.rs +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -204,7 +204,12 @@ async fn upload_s3_data( let data = format!("remote blob data {i}").into_bytes(); let data_len = data.len(); task_client - .upload(std::io::Cursor::new(data), data_len, &blob_path, None) + .upload( + Box::new(std::io::Cursor::new(data)), + data_len, + &blob_path, + None, + ) .await?; Ok::<_, anyhow::Error>((blob_prefix, blob_path)) diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index b285c9b5b0..8c3d3f9063 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -14,5 +14,4 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true - -workspace_hack.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index dc6326e73e..391bc52a80 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -33,7 +33,7 @@ serde_with.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true -uuid.workspace = true +uuid = { version = "1.2", features = ["v4", "serde"] } metrics.workspace = true workspace_hack.workspace = true diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0c87e208c8..19f0f22815 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -725,9 +725,8 @@ impl PageServerConf { "disk_usage_based_eviction" => { tracing::info!("disk_usage_based_eviction: {:#?}", &item); builder.disk_usage_based_eviction( - deserialize_from_item_string("disk_usage_based_eviction", item) - .context("parse disk_usage_based_eviction")? 
- ) + toml_edit::de::from_item(item.clone()) + .context("parse disk_usage_based_eviction")?) }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), @@ -828,14 +827,14 @@ impl PageServerConf { if let Some(eviction_policy) = item.get("eviction_policy") { t_conf.eviction_policy = Some( - deserialize_from_item_string("eviction_policy", eviction_policy) + toml_edit::de::from_item(eviction_policy.clone()) .context("parse eviction_policy")?, ); } if let Some(item) = item.get("min_resident_size_override") { t_conf.min_resident_size_override = Some( - deserialize_from_item_string("min_resident_size_override", item) + toml_edit::de::from_item(item.clone()) .context("parse min_resident_size_override")?, ); } @@ -939,19 +938,6 @@ where }) } -fn deserialize_from_item_string(name: &str, item: &Item) -> anyhow::Result -where - T: serde::de::DeserializeOwned, -{ - // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way - let item_string = item.to_string(); - let deserializer = item_string - .trim() - .parse::() - .with_context(|| format!("parsing item for node {name} as ValueDeserializer"))?; - T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) -} - /// Configurable semaphore permits setting. /// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bd38a7a2f3..c0e4a2a9cf 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -65,7 +65,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream { // We were requested to shut down. 
- let msg = "pageserver is shutting down".to_string(); + let msg = format!("pageserver is shutting down"); let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)); Err(QueryError::Other(anyhow::anyhow!(msg))) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 67bc1b36b0..03a4ff8c8e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1876,7 +1876,7 @@ impl Tenant { .to_string(); // Convert the config to a toml file. - conf_content += &toml_edit::ser::to_string(&tenant_conf)?; + conf_content += &toml_edit::easy::to_string(&tenant_conf)?; let mut target_config_file = VirtualFile::open_with_options( target_config_path, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 9b719db180..cdabb23a7b 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -275,9 +275,9 @@ mod tests { ..TenantConfOpt::default() }; - let toml_form = toml_edit::ser::to_string(&small_conf).unwrap(); + let toml_form = toml_edit::easy::to_string(&small_conf).unwrap(); assert_eq!(toml_form, "gc_horizon = 42\n"); - assert_eq!(small_conf, toml_edit::de::from_str(&toml_form).unwrap()); + assert_eq!(small_conf, toml_edit::easy::from_str(&toml_form).unwrap()); let json_form = serde_json::to_string(&small_conf).unwrap(); assert_eq!(json_form, "{\"gc_horizon\":42}"); diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 699121ccd9..ce9f4d9bf8 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -74,7 +74,7 @@ pub(super) async fn upload_timeline_layer<'a>( })?; storage - .upload(source_file, fs_size, &storage_path, None) + .upload(Box::new(source_file), fs_size, &storage_path, None) .await .with_context(|| { format!( diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 
de7b634ba0..d7ace28426 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -23,6 +23,7 @@ use std::convert::Infallible; use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; +use std::task::Poll; use std::time::Duration; use tokio::sync::broadcast; use tokio::sync::broadcast::error::RecvError; @@ -373,7 +374,7 @@ impl BrokerService for Broker { Ok(info) => yield info, Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += skipped_msg; - if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { + if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); missed_msgs = 0; diff --git a/trace/Cargo.toml b/trace/Cargo.toml index d6eed3f49c..6ced992d4c 100644 --- a/trace/Cargo.toml +++ b/trace/Cargo.toml @@ -4,6 +4,8 @@ version = "0.1.0" edition.workspace = true license.workspace = true +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + [dependencies] clap.workspace = true anyhow.workspace = true diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f735ffed4c..f885f4a94d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -18,7 +18,6 @@ byteorder = { version = "1" } bytes = { version = "1", features = ["serde"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } clap = { version = "4", features = ["derive", "string"] } -clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } digest = { version = "0.10", features = ["mac", "std"] } either = { version = "1" } @@ -30,6 +29,7 @@ futures-executor = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = 
["channel", "io", "sink"] } hashbrown = { version = "0.12", features = ["raw"] } +indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -52,8 +52,7 @@ socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] } tokio-rustls = { version = "0.23" } tokio-util = { version = "0.7", features = ["codec", "io"] } -toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } -toml_edit = { version = "0.19", features = ["serde"] } +tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } @@ -65,6 +64,7 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } either = { version = "1" } hashbrown = { version = "0.12", features = ["raw"] } +indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -74,7 +74,6 @@ prost = { version = "0.11" } regex = { version = "1" } regex-syntax = { version = "0.6" } serde = { version = "1", features = ["alloc", "derive"] } -syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] } +syn = { version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } ### END HAKARI SECTION From 
5d0ecadf7cb56039ad541f515135e94d634f1752 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 12 Apr 2023 16:16:39 +0300 Subject: [PATCH 272/426] Add support for non-SNI case in multi-cert proxy When no SNI is provided use the default certificate, otherwise we can't get to the options parameter which can be used to set endpoint name too. That means that non-SNI flow will not work for CNAME domains in verify-full mode. --- proxy/src/config.rs | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index ad51502b49..0ceb556ca1 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -40,7 +40,7 @@ pub fn configure_tls( let mut cert_resolver = CertResolver::new(); // add default certificate - cert_resolver.add_cert(key_path, cert_path)?; + cert_resolver.add_cert(key_path, cert_path, true)?; // add extra certificates if let Some(certs_dir) = certs_dir { @@ -52,8 +52,11 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver - .add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; + cert_resolver.add_cert( + &key_path.to_string_lossy(), + &cert_path.to_string_lossy(), + false, + )?; } } } @@ -78,16 +81,23 @@ pub fn configure_tls( struct CertResolver { certs: HashMap>, + default: Option>, } impl CertResolver { fn new() -> Self { Self { certs: HashMap::new(), + default: None, } } - fn add_cert(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + fn add_cert( + &mut self, + key_path: &str, + cert_path: &str, + is_default: bool, + ) -> anyhow::Result<()> { let priv_key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) @@ -136,10 +146,13 @@ impl CertResolver { "Failed to parse common name from certificate at '{cert_path}'." 
))?; - self.certs.insert( - common_name, - Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)), - ); + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + if is_default { + self.default = Some(cert.clone()); + } + + self.certs.insert(common_name, cert); Ok(()) } @@ -172,7 +185,17 @@ impl rustls::server::ResolvesServerCert for CertResolver { } } } else { - None + // No SNI, use the default certificate, otherwise we can't get to + // options parameter which can be used to set endpoint name too. + // That means that non-SNI flow will not work for CNAME domains in + // verify-full mode. + // + // If that will be a problem we can: + // + // a) Instead of multi-cert approach use single cert with extra + // domains listed in Subject Alternative Name (SAN). + // b) Deploy separate proxy instances for extra domains. + self.default.as_ref().cloned() } } } From 732acc54c1fa744fc0c5c48158c7716371e70b89 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 13 Apr 2023 10:19:34 +0300 Subject: [PATCH 273/426] Add check for duplicates of generated image layers (#3869) ## Describe your changes ## Issue ticket number and link #3673 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
--------- Co-authored-by: Heikki Linnakangas --- pageserver/benches/bench_layer_map.rs | 4 +-- pageserver/src/tenant.rs | 7 +++++- pageserver/src/tenant/layer_map.rs | 21 ++++++++++------ .../layer_map/historic_layer_coverage.rs | 8 ++++++ pageserver/src/tenant/timeline.rs | 25 ++++++++++++------- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5edfa84d8a..4882fc518f 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)).unwrap(); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) { is_incremental: false, short_id: format!("Layer {}", i), }; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)).unwrap(); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 03a4ff8c8e..7e88a12963 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -267,7 +267,10 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - self.initialize(ctx) + // Initialize without loading the layer map. We started with an empty layer map, and already + // updated it for the layers that we created during the import. + let mut timelines = self.owning_tenant.timelines.lock().unwrap(); + self.initialize_with_lock(ctx, &mut timelines, false, true) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -2308,6 +2311,8 @@ impl Tenant { ) })?; + // Initialize the timeline without loading the layer map, because we already updated the layer + // map above, when we imported the datadir. 
let timeline = { let mut timelines = self.timelines.lock().unwrap(); raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 4c659be9aa..02159ee291 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -52,7 +52,7 @@ use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; -use anyhow::Result; +use anyhow::{bail, Result}; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; @@ -126,7 +126,7 @@ where /// /// Insert an on-disk layer. /// - pub fn insert_historic(&mut self, layer: Arc) { + pub fn insert_historic(&mut self, layer: Arc) -> anyhow::Result<()> { self.layer_map.insert_historic_noflush(layer) } @@ -274,17 +274,22 @@ where /// /// Helper function for BatchedUpdates::insert_historic /// - pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { - self.historic.insert( - historic_layer_coverage::LayerKey::from(&*layer), - Arc::clone(&layer), - ); + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) -> anyhow::Result<()> { + let key = historic_layer_coverage::LayerKey::from(&*layer); + if self.historic.contains(&key) { + bail!( + "Attempt to insert duplicate layer {} in layer map", + layer.short_id() + ); + } + self.historic.insert(key, Arc::clone(&layer)); if Self::is_l0(&layer) { self.l0_delta_layers.push(layer); } NUM_ONDISK_LAYERS.inc(); + Ok(()) } /// @@ -838,7 +843,7 @@ mod tests { let expected_in_counts = (1, usize::from(expected_l0)); - map.batch_update().insert_historic(remote.clone()); + map.batch_update().insert_historic(remote.clone()).unwrap(); assert_eq!(count_layer_in(&map, &remote), expected_in_counts); let replaced = map diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 
b63c361314..1fdcd5e5a4 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -417,6 +417,14 @@ impl BufferedHistoricLayerCoverage { } } + pub fn contains(&self, layer_key: &LayerKey) -> bool { + match self.buffer.get(layer_key) { + Some(None) => false, // layer remove was buffered + Some(_) => true, // layer insert was buffered + None => self.layers.contains_key(layer_key), // no buffered ops for this layer + } + } + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { self.buffer.insert(layer_key, Some(value)); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4b0d7a6994..29d8b544cc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1446,7 +1446,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer))?; num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1478,7 +1478,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer))?; num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1552,7 +1552,7 @@ impl Timeline { // remote index file? // If so, rename_to_backup those files & replace their local layer with // a RemoteLayer in the layer map so that we re-download them on-demand. 
- if let Some(local_layer) = local_layer { + if let Some(local_layer) = &local_layer { let local_layer_path = local_layer .local_path() .expect("caller must ensure that local_layers only contains local layers"); @@ -1577,7 +1577,6 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1613,7 +1612,11 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer); + if let Some(local_layer) = &local_layer { + updates.replace_historic(local_layer, remote_layer)?; + } else { + updates.insert_historic(remote_layer)?; + } } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1637,7 +1640,11 @@ impl Timeline { LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer); + if let Some(local_layer) = &local_layer { + updates.replace_historic(local_layer, remote_layer)?; + } else { + updates.insert_historic(remote_layer)?; + } } } } @@ -2684,7 +2691,7 @@ impl Timeline { .write() .unwrap() .batch_update() - .insert_historic(Arc::new(new_delta)); + .insert_historic(Arc::new(new_delta))?; // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2889,7 +2896,7 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l)); + updates.insert_historic(Arc::new(l))?; } updates.flush(); drop(layers); @@ -3322,7 +3329,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - updates.insert_historic(x); + updates.insert_historic(x)?; } // Now that we have reshuffled the data to set of new delta layers, we can From 15d1f85552231a5f6d4d500ac75010669dbec540 Mon Sep 
17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 13 Apr 2023 12:11:43 +0300 Subject: [PATCH 274/426] Add reason to TenantState::Broken (#3954) Reason and backtrace are added to the Broken state. Backtrace is automatically collected when tenant entered the broken state. The format for API, CLI and metrics is changed and unified to return tenant state name in camel case. Previously snake case was used for metrics and camel case was used for everything else. Now tenant state field in TenantInfo swagger spec is changed to contain state name in "slug" field and other fields (currently only reason and backtrace for Broken variant in "data" field). To allow for this breaking change state was removed from TenantInfo swagger spec because it was not used anywhere. Please note that the tenant's broken reason is not persisted on disk so the reason is lost when pageserver is restarted. Requires changes to grafana dashboard that monitors tenant states. Closes #3001 --------- Co-authored-by: theirix --- Cargo.lock | 2 + libs/pageserver_api/Cargo.toml | 4 +- libs/pageserver_api/src/models.rs | 106 ++++++++++++++---- pageserver/src/http/openapi_spec.yml | 3 - pageserver/src/http/routes.rs | 6 +- pageserver/src/metrics.rs | 14 +-- pageserver/src/tenant.rs | 56 +++++---- pageserver/src/tenant/mgr.rs | 4 +- pageserver/src/tenant/tasks.rs | 2 +- test_runner/fixtures/pageserver/utils.py | 26 +++-- test_runner/regress/test_ondemand_download.py | 6 +- .../regress/test_pageserver_restart.py | 2 +- test_runner/regress/test_tenant_conf.py | 4 +- test_runner/regress/test_tenant_relocation.py | 6 +- test_runner/regress/test_tenant_tasks.py | 15 +-- test_runner/regress/test_tenants.py | 12 +- .../test_tenants_with_remote_storage.py | 6 +- test_runner/regress/test_timeline_size.py | 4 +- test_runner/regress/test_wal_acceptor.py | 2 +- 19 files changed, 181 insertions(+), 99 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 668487a9bd..fc587c57bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ 
-2517,6 +2517,8 @@ dependencies = [ "serde", "serde_json", "serde_with", + "strum", + "strum_macros", "utils", "workspace_hack", ] diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 7709da1072..f97ec54e91 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] serde.workspace = true serde_with.workspace = true +serde_json.workspace = true const_format.workspace = true anyhow.workspace = true bytes.workspace = true @@ -14,6 +15,7 @@ byteorder.workspace = true utils.workspace = true postgres_ffi.workspace = true enum-map.workspace = true -serde_json.workspace = true +strum.workspace = true +strum_macros.workspace = true workspace_hack.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 98a4b56858..a351761f4a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,6 +7,7 @@ use std::{ use byteorder::{BigEndian, ReadBytesExt}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; +use strum_macros; use utils::{ history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, @@ -18,11 +19,23 @@ use anyhow::bail; use bytes::{BufMut, Bytes, BytesMut}; /// A state of a tenant in pageserver's memory. -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[derive( + Clone, + PartialEq, + Eq, + serde::Serialize, + serde::Deserialize, + strum_macros::Display, + strum_macros::EnumString, + strum_macros::EnumVariantNames, + strum_macros::AsRefStr, + strum_macros::IntoStaticStr, +)] +#[serde(tag = "slug", content = "data")] pub enum TenantState { - // This tenant is being loaded from local disk + /// This tenant is being loaded from local disk Loading, - // This tenant is being downloaded from cloud storage. + /// This tenant is being downloaded from cloud storage. 
Attaching, /// Tenant is fully operational Active, @@ -31,15 +44,7 @@ pub enum TenantState { Stopping, /// A tenant is recognized by the pageserver, but can no longer be used for /// any operations, because it failed to be activated. - Broken, -} - -pub mod state { - pub const LOADING: &str = "loading"; - pub const ATTACHING: &str = "attaching"; - pub const ACTIVE: &str = "active"; - pub const STOPPING: &str = "stopping"; - pub const BROKEN: &str = "broken"; + Broken { reason: String, backtrace: String }, } impl TenantState { @@ -49,17 +54,26 @@ impl TenantState { Self::Attaching => true, Self::Active => false, Self::Stopping => false, - Self::Broken => false, + Self::Broken { .. } => false, } } - pub fn as_str(&self) -> &'static str { + pub fn broken_from_reason(reason: String) -> Self { + let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture()); + Self::Broken { + reason, + backtrace: backtrace_str, + } + } +} + +impl std::fmt::Debug for TenantState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - TenantState::Loading => state::LOADING, - TenantState::Attaching => state::ATTACHING, - TenantState::Active => state::ACTIVE, - TenantState::Stopping => state::STOPPING, - TenantState::Broken => state::BROKEN, + Self::Broken { reason, backtrace } if !reason.is_empty() => { + write!(f, "Broken due to: {reason}. 
Backtrace:\n{backtrace}") + } + _ => write!(f, "{self}"), } } } @@ -615,6 +629,7 @@ impl PagestreamBeMessage { #[cfg(test)] mod tests { use bytes::Buf; + use serde_json::json; use super::*; @@ -665,4 +680,57 @@ mod tests { assert!(msg == reconstructed); } } + + #[test] + fn test_tenantinfo_serde() { + // Test serialization/deserialization of TenantInfo + let original_active = TenantInfo { + id: TenantId::generate(), + state: TenantState::Active, + current_physical_size: Some(42), + has_in_progress_downloads: Some(false), + }; + let expected_active = json!({ + "id": original_active.id.to_string(), + "state": { + "slug": "Active", + }, + "current_physical_size": 42, + "has_in_progress_downloads": false, + }); + + let original_broken = TenantInfo { + id: TenantId::generate(), + state: TenantState::Broken { + reason: "reason".into(), + backtrace: "backtrace info".into(), + }, + current_physical_size: Some(42), + has_in_progress_downloads: Some(false), + }; + let expected_broken = json!({ + "id": original_broken.id.to_string(), + "state": { + "slug": "Broken", + "data": { + "backtrace": "backtrace info", + "reason": "reason", + } + }, + "current_physical_size": 42, + "has_in_progress_downloads": false, + }); + + assert_eq!( + serde_json::to_value(&original_active).unwrap(), + expected_active + ); + + assert_eq!( + serde_json::to_value(&original_broken).unwrap(), + expected_broken + ); + assert!(format!("{:?}", &original_broken.state).contains("reason")); + assert!(format!("{:?}", &original_broken.state).contains("backtrace info")); + } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 478e9d228a..b0e4e1ca85 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -829,12 +829,9 @@ components: type: object required: - id - - state properties: id: type: string - state: - type: string current_physical_size: type: integer has_in_progress_downloads: diff --git 
a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2db60f557d..e7a86e4822 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -465,7 +465,7 @@ async fn tenant_list_handler(request: Request) -> Result, A .iter() .map(|(id, state)| TenantInfo { id: *id, - state: *state, + state: state.clone(), current_physical_size: None, has_in_progress_downloads: Some(state.has_in_progress_downloads()), }) @@ -490,7 +490,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro let state = tenant.current_state(); Ok(TenantInfo { id: tenant_id, - state, + state: state.clone(), current_physical_size: Some(current_physical_size), has_in_progress_downloads: Some(state.has_in_progress_downloads()), }) @@ -931,7 +931,7 @@ async fn handle_tenant_break(r: Request) -> Result, ApiErro .await .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; - tenant.set_broken("broken from test"); + tenant.set_broken("broken from test".to_owned()); json_response(StatusCode::OK, ()) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1f31e5a8fb..dfb38387ea 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -6,7 +6,8 @@ use metrics::{ UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; -use pageserver_api::models::state; +use pageserver_api::models::TenantState; +use strum::VariantNames; use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) for operations in the critical @@ -147,15 +148,6 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); -// Metrics collected on tenant states. 
-const TENANT_STATE_OPTIONS: &[&str] = &[ - state::LOADING, - state::ATTACHING, - state::ACTIVE, - state::STOPPING, - state::BROKEN, -]; - pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", @@ -707,7 +699,7 @@ impl Drop for TimelineMetrics { pub fn remove_tenant_metrics(tenant_id: &TenantId) { let tid = tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); - for state in TENANT_STATE_OPTIONS { + for state in TenantState::VARIANTS { let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]); } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7e88a12963..d98aa5c566 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -622,7 +622,7 @@ impl Tenant { match tenant_clone.attach(ctx).await { Ok(_) => {} Err(e) => { - tenant_clone.set_broken(&e.to_string()); + tenant_clone.set_broken(e.to_string()); error!("error attaching tenant: {:?}", e); } } @@ -830,7 +830,10 @@ impl Tenant { pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc { let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); Arc::new(Tenant::new( - TenantState::Broken, + TenantState::Broken { + reason: "create_broken_tenant".into(), + backtrace: String::new(), + }, conf, TenantConfOpt::default(), wal_redo_manager, @@ -891,7 +894,7 @@ impl Tenant { match tenant_clone.load(&ctx).await { Ok(()) => {} Err(err) => { - tenant_clone.set_broken(&err.to_string()); + tenant_clone.set_broken(err.to_string()); error!("could not load tenant {tenant_id}: {err:?}"); } } @@ -1443,7 +1446,7 @@ impl Tenant { } pub fn current_state(&self) -> TenantState { - *self.state.borrow() + self.state.borrow().clone() } pub fn is_active(&self) -> bool { @@ -1454,15 +1457,15 @@ impl Tenant { fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> { let mut result = Ok(()); self.state.send_modify(|current_state| { - match *current_state 
{ + match &*current_state { TenantState::Active => { // activate() was called on an already Active tenant. Shouldn't happen. result = Err(anyhow::anyhow!("Tenant is already active")); } - TenantState::Broken => { + TenantState::Broken { reason, .. } => { // This shouldn't happen either result = Err(anyhow::anyhow!( - "Could not activate tenant because it is in broken state" + "Could not activate tenant because it is in broken state due to: {reason}", )); } TenantState::Stopping => { @@ -1496,7 +1499,10 @@ impl Tenant { timeline.timeline_id, e ); timeline.set_state(TimelineState::Broken); - *current_state = TenantState::Broken; + *current_state = TenantState::broken_from_reason(format!( + "failed to activate timeline {}: {}", + timeline.timeline_id, e + )); } } } @@ -1509,7 +1515,7 @@ impl Tenant { /// Change tenant status to Stopping, to mark that it is being shut down pub fn set_stopping(&self) { self.state.send_modify(|current_state| { - match *current_state { + match current_state { TenantState::Active | TenantState::Loading | TenantState::Attaching => { *current_state = TenantState::Stopping; @@ -1525,8 +1531,8 @@ impl Tenant { timeline.set_state(TimelineState::Stopping); } } - TenantState::Broken => { - info!("Cannot set tenant to Stopping state, it is already in Broken state"); + TenantState::Broken { reason, .. } => { + info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"); } TenantState::Stopping => { // The tenant was detached, or system shutdown was requested, while we were @@ -1537,7 +1543,7 @@ impl Tenant { }); } - pub fn set_broken(&self, reason: &str) { + pub fn set_broken(&self, reason: String) { self.state.send_modify(|current_state| { match *current_state { TenantState::Active => { @@ -1545,24 +1551,24 @@ impl Tenant { // while loading or attaching a tenant. A tenant that has already been // activated should never be marked as broken. We cope with it the best // we can, but it shouldn't happen. 
- *current_state = TenantState::Broken; warn!("Changing Active tenant to Broken state, reason: {}", reason); + *current_state = TenantState::broken_from_reason(reason); } - TenantState::Broken => { + TenantState::Broken { .. } => { // This shouldn't happen either warn!("Tenant is already in Broken state"); } TenantState::Stopping => { // This shouldn't happen either - *current_state = TenantState::Broken; warn!( "Marking Stopping tenant as Broken state, reason: {}", reason ); + *current_state = TenantState::broken_from_reason(reason); } TenantState::Loading | TenantState::Attaching => { info!("Setting tenant as Broken state, reason: {}", reason); - *current_state = TenantState::Broken; + *current_state = TenantState::broken_from_reason(reason); } } }); @@ -1575,7 +1581,7 @@ impl Tenant { pub async fn wait_to_become_active(&self) -> anyhow::Result<()> { let mut receiver = self.state.subscribe(); loop { - let current_state = *receiver.borrow_and_update(); + let current_state = receiver.borrow_and_update().clone(); match current_state { TenantState::Loading | TenantState::Attaching => { // in these states, there's a chance that we can reach ::Active @@ -1584,12 +1590,12 @@ impl Tenant { TenantState::Active { .. } => { return Ok(()); } - TenantState::Broken | TenantState::Stopping => { + TenantState::Broken { .. } | TenantState::Stopping => { // There's no chance the tenant can transition back into ::Active anyhow::bail!( "Tenant {} will not become active. 
Current state: {:?}", self.tenant_id, - current_state, + &current_state, ); } } @@ -1770,21 +1776,23 @@ impl Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - let current_state = *rx.borrow_and_update(); + let mut current_state: &'static str = From::from(&*rx.borrow_and_update()); let tid = tenant_id.to_string(); TENANT_STATE_METRIC - .with_label_values(&[&tid, current_state.as_str()]) + .with_label_values(&[&tid, current_state]) .inc(); loop { match rx.changed().await { Ok(()) => { - let new_state = *rx.borrow(); + let new_state: &'static str = From::from(&*rx.borrow_and_update()); TENANT_STATE_METRIC - .with_label_values(&[&tid, current_state.as_str()]) + .with_label_values(&[&tid, current_state]) .dec(); TENANT_STATE_METRIC - .with_label_values(&[&tid, new_state.as_str()]) + .with_label_values(&[&tid, new_state]) .inc(); + + current_state = new_state; } Err(_sender_dropped_error) => { info!("Tenant dropped the state updates sender, quitting waiting for tenant state change"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4971186206..754316b3cd 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -537,7 +537,7 @@ where Some(tenant) => match tenant.current_state() { TenantState::Attaching | TenantState::Loading - | TenantState::Broken + | TenantState::Broken { ..
} | TenantState::Active => tenant.set_stopping(), TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)), }, @@ -565,7 +565,7 @@ where let tenants_accessor = TENANTS.read().await; match tenants_accessor.get(&tenant_id) { Some(tenant) => { - tenant.set_broken(&e.to_string()); + tenant.set_broken(e.to_string()); } None => { warn!("Tenant {tenant_id} got removed from memory"); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 8aeacc12f5..7e7dbd3c5c 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -209,7 +209,7 @@ async fn wait_for_active_tenant( loop { match tenant_state_updates.changed().await { Ok(()) => { - let new_state = *tenant_state_updates.borrow(); + let new_state = &*tenant_state_updates.borrow(); match new_state { TenantState::Active => { debug!("Tenant state changed to active, continuing the task loop"); diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 65eda5b636..c060fc8dea 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,16 +1,20 @@ import time +from typing import Optional from fixtures.log_helper import log from fixtures.pageserver.http import PageserverHttpClient from fixtures.types import Lsn, TenantId, TimelineId -def assert_tenant_status( - pageserver_http: PageserverHttpClient, tenant: TenantId, expected_status: str +def assert_tenant_state( + pageserver_http: PageserverHttpClient, + tenant: TenantId, + expected_state: str, + message: Optional[str] = None, ): tenant_status = pageserver_http.tenant_status(tenant) log.info(f"tenant_status: {tenant_status}") - assert tenant_status["state"] == expected_status, tenant_status + assert tenant_status["state"]["slug"] == expected_state, message or tenant_status def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId): @@ -68,6 +72,7 @@ def wait_until_tenant_state( tenant_id: TenantId, 
expected_state: str, iterations: int, + period: float = 1.0, ) -> bool: """ Does not use `wait_until` for debugging purposes @@ -76,21 +81,28 @@ def wait_until_tenant_state( try: tenant = pageserver_http.tenant_status(tenant_id=tenant_id) log.debug(f"Tenant {tenant_id} data: {tenant}") - if tenant["state"] == expected_state: + if tenant["state"]["slug"] == expected_state: return True except Exception as e: log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") - time.sleep(1) + time.sleep(period) raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") def wait_until_tenant_active( - pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int = 30 + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + iterations: int = 30, + period: float = 1.0, ): wait_until_tenant_state( - pageserver_http, tenant_id, expected_state="Active", iterations=iterations + pageserver_http, + tenant_id, + expected_state="Active", + iterations=iterations, + period=period, ) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 90ab8e68d8..07410b64df 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( - assert_tenant_status, + assert_tenant_state, wait_for_last_record_lsn, wait_for_upload, wait_until_tenant_state, @@ -239,7 +239,7 @@ def test_ondemand_download_timetravel( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) # The current_physical_size reports the sum of layers loaded in the layer # map, regardless of where the layer files 
are located. So even though we @@ -392,7 +392,7 @@ def test_download_remote_layers_api( ] ) - wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) ###### Phase 1: exercise download error code path assert ( diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 453ddec0d4..77db729880 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -59,7 +59,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): client = env.pageserver.http_client() tenant_status = client.tenant_status(env.initial_tenant) log.info("Tenant status : %s", tenant_status) - assert tenant_status["state"] == "Loading" + assert tenant_status["state"]["slug"] == "Loading" # Try to read. This waits until the loading finishes, and then return normally. cur.execute("SELECT count(*) FROM foo") diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 67aba227e5..80d4b99504 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -7,7 +7,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, ) -from fixtures.pageserver.utils import assert_tenant_status, wait_for_upload +from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.types import Lsn from fixtures.utils import wait_until @@ -278,7 +278,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_tenant_status(http_client, tenant_id, "Active"), + func=lambda: assert_tenant_state(http_client, tenant_id, "Active"), ) env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"}) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py 
index 8ad4bd1c11..3569ab0c53 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -15,7 +15,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( - assert_tenant_status, + assert_tenant_state, tenant_exists, wait_for_last_record_lsn, wait_for_upload, @@ -416,11 +416,11 @@ def test_tenant_relocation( # wait for tenant to finish attaching tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) - assert tenant_status["state"] in ["Attaching", "Active"] + assert tenant_status["state"]["slug"] in ["Attaching", "Active"] wait_until( number_of_iterations=10, interval=1, - func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"), + func=lambda: assert_tenant_state(new_pageserver_http, tenant_id, "Active"), ) check_timeline_attached( diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 24b211e368..8c89100745 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,5 +1,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pageserver.utils import assert_tenant_state, wait_until_tenant_active from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until @@ -25,16 +26,16 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for t in timelines: client.timeline_delete(tenant, t) - def assert_active(tenant): - assert get_state(tenant) == "Active" - # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) - assert ( - get_state(tenant) == "Active" - ), "Pageserver should activate a tenant and start background jobs if timelines are loaded" + assert_tenant_state( + client, + tenant, + 
expected_state="Active", + message="Pageserver should activate a tenant and start background jobs if timelines are loaded", + ) # Stop compute pg.stop() @@ -47,7 +48,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): for tenant_info in client.tenant_list(): tenant_id = TenantId(tenant_info["id"]) delete_all_timelines(tenant_id) - wait_until(10, 0.2, lambda: assert_active(tenant_id)) + wait_until_tenant_active(client, tenant_id, iterations=10, period=0.2) # Assert that all tasks finish quickly after tenant is detached task_starts = client.get_metric_value("pageserver_tenant_task_events_total", {"event": "start"}) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 8021bf9914..2162520217 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -332,24 +332,24 @@ def test_pageserver_with_empty_tenants( [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)] assert ( - broken_tenant["state"] == "Broken" + broken_tenant["state"]["slug"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" broken_tenant_status = client.tenant_status(tenant_without_timelines_dir) assert ( - broken_tenant_status["state"] == "Broken" + broken_tenant_status["state"]["slug"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] assert ( - loaded_tenant["state"] == "Active" + loaded_tenant["state"]["slug"] == "Active" ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir) assert ( - loaded_tenant_status["state"] == "Active" + loaded_tenant_status["state"]["slug"] == "Active" ), f"Tenant 
{tenant_with_empty_timelines_dir} without timelines dir should be active" time.sleep(1) # to allow metrics propagation @@ -357,11 +357,11 @@ def test_pageserver_with_empty_tenants( ps_metrics = client.get_metrics() broken_tenants_metric_filter = { "tenant_id": str(tenant_without_timelines_dir), - "state": "broken", + "state": "Broken", } active_tenants_metric_filter = { "tenant_id": str(tenant_with_empty_timelines_dir), - "state": "active", + "state": "Active", } tenant_active_count = int( diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index ec1c12a0d8..514e2b6fa0 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -24,7 +24,7 @@ from fixtures.neon_fixtures import ( wait_for_sk_commit_lsn_to_reach_remote_storage, ) from fixtures.pageserver.utils import ( - assert_tenant_status, + assert_tenant_state, wait_for_last_record_lsn, wait_for_upload, ) @@ -202,7 +202,7 @@ def test_tenants_attached_after_download( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_tenant_status(client, tenant_id, "Active"), + func=lambda: assert_tenant_state(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -286,7 +286,7 @@ def test_tenant_redownloads_truncated_file_on_startup( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_tenant_status(client, tenant_id, "Active"), + func=lambda: assert_tenant_state(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 7c77e1fe59..28da3c5a48 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -23,7 +23,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from 
fixtures.pageserver.utils import ( - assert_tenant_status, + assert_tenant_state, wait_for_upload_queue_empty, wait_until_tenant_active, ) @@ -333,7 +333,7 @@ def test_timeline_physical_size_init( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"), + func=lambda: assert_tenant_state(client, env.initial_tenant, "Active"), ) assert_physical_size_invariants( diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 306c492e8f..c24c77bb95 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -587,7 +587,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re raise RuntimeError("Timed out waiting for WAL redo") tenant_status = ps_cli.tenant_status(tenant_id) - if tenant_status["state"] == "Loading": + if tenant_status["state"]["slug"] == "Loading": log.debug(f"Tenant {tenant_id} is still loading, retrying") else: pageserver_lsn = Lsn( From c237a2f5fb12c3940543f49dd4b9ba740744bf3b Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 12 Apr 2023 16:24:34 +0300 Subject: [PATCH 275/426] Compile `pg_hint_plan extension` --- Dockerfile.compute-node | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7c64951fa5..5a223ae432 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -337,6 +337,35 @@ RUN apt-get update && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control +######################################################################################### +# +# Layer "pg-hint-plan-pg-build" +# compile pg_hint_plan extension +# +######################################################################################### +FROM build-deps AS pg-hint-plan-pg-build +COPY --from=pg-build /usr/local/pgsql/ 
/usr/local/pgsql/ + +ARG PG_VERSION +ENV PATH "/usr/local/pgsql/bin:$PATH" + +RUN case "${PG_VERSION}" in \ + "v14") \ + export PG_HINT_PLAN_VERSION=14_1_4_1 \ + ;; \ + "v15") \ + export PG_HINT_PLAN_VERSION=15_1_5_0 \ + ;; \ + *) \ + echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control + ######################################################################################### # # Layer "rust extensions" @@ -443,6 +472,7 @@ COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 356439aa333595021ca00bb42840e91233e3d54d Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 13 Apr 2023 13:13:24 +0300 Subject: [PATCH 276/426] Add note about `manual_release_instructions` label (#4015) ## Describe your changes Do not forget to process required manual stuff after release ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Dmitry Rodionov --- .github/PULL_REQUEST_TEMPLATE/release-pr.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md index a848077e6a..1e18fd5d44 100644 --- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md +++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md @@ -10,6 +10,7 @@ ### Checklist after release +- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them). - [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files)) - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) From 53f438a8a879ed9b72642bd0ee37a4c45ce94927 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 13 Apr 2023 13:45:43 +0300 Subject: [PATCH 277/426] Rename "Postgres nodes" in control_plane to endpoints. We use the term "endpoint" for compute Postgres nodes in the web UI and user-facing documentation now. Adjust the nomenclature in the code. This changes the name of the "neon_local pg" command to "neon_local endpoint". Also adjust names of classes, variables etc. in the python tests accordingly. This also changes the directory structure so that endpoints are now stored in: .neon/endpoints/<endpoint_id> instead of: .neon/pgdatadirs/tenants/<tenant_id>/<node_name> The tenant ID is no longer part of the path. That means that you cannot have two endpoints with the same name/ID in two different tenants anymore. That's consistent with how we treat endpoints in the real control plane and proxy: the endpoint ID must be globally unique.
--- README.md | 22 +- control_plane/src/bin/neon_local.rs | 132 +++++++----- control_plane/src/{compute.rs => endpoint.rs} | 108 +++++----- control_plane/src/lib.rs | 2 +- control_plane/src/local_env.rs | 12 +- test_runner/fixtures/compare_fixtures.py | 2 +- test_runner/fixtures/neon_fixtures.py | 201 ++++++++++-------- .../performance/test_branch_creation.py | 18 +- test_runner/performance/test_branching.py | 32 +-- .../performance/test_bulk_tenant_create.py | 4 +- test_runner/performance/test_bulk_update.py | 12 +- test_runner/performance/test_compaction.py | 6 +- test_runner/performance/test_latency.py | 4 +- test_runner/performance/test_layer_map.py | 4 +- test_runner/performance/test_startup.py | 24 +-- test_runner/regress/test_ancestor_branch.py | 12 +- test_runner/regress/test_auth.py | 6 +- test_runner/regress/test_backpressure.py | 14 +- test_runner/regress/test_basebackup_error.py | 2 +- test_runner/regress/test_branch_and_gc.py | 12 +- test_runner/regress/test_branch_behind.py | 16 +- test_runner/regress/test_branching.py | 34 +-- test_runner/regress/test_broken_timeline.py | 12 +- test_runner/regress/test_clog_truncate.py | 14 +- test_runner/regress/test_close_fds.py | 4 +- test_runner/regress/test_compatibility.py | 59 +++-- test_runner/regress/test_compute_ctl.py | 13 +- test_runner/regress/test_config.py | 4 +- test_runner/regress/test_crafted_wal_end.py | 14 +- test_runner/regress/test_createdropdb.py | 30 +-- test_runner/regress/test_createuser.py | 8 +- .../regress/test_disk_usage_eviction.py | 10 +- test_runner/regress/test_fsm_truncate.py | 4 +- test_runner/regress/test_fullbackup.py | 4 +- test_runner/regress/test_gc_aggressive.py | 22 +- test_runner/regress/test_gc_cutoff.py | 4 +- test_runner/regress/test_import.py | 32 +-- test_runner/regress/test_large_schema.py | 16 +- test_runner/regress/test_layer_eviction.py | 22 +- .../regress/test_layer_writers_fail.py | 6 +- test_runner/regress/test_lsn_mapping.py | 14 +- 
test_runner/regress/test_metric_collection.py | 6 +- test_runner/regress/test_multixact.py | 12 +- test_runner/regress/test_neon_local_cli.py | 6 +- test_runner/regress/test_next_xid.py | 12 +- test_runner/regress/test_normal_work.py | 12 +- test_runner/regress/test_old_request_lsn.py | 4 +- test_runner/regress/test_ondemand_download.py | 70 +++--- test_runner/regress/test_pageserver_api.py | 4 +- .../regress/test_pageserver_catchup.py | 8 +- .../regress/test_pageserver_restart.py | 12 +- ...test_pageserver_restarts_under_workload.py | 9 +- test_runner/regress/test_parallel_copy.py | 16 +- test_runner/regress/test_pg_regress.py | 42 ++-- test_runner/regress/test_pitr_gc.py | 8 +- test_runner/regress/test_read_trace.py | 12 +- test_runner/regress/test_read_validation.py | 8 +- test_runner/regress/test_readonly_node.py | 44 ++-- test_runner/regress/test_recovery.py | 6 +- test_runner/regress/test_remote_storage.py | 56 ++--- test_runner/regress/test_subxacts.py | 6 +- test_runner/regress/test_tenant_conf.py | 6 +- test_runner/regress/test_tenant_detach.py | 88 ++++---- test_runner/regress/test_tenant_relocation.py | 50 ++--- test_runner/regress/test_tenant_size.py | 122 +++++------ test_runner/regress/test_tenant_tasks.py | 4 +- test_runner/regress/test_tenants.py | 30 +-- .../test_tenants_with_remote_storage.py | 48 ++--- test_runner/regress/test_timeline_size.py | 80 +++---- test_runner/regress/test_truncate.py | 4 +- test_runner/regress/test_twophase.py | 18 +- test_runner/regress/test_unlogged.py | 12 +- test_runner/regress/test_vm_bits.py | 10 +- test_runner/regress/test_wal_acceptor.py | 164 +++++++------- .../regress/test_wal_acceptor_async.py | 80 +++---- test_runner/regress/test_wal_restore.py | 6 +- .../test_walredo_not_left_behind_on_detach.py | 4 +- test_runner/test_broken.py | 2 +- 78 files changed, 1061 insertions(+), 991 deletions(-) rename control_plane/src/{compute.rs => endpoint.rs} (88%) diff --git a/README.md b/README.md index 
55df67f6c7..810937aff7 100644 --- a/README.md +++ b/README.md @@ -147,15 +147,15 @@ Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one # start postgres compute node -> ./target/debug/neon_local pg start main -Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ... +> ./target/debug/neon_local endpoint start main +Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ... Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 -Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' +Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' # check list of running postgres instances -> ./target/debug/neon_local pg list - NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS - main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running +> ./target/debug/neon_local endpoint list + ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS + main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` 2. Now, it is possible to connect to postgres and run some queries: @@ -184,14 +184,14 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] # start postgres on that branch -> ./target/debug/neon_local pg start migration_check --branch-name migration_check -Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... +> ./target/debug/neon_local endpoint start migration_check --branch-name migration_check +Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ... 
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 -Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' +Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' # check the new list of running postgres instances -> ./target/debug/neon_local pg list - NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS +> ./target/debug/neon_local endpoint list + ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index a9b66f479a..665cad8783 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -7,7 +7,7 @@ //! use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; -use control_plane::compute::ComputeControlPlane; +use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; @@ -106,8 +106,9 @@ fn main() -> Result<()> { "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), "pageserver" => handle_pageserver(sub_args, &env), - "pg" => handle_pg(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), + "endpoint" => handle_endpoint(sub_args, &env), + "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), _ => bail!("unexpected subcommand {sub_name}"), }; @@ -470,10 +471,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing timeline into pageserver ..."); pageserver.timeline_import(tenant_id, timeline_id, base, 
pg_wal, pg_version)?; - println!("Creating node for imported timeline ..."); env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; + println!("Creating endpoint for imported timeline ..."); + cplane.new_endpoint(tenant_id, name, timeline_id, None, None, pg_version)?; println!("Done"); } Some(("branch", branch_match)) => { @@ -521,10 +522,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - Ok(()) } -fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match pg_match.subcommand() { - Some(pg_subcommand_data) => pg_subcommand_data, - None => bail!("no pg subcommand provided"), +fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let (sub_name, sub_args) = match ep_match.subcommand() { + Some(ep_subcommand_data) => ep_subcommand_data, + None => bail!("no endpoint subcommand provided"), }; let mut cplane = ComputeControlPlane::load(env.clone())?; @@ -546,7 +547,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { table.load_preset(comfy_table::presets::NOTHING); table.set_header([ - "NODE", + "ENDPOINT", "ADDRESS", "TIMELINE", "BRANCH NAME", @@ -554,39 +555,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { "STATUS", ]); - for ((_, node_name), node) in cplane - .nodes + for (endpoint_id, endpoint) in cplane + .endpoints .iter() - .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id) + .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id) { - let lsn_str = match node.lsn { + let lsn_str = match endpoint.lsn { None => { - // -> primary node + // -> primary endpoint // Use the LSN at the end of the timeline. 
timeline_infos - .get(&node.timeline_id) + .get(&endpoint.timeline_id) .map(|bi| bi.last_record_lsn.to_string()) .unwrap_or_else(|| "?".to_string()) } Some(lsn) => { - // -> read-only node - // Use the node's LSN. + // -> read-only endpoint + // Use the endpoint's LSN. lsn.to_string() } }; let branch_name = timeline_name_mappings - .get(&TenantTimelineId::new(tenant_id, node.timeline_id)) + .get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id)) .map(|name| name.as_str()) .unwrap_or("?"); table.add_row([ - node_name.as_str(), - &node.address.to_string(), - &node.timeline_id.to_string(), + endpoint_id.as_str(), + &endpoint.address.to_string(), + &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), - node.status(), + endpoint.status(), ]); } @@ -597,10 +598,10 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .get_one::("branch-name") .map(|s| s.as_str()) .unwrap_or(DEFAULT_BRANCH_NAME); - let node_name = sub_args - .get_one::("node") - .map(|node_name| node_name.to_string()) - .unwrap_or_else(|| format!("{branch_name}_node")); + let endpoint_id = sub_args + .get_one::("endpoint_id") + .map(String::to_string) + .unwrap_or_else(|| format!("ep-{branch_name}")); let lsn = sub_args .get_one::("lsn") @@ -618,15 +619,15 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .copied() .context("Failed to parse postgres version from the argument string")?; - cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?; + cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, lsn, port, pg_version)?; } "start" => { let port: Option = sub_args.get_one::("port").copied(); - let node_name = sub_args - .get_one::("node") - .ok_or_else(|| anyhow!("No node name was provided to start"))?; + let endpoint_id = sub_args + .get_one::("endpoint_id") + .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; - let node = cplane.nodes.get(&(tenant_id, node_name.to_string())); + let 
endpoint = cplane.endpoints.get(endpoint_id.as_str()); let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(tenant_id), Scope::Tenant); @@ -636,9 +637,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { None }; - if let Some(node) = node { - println!("Starting existing postgres {node_name}..."); - node.start(&auth_token)?; + if let Some(endpoint) = endpoint { + println!("Starting existing endpoint {endpoint_id}..."); + endpoint.start(&auth_token)?; } else { let branch_name = sub_args .get_one::("branch-name") @@ -663,27 +664,33 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { // start --port X // stop // start <-- will also use port X even without explicit port argument - println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ..."); + println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ..."); - let node = - cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?; - node.start(&auth_token)?; + let ep = cplane.new_endpoint( + tenant_id, + endpoint_id, + timeline_id, + lsn, + port, + pg_version, + )?; + ep.start(&auth_token)?; } } "stop" => { - let node_name = sub_args - .get_one::("node") - .ok_or_else(|| anyhow!("No node name was provided to stop"))?; + let endpoint_id = sub_args + .get_one::("endpoint_id") + .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?; let destroy = sub_args.get_flag("destroy"); - let node = cplane - .nodes - .get(&(tenant_id, node_name.to_string())) - .with_context(|| format!("postgres {node_name} is not found"))?; - node.stop(destroy)?; + let endpoint = cplane + .endpoints + .get(endpoint_id.as_str()) + .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; + endpoint.stop(destroy)?; } - _ => bail!("Unexpected pg subcommand '{sub_name}'"), + _ => bail!("Unexpected endpoint subcommand 
'{sub_name}'"), } Ok(()) @@ -802,7 +809,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { - // Postgres nodes are not started automatically + // Endpoints are not started automatically broker::start_broker_process(env)?; @@ -836,10 +843,10 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { let pageserver = PageServerNode::from_env(env); - // Stop all compute nodes + // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { - for (_k, node) in cplane.nodes { + for (_k, node) in cplane.endpoints { if let Err(e) = node.stop(false) { eprintln!("postgres stop failed: {e:#}"); } @@ -872,7 +879,9 @@ fn cli() -> Command { .help("Name of the branch to be created or used as an alias for other services") .required(false); - let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); + let endpoint_id_arg = Arg::new("endpoint_id") + .help("Postgres endpoint id") + .required(false); let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); @@ -1026,27 +1035,27 @@ fn cli() -> Command { ) ) .subcommand( - Command::new("pg") + Command::new("endpoint") .arg_required_else_help(true) .about("Manage postgres instances") .subcommand(Command::new("list").arg(tenant_id_arg.clone())) .subcommand(Command::new("create") - .about("Create a postgres compute node") - .arg(pg_node_arg.clone()) + .about("Create a compute endpoint") + .arg(endpoint_id_arg.clone()) .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) .arg(port_arg.clone()) .arg( Arg::new("config-only") - .help("Don't do basebackup, create compute node with only config files") + .help("Don't do basebackup, create endpoint directory with only config files") .long("config-only") .required(false)) 
.arg(pg_version_arg.clone()) ) .subcommand(Command::new("start") - .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") - .arg(pg_node_arg.clone()) + .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") + .arg(endpoint_id_arg.clone()) .arg(tenant_id_arg.clone()) .arg(branch_name_arg) .arg(timeline_id_arg) @@ -1056,7 +1065,7 @@ fn cli() -> Command { ) .subcommand( Command::new("stop") - .arg(pg_node_arg) + .arg(endpoint_id_arg) .arg(tenant_id_arg) .arg( Arg::new("destroy") @@ -1068,6 +1077,13 @@ fn cli() -> Command { ) ) + // Obsolete old name for 'endpoint'. We now just print an error if it's used. + .subcommand( + Command::new("pg") + .hide(true) + .arg(Arg::new("ignore-rest").allow_hyphen_values(true).num_args(0..).required(false)) + .trailing_var_arg(true) + ) .subcommand( Command::new("start") .about("Start page server and safekeepers") diff --git a/control_plane/src/compute.rs b/control_plane/src/endpoint.rs similarity index 88% rename from control_plane/src/compute.rs rename to control_plane/src/endpoint.rs index bc81107706..9e85138e68 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/endpoint.rs @@ -25,54 +25,45 @@ use crate::postgresql_conf::PostgresConf; // pub struct ComputeControlPlane { base_port: u16, - pageserver: Arc, - pub nodes: BTreeMap<(TenantId, String), Arc>, + + // endpoint ID is the key + pub endpoints: BTreeMap>, + env: LocalEnv, + pageserver: Arc, } impl ComputeControlPlane { - // Load current nodes with ports from data directories on disk - // Directory structure has the following layout: - // pgdatadirs - // |- tenants - // | |- - // | | |- + // Load current endpoints from the endpoints/ subdirectories pub fn load(env: LocalEnv) -> Result { let pageserver = Arc::new(PageServerNode::from_env(&env)); - let mut nodes = BTreeMap::default(); - let pgdatadirspath = &env.pg_data_dirs_path(); - - for tenant_dir in 
fs::read_dir(pgdatadirspath) - .with_context(|| format!("failed to list {}", pgdatadirspath.display()))? + let mut endpoints = BTreeMap::default(); + for endpoint_dir in fs::read_dir(env.endpoints_path()) + .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? { - let tenant_dir = tenant_dir?; - for timeline_dir in fs::read_dir(tenant_dir.path()) - .with_context(|| format!("failed to list {}", tenant_dir.path().display()))? - { - let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?; - nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node)); - } + let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?; + endpoints.insert(ep.name.clone(), Arc::new(ep)); } Ok(ComputeControlPlane { base_port: 55431, - pageserver, - nodes, + endpoints, env, + pageserver, }) } fn get_port(&mut self) -> u16 { 1 + self - .nodes + .endpoints .values() - .map(|node| node.address.port()) + .map(|ep| ep.address.port()) .max() .unwrap_or(self.base_port) } - pub fn new_node( + pub fn new_endpoint( &mut self, tenant_id: TenantId, name: &str, @@ -80,9 +71,9 @@ impl ComputeControlPlane { lsn: Option, port: Option, pg_version: u32, - ) -> Result> { + ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); - let node = Arc::new(PostgresNode { + let ep = Arc::new(Endpoint { name: name.to_owned(), address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), env: self.env.clone(), @@ -93,39 +84,45 @@ impl ComputeControlPlane { pg_version, }); - node.create_pgdata()?; - node.setup_pg_conf()?; + ep.create_pgdata()?; + ep.setup_pg_conf()?; - self.nodes - .insert((tenant_id, node.name.clone()), Arc::clone(&node)); + self.endpoints.insert(ep.name.clone(), Arc::clone(&ep)); - Ok(node) + Ok(ep) } } /////////////////////////////////////////////////////////////////////////////// #[derive(Debug)] -pub struct PostgresNode { - pub address: SocketAddr, +pub struct Endpoint { + /// used as the directory name name: String, + pub 
tenant_id: TenantId, + pub timeline_id: TimelineId, + // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary. + pub lsn: Option, + + // port and address of the Postgres server + pub address: SocketAddr, + pg_version: u32, + + // These are not part of the endpoint as such, but the environment + // the endpoint runs in. pub env: LocalEnv, pageserver: Arc, - pub timeline_id: TimelineId, - pub lsn: Option, // if it's a read-only node. None for primary - pub tenant_id: TenantId, - pg_version: u32, } -impl PostgresNode { +impl Endpoint { fn from_dir_entry( entry: std::fs::DirEntry, env: &LocalEnv, pageserver: &Arc, - ) -> Result { + ) -> Result { if !entry.file_type()?.is_dir() { anyhow::bail!( - "PostgresNode::from_dir_entry failed: '{}' is not a directory", + "Endpoint::from_dir_entry failed: '{}' is not a directory", entry.path().display() ); } @@ -135,7 +132,7 @@ impl PostgresNode { let name = fname.to_str().unwrap().to_string(); // Read config file into memory - let cfg_path = entry.path().join("postgresql.conf"); + let cfg_path = entry.path().join("pgdata").join("postgresql.conf"); let cfg_path_str = cfg_path.to_string_lossy(); let mut conf_file = File::open(&cfg_path) .with_context(|| format!("failed to open config file in {}", cfg_path_str))?; @@ -161,7 +158,7 @@ impl PostgresNode { conf.parse_field_optional("recovery_target_lsn", &context)?; // ok now - Ok(PostgresNode { + Ok(Endpoint { address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), name, env: env.clone(), @@ -269,7 +266,7 @@ impl PostgresNode { } // Write postgresql.conf with default configuration - // and PG_VERSION file to the data directory of a new node. + // and PG_VERSION file to the data directory of a new endpoint. fn setup_pg_conf(&self) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); @@ -289,7 +286,7 @@ impl PostgresNode { // walproposer panics when basebackup is invalid, it is pointless to restart in this case. 
conf.append("restart_after_crash", "off"); - // Configure the node to fetch pages from pageserver + // Configure the Neon Postgres extension to fetch pages from pageserver let pageserver_connstr = { let config = &self.pageserver.pg_connection_config; let (host, port) = (config.host(), config.port()); @@ -325,7 +322,7 @@ impl PostgresNode { conf.append("max_replication_flush_lag", "10GB"); if !self.env.safekeepers.is_empty() { - // Configure the node to connect to the safekeepers + // Configure Postgres to connect to the safekeepers conf.append("synchronous_standby_names", "walproposer"); let safekeepers = self @@ -380,8 +377,12 @@ impl PostgresNode { Ok(()) } + pub fn endpoint_path(&self) -> PathBuf { + self.env.endpoints_path().join(&self.name) + } + pub fn pgdata(&self) -> PathBuf { - self.env.pg_data_dir(&self.tenant_id, &self.name) + self.endpoint_path().join("pgdata") } pub fn status(&self) -> &str { @@ -443,12 +444,11 @@ impl PostgresNode { } pub fn start(&self, auth_token: &Option) -> Result<()> { - // Bail if the node already running. if self.status() == "running" { - anyhow::bail!("The node is already running"); + anyhow::bail!("The endpoint is already running"); } - // 1. We always start compute node from scratch, so + // 1. We always start Postgres from scratch, so // if old dir exists, preserve 'postgresql.conf' and drop the directory let postgresql_conf_path = self.pgdata().join("postgresql.conf"); let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| { @@ -470,8 +470,8 @@ impl PostgresNode { File::create(self.pgdata().join("standby.signal"))?; } - // 4. Finally start the compute node postgres - println!("Starting postgres node at '{}'", self.connstr()); + // 4. Finally start postgres + println!("Starting postgres at '{}'", self.connstr()); self.pg_ctl(&["start"], auth_token) } @@ -480,7 +480,7 @@ impl PostgresNode { // use immediate shutdown mode, otherwise, // shutdown gracefully to leave the data directory sane. 
// - // Compute node always starts from scratch, so stop + // Postgres is always started from scratch, so stop // without destroy only used for testing and debugging. // if destroy { @@ -489,7 +489,7 @@ impl PostgresNode { "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); - fs::remove_dir_all(self.pgdata())?; + fs::remove_dir_all(self.endpoint_path())?; } else { self.pg_ctl(&["stop"], &None)?; } diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 6829479ad5..a773b8dcc3 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -9,7 +9,7 @@ mod background_process; pub mod broker; -pub mod compute; +pub mod endpoint; pub mod local_env; pub mod pageserver; pub mod postgresql_conf; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 8cc6329ce6..2b1eec7c4b 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -200,14 +200,8 @@ impl LocalEnv { self.neon_distrib_dir.join("storage_broker") } - pub fn pg_data_dirs_path(&self) -> PathBuf { - self.base_data_dir.join("pgdatadirs").join("tenants") - } - - pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf { - self.pg_data_dirs_path() - .join(tenant_id.to_string()) - .join(branch_name) + pub fn endpoints_path(&self) -> PathBuf { + self.base_data_dir.join("endpoints") } // TODO: move pageserver files into ./pageserver @@ -427,7 +421,7 @@ impl LocalEnv { } } - fs::create_dir_all(self.pg_data_dirs_path())?; + fs::create_dir_all(self.endpoints_path())?; for safekeeper in &self.safekeepers { fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index b328cea5c6..f0d9ce4af2 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -114,7 +114,7 @@ class NeonCompare(PgCompare): self.timeline = 
self.env.neon_cli.create_timeline(branch_name, tenant_id=self.tenant) # Start pg - self._pg = self.env.postgres.create_start(branch_name, "main", self.tenant) + self._pg = self.env.endpoints.create_start(branch_name, "main", self.tenant) @property def pg(self) -> PgProtocol: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b6f2e5c96..e9f0363843 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -830,7 +830,7 @@ class NeonEnvBuilder: # Stop all the nodes. if self.env: log.info("Cleaning up all storage and compute nodes") - self.env.postgres.stop_all() + self.env.endpoints.stop_all() for sk in self.env.safekeepers: sk.stop(immediate=True) self.env.pageserver.stop(immediate=True) @@ -894,7 +894,7 @@ class NeonEnv: self.port_distributor = config.port_distributor self.s3_mock_server = config.mock_s3_server self.neon_cli = NeonCli(env=self) - self.postgres = PostgresFactory(self) + self.endpoints = EndpointFactory(self) self.safekeepers: List[Safekeeper] = [] self.broker = config.broker self.remote_storage = config.remote_storage @@ -902,6 +902,7 @@ class NeonEnv: self.pg_version = config.pg_version self.neon_binpath = config.neon_binpath self.pg_distrib_dir = config.pg_distrib_dir + self.endpoint_counter = 0 # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
@@ -1015,6 +1016,13 @@ class NeonEnv: priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text() return AuthKeys(pub=pub, priv=priv) + def generate_endpoint_id(self) -> str: + """ + Generate a unique endpoint ID + """ + self.endpoint_counter += 1 + return "ep-" + str(self.endpoint_counter) + @pytest.fixture(scope=shareable_scope) def _shared_simple_env( @@ -1073,7 +1081,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: """ yield _shared_simple_env - _shared_simple_env.postgres.stop_all() + _shared_simple_env.endpoints.stop_all() @pytest.fixture(scope="function") @@ -1097,7 +1105,7 @@ def neon_env_builder( neon_env_builder.init_start(). After the initialization, you can launch compute nodes by calling - the functions in the 'env.postgres' factory object, stop/start the + the functions in the 'env.endpoints' factory object, stop/start the nodes, etc. """ @@ -1438,16 +1446,16 @@ class NeonCli(AbstractNeonCli): args.extend(["-m", "immediate"]) return self.raw_cli(args) - def pg_create( + def endpoint_create( self, branch_name: str, - node_name: Optional[str] = None, + endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ - "pg", + "endpoint", "create", "--tenant-id", str(tenant_id or self.env.initial_tenant), @@ -1460,22 +1468,22 @@ class NeonCli(AbstractNeonCli): args.extend(["--lsn", str(lsn)]) if port is not None: args.extend(["--port", str(port)]) - if node_name is not None: - args.append(node_name) + if endpoint_id is not None: + args.append(endpoint_id) res = self.raw_cli(args) res.check_returncode() return res - def pg_start( + def endpoint_start( self, - node_name: str, + endpoint_id: str, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ - "pg", + "endpoint", "start", "--tenant-id", str(tenant_id or 
self.env.initial_tenant), @@ -1486,30 +1494,30 @@ class NeonCli(AbstractNeonCli): args.append(f"--lsn={lsn}") if port is not None: args.append(f"--port={port}") - if node_name is not None: - args.append(node_name) + if endpoint_id is not None: + args.append(endpoint_id) res = self.raw_cli(args) res.check_returncode() return res - def pg_stop( + def endpoint_stop( self, - node_name: str, + endpoint_id: str, tenant_id: Optional[TenantId] = None, destroy=False, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": args = [ - "pg", + "endpoint", "stop", "--tenant-id", str(tenant_id or self.env.initial_tenant), ] if destroy: args.append("--destroy") - if node_name is not None: - args.append(node_name) + if endpoint_id is not None: + args.append(endpoint_id) return self.raw_cli(args, check_return_code=check_return_code) @@ -2167,8 +2175,8 @@ def static_proxy( yield proxy -class Postgres(PgProtocol): - """An object representing a running postgres daemon.""" +class Endpoint(PgProtocol): + """An object representing a Postgres compute endpoint managed by the control plane.""" def __init__( self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True @@ -2176,33 +2184,40 @@ class Postgres(PgProtocol): super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env self.running = False - self.node_name: Optional[str] = None # dubious, see asserts below + self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id self.port = port self.check_stop_result = check_stop_result - # path to conf is /pgdatadirs/tenants///postgresql.conf + # path to conf is /endpoints//pgdata/postgresql.conf def create( self, branch_name: str, - node_name: Optional[str] = None, + endpoint_id: Optional[str] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, - ) -> "Postgres": + ) -> "Endpoint": """ - Create 
the pg data directory. + Create a new Postgres endpoint. Returns self. """ if not config_lines: config_lines = [] - self.node_name = node_name or f"{branch_name}_pg_node" - self.env.neon_cli.pg_create( - branch_name, node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port + if endpoint_id is None: + endpoint_id = self.env.generate_endpoint_id() + self.endpoint_id = endpoint_id + + self.env.neon_cli.endpoint_create( + branch_name, + endpoint_id=self.endpoint_id, + tenant_id=self.tenant_id, + lsn=lsn, + port=self.port, ) - path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name + path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: @@ -2215,26 +2230,30 @@ class Postgres(PgProtocol): return self - def start(self) -> "Postgres": + def start(self) -> "Endpoint": """ Start the Postgres instance. Returns self. """ - assert self.node_name is not None + assert self.endpoint_id is not None - log.info(f"Starting postgres node {self.node_name}") + log.info(f"Starting postgres endpoint {self.endpoint_id}") - self.env.neon_cli.pg_start(self.node_name, tenant_id=self.tenant_id, port=self.port) + self.env.neon_cli.endpoint_start(self.endpoint_id, tenant_id=self.tenant_id, port=self.port) self.running = True return self + def endpoint_path(self) -> Path: + """Path to endpoint directory""" + assert self.endpoint_id + path = Path("endpoints") / self.endpoint_id + return self.env.repo_dir / path + def pg_data_dir_path(self) -> str: - """Path to data directory""" - assert self.node_name - path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name - return os.path.join(self.env.repo_dir, path) + """Path to Postgres data directory""" + return os.path.join(self.endpoint_path(), "pgdata") def pg_xact_dir_path(self) -> str: """Path to pg_xact dir""" @@ -2248,7 +2267,7 @@ class Postgres(PgProtocol): """Path to postgresql.conf""" return 
os.path.join(self.pg_data_dir_path(), "postgresql.conf") - def adjust_for_safekeepers(self, safekeepers: str) -> "Postgres": + def adjust_for_safekeepers(self, safekeepers: str) -> "Endpoint": """ Adjust instance config for working with wal acceptors instead of pageserver (pre-configured by CLI) directly. @@ -2272,7 +2291,7 @@ class Postgres(PgProtocol): f.write("neon.safekeepers = '{}'\n".format(safekeepers)) return self - def config(self, lines: List[str]) -> "Postgres": + def config(self, lines: List[str]) -> "Endpoint": """ Add lines to postgresql.conf. Lines should be an array of valid postgresql.conf rows. @@ -2286,32 +2305,32 @@ class Postgres(PgProtocol): return self - def stop(self) -> "Postgres": + def stop(self) -> "Endpoint": """ Stop the Postgres instance if it's running. Returns self. """ if self.running: - assert self.node_name is not None - self.env.neon_cli.pg_stop( - self.node_name, self.tenant_id, check_return_code=self.check_stop_result + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, self.tenant_id, check_return_code=self.check_stop_result ) self.running = False return self - def stop_and_destroy(self) -> "Postgres": + def stop_and_destroy(self) -> "Endpoint": """ - Stop the Postgres instance, then destroy it. + Stop the Postgres instance, then destroy the endpoint. Returns self. 
""" - assert self.node_name is not None - self.env.neon_cli.pg_stop( - self.node_name, self.tenant_id, True, check_return_code=self.check_stop_result + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, self.tenant_id, True, check_return_code=self.check_stop_result ) - self.node_name = None + self.endpoint_id = None self.running = False return self @@ -2319,13 +2338,12 @@ class Postgres(PgProtocol): def create_start( self, branch_name: str, - node_name: Optional[str] = None, + endpoint_id: Optional[str] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, - ) -> "Postgres": + ) -> "Endpoint": """ - Create a Postgres instance, apply config - and then start it. + Create an endpoint, apply config, and start Postgres. Returns self. """ @@ -2333,7 +2351,7 @@ class Postgres(PgProtocol): self.create( branch_name=branch_name, - node_name=node_name, + endpoint_id=endpoint_id, config_lines=config_lines, lsn=lsn, ).start() @@ -2342,7 +2360,7 @@ class Postgres(PgProtocol): return self - def __enter__(self) -> "Postgres": + def __enter__(self) -> "Endpoint": return self def __exit__( @@ -2354,33 +2372,33 @@ class Postgres(PgProtocol): self.stop() -class PostgresFactory: - """An object representing multiple running postgres daemons.""" +class EndpointFactory: + """An object representing multiple compute endpoints.""" def __init__(self, env: NeonEnv): self.env = env self.num_instances: int = 0 - self.instances: List[Postgres] = [] + self.endpoints: List[Endpoint] = [] def create_start( self, branch_name: str, - node_name: Optional[str] = None, + endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, - ) -> Postgres: - pg = Postgres( + ) -> Endpoint: + ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, port=self.env.port_distributor.get_port(), ) self.num_instances += 1 - self.instances.append(pg) + 
self.endpoints.append(ep) - return pg.create_start( + return ep.create_start( branch_name=branch_name, - node_name=node_name, + endpoint_id=endpoint_id, config_lines=config_lines, lsn=lsn, ) @@ -2388,30 +2406,33 @@ class PostgresFactory: def create( self, branch_name: str, - node_name: Optional[str] = None, + endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, - ) -> Postgres: - pg = Postgres( + ) -> Endpoint: + ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, port=self.env.port_distributor.get_port(), ) - self.num_instances += 1 - self.instances.append(pg) + if endpoint_id is None: + endpoint_id = self.env.generate_endpoint_id() - return pg.create( + self.num_instances += 1 + self.endpoints.append(ep) + + return ep.create( branch_name=branch_name, - node_name=node_name, + endpoint_id=endpoint_id, lsn=lsn, config_lines=config_lines, ) - def stop_all(self) -> "PostgresFactory": - for pg in self.instances: - pg.stop() + def stop_all(self) -> "EndpointFactory": + for ep in self.endpoints: + ep.stop() return self @@ -2786,16 +2807,16 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: def check_restored_datadir_content( test_output_dir: Path, env: NeonEnv, - pg: Postgres, + endpoint: Endpoint, ): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) + timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) # stop postgres to ensure that files won't change - pg.stop() + endpoint.stop() # Take a basebackup from pageserver - restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" + restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) @@ -2805,7 +2826,7 @@ def check_restored_datadir_content( {psql_path} \ --no-psqlrc \ postgres://localhost:{env.pageserver.service_port.pg} \ - -c 'basebackup {pg.tenant_id} {timeline}' \ + -c 'basebackup {endpoint.tenant_id} {timeline}' \ | tar -x -C {restored_dir_path} """ @@ -2822,8 +2843,8 @@ def check_restored_datadir_content( assert result.returncode == 0 # list files we're going to compare - assert pg.pgdata_dir - pgdata_files = list_files_to_compare(Path(pg.pgdata_dir)) + assert endpoint.pgdata_dir + pgdata_files = list_files_to_compare(Path(endpoint.pgdata_dir)) restored_files = list_files_to_compare(restored_dir_path) # check that file sets are equal @@ -2834,12 +2855,12 @@ def check_restored_datadir_content( # We've already filtered all mismatching files in list_files_to_compare(), # so here expect that the content is identical (match, mismatch, error) = filecmp.cmpfiles( - pg.pgdata_dir, restored_dir_path, pgdata_files, shallow=False + endpoint.pgdata_dir, restored_dir_path, pgdata_files, shallow=False ) log.info(f"filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}") for f in mismatch: - f1 = os.path.join(pg.pgdata_dir, f) + f1 = os.path.join(endpoint.pgdata_dir, f) f2 = os.path.join(restored_dir_path, f) stdout_filename = "{}.filediff".format(f2) @@ -2854,24 +2875,24 @@ def check_restored_datadir_content( def wait_for_last_flush_lsn( - env: NeonEnv, pg: Postgres, tenant: 
TenantId, timeline: TimelineId + env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" - last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) def wait_for_wal_insert_lsn( - env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId + env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" - last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) def fork_at_current_lsn( env: NeonEnv, - pg: Postgres, + endpoint: Endpoint, new_branch_name: str, ancestor_branch_name: str, tenant_id: Optional[TenantId] = None, @@ -2881,7 +2902,7 @@ def fork_at_current_lsn( The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the the WAL up to that LSN to arrive in the pageserver before creating the branch. 
""" - current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0] + current_lsn = endpoint.safe_psql("SELECT pg_current_wal_lsn()")[0][0] return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 16c5438b8f..6edcb8f1f2 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -52,13 +52,13 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) def run_pgbench(branch: str): log.info(f"Start a pgbench workload on branch {branch}") - pg = env.postgres.create_start(branch, tenant_id=tenant) - connstr = pg.connstr() + endpoint = env.endpoints.create_start(branch, tenant_id=tenant) + connstr = endpoint.connstr() pg_bin.run_capture(["pgbench", "-i", connstr]) pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr]) - pg.stop() + endpoint.stop() env.neon_cli.create_branch("b0", tenant_id=tenant) @@ -96,8 +96,8 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): env.neon_cli.create_branch("b0") - pg = env.postgres.create_start("b0") - neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", pg.connstr()]) + endpoint = env.endpoints.create_start("b0") + neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) branch_creation_durations = [] @@ -124,15 +124,15 @@ def test_branch_creation_many_relations(neon_compare: NeonCompare): timeline_id = env.neon_cli.create_branch("root") - pg = env.postgres.create_start("root") - with closing(pg.connect()) as conn: + endpoint = env.endpoints.create_start("root") + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: for i in range(10000): cur.execute(f"CREATE TABLE t{i} as SELECT g FROM generate_series(1, 1000) g") # Wait for the pageserver to finish processing all the pending WALs, # as we don't want the LSN wait time to be 
included during the branch creation - flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn( env.pageserver.http_client(), env.initial_tenant, timeline_id, flush_lsn ) @@ -142,7 +142,7 @@ def test_branch_creation_many_relations(neon_compare: NeonCompare): # run a concurrent insertion to make the ancestor "busy" during the branch creation thread = threading.Thread( - target=pg.safe_psql, args=("INSERT INTO t0 VALUES (generate_series(1, 100000))",) + target=endpoint.safe_psql, args=("INSERT INTO t0 VALUES (generate_series(1, 100000))",) ) thread.start() diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py index 4eaec40096..667d1a4c4a 100644 --- a/test_runner/performance/test_branching.py +++ b/test_runner/performance/test_branching.py @@ -42,41 +42,41 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare): neon_compare.zenbenchmark.record_pg_bench_result(branch, res) env.neon_cli.create_branch("root") - pg_root = env.postgres.create_start("root") - pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"]) + endpoint_root = env.endpoints.create_start("root") + pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"]) - fork_at_current_lsn(env, pg_root, "child", "root") + fork_at_current_lsn(env, endpoint_root, "child", "root") - pg_child = env.postgres.create_start("child") + endpoint_child = env.endpoints.create_start("child") - run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()]) - run_pgbench_on_branch("child", ["pgbench", "-c10", "-T10", pg_child.connstr()]) + run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", endpoint_root.connstr()]) + run_pgbench_on_branch("child", ["pgbench", "-c10", "-T10", endpoint_child.connstr()]) def test_compare_child_and_root_write_perf(neon_compare: NeonCompare): env = neon_compare.env 
env.neon_cli.create_branch("root") - pg_root = env.postgres.create_start("root") + endpoint_root = env.endpoints.create_start("root") - pg_root.safe_psql( + endpoint_root.safe_psql( "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')", ) env.neon_cli.create_branch("child", "root") - pg_child = env.postgres.create_start("child") + endpoint_child = env.endpoints.create_start("child") with neon_compare.record_duration("root_run_duration"): - pg_root.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)") + endpoint_root.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)") with neon_compare.record_duration("child_run_duration"): - pg_child.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)") + endpoint_child.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)") def test_compare_child_and_root_read_perf(neon_compare: NeonCompare): env = neon_compare.env env.neon_cli.create_branch("root") - pg_root = env.postgres.create_start("root") + endpoint_root = env.endpoints.create_start("root") - pg_root.safe_psql_many( + endpoint_root.safe_psql_many( [ "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')", "INSERT INTO foo SELECT FROM generate_series(1,1000000)", @@ -84,12 +84,12 @@ def test_compare_child_and_root_read_perf(neon_compare: NeonCompare): ) env.neon_cli.create_branch("child", "root") - pg_child = env.postgres.create_start("child") + endpoint_child = env.endpoints.create_start("child") with neon_compare.record_duration("root_run_duration"): - pg_root.safe_psql("SELECT count(*) from foo") + endpoint_root.safe_psql("SELECT count(*) from foo") with neon_compare.record_duration("child_run_duration"): - pg_child.safe_psql("SELECT count(*) from foo") + endpoint_child.safe_psql("SELECT count(*) from foo") # ----------------------------------------------------------------------- diff --git 
a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index cef7ce0c6b..9b05903cfa 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -35,14 +35,14 @@ def test_bulk_tenant_create( # if use_safekeepers == 'with_sa': # wa_factory.start_n_new(3) - pg_tenant = env.postgres.create_start( + endpoint_tenant = env.endpoints.create_start( f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant ) end = timeit.default_timer() time_slices.append(end - start) - pg_tenant.stop() + endpoint_tenant.stop() zenbenchmark.record( "tenant_creation_time", diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py index 7aa6f09a40..2ace31a2d7 100644 --- a/test_runner/performance/test_bulk_update.py +++ b/test_runner/performance/test_bulk_update.py @@ -18,8 +18,8 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor) timeline_id = env.neon_cli.create_branch("test_bulk_update") tenant_id = env.initial_tenant - pg = env.postgres.create_start("test_bulk_update") - cur = pg.connect().cursor() + endpoint = env.endpoints.create_start("test_bulk_update") + cur = endpoint.connect().cursor() cur.execute("set statement_timeout=0") cur.execute(f"create table t(x integer) WITH (fillfactor={fillfactor})") @@ -28,13 +28,13 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor) cur.execute(f"insert into t values (generate_series(1,{n_records}))") cur.execute("vacuum t") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) with zenbenchmark.record_duration("update-no-prefetch"): cur.execute("update t set x=x+1") cur.execute("vacuum t") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) with 
zenbenchmark.record_duration("delete-no-prefetch"): cur.execute("delete from t") @@ -50,13 +50,13 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor) cur.execute(f"insert into t2 values (generate_series(1,{n_records}))") cur.execute("vacuum t2") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) with zenbenchmark.record_duration("update-with-prefetch"): cur.execute("update t2 set x=x+1") cur.execute("vacuum t2") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) with zenbenchmark.record_duration("delete-with-prefetch"): cur.execute("delete from t2") diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 89818ee8bd..326c4f5c6f 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -33,11 +33,11 @@ def test_compaction(neon_compare: NeonCompare): # Create some tables, and run a bunch of INSERTs and UPDATes on them, # to generate WAL and layers - pg = env.postgres.create_start( + endpoint = env.endpoints.create_start( "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"] ) - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: for i in range(100): cur.execute(f"create table tbl{i} (i int, j int);") @@ -45,7 +45,7 @@ def test_compaction(neon_compare: NeonCompare): for j in range(100): cur.execute(f"update tbl{i} set j = {j};") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) # First compaction generates L1 layers with neon_compare.zenbenchmark.record_duration("compaction"): diff --git a/test_runner/performance/test_latency.py b/test_runner/performance/test_latency.py index 257e0421af..6c94ecc482 100644 --- a/test_runner/performance/test_latency.py +++ 
b/test_runner/performance/test_latency.py @@ -2,13 +2,13 @@ import threading import pytest from fixtures.compare_fixtures import PgCompare -from fixtures.neon_fixtures import Postgres +from fixtures.neon_fixtures import PgProtocol from performance.test_perf_pgbench import get_scales_matrix from performance.test_wal_backpressure import record_read_latency -def start_write_workload(pg: Postgres, scale: int = 10): +def start_write_workload(pg: PgProtocol, scale: int = 10): with pg.connect().cursor() as cur: cur.execute(f"create table big as select generate_series(1,{scale*100_000})") diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index fb29c05273..18308e1077 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -25,8 +25,8 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): ) env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) - pg = env.postgres.create_start("test_layer_map", tenant_id=tenant) - cur = pg.connect().cursor() + endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant) + cur = endpoint.connect().cursor() cur.execute("create table t(x integer)") for i in range(n_iters): cur.execute(f"insert into t values (generate_series(1,{n_records}))") diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index e91b180154..fa2e058491 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -14,19 +14,19 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker # Start env.neon_cli.create_branch("test_startup") with zenbenchmark.record_duration("startup_time"): - pg = env.postgres.create_start("test_startup") - pg.safe_psql("select 1;") + endpoint = env.endpoints.create_start("test_startup") + endpoint.safe_psql("select 1;") # Restart - pg.stop_and_destroy() + endpoint.stop_and_destroy() with 
zenbenchmark.record_duration("restart_time"): - pg.create_start("test_startup") - pg.safe_psql("select 1;") + endpoint.create_start("test_startup") + endpoint.safe_psql("select 1;") # Fill up num_rows = 1000000 # 30 MB num_tables = 100 - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: for i in range(num_tables): cur.execute(f"create table t_{i} (i integer);") @@ -34,18 +34,18 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker # Read with zenbenchmark.record_duration("read_time"): - pg.safe_psql("select * from t_0;") + endpoint.safe_psql("select * from t_0;") # Read again with zenbenchmark.record_duration("second_read_time"): - pg.safe_psql("select * from t_0;") + endpoint.safe_psql("select * from t_0;") # Restart - pg.stop_and_destroy() + endpoint.stop_and_destroy() with zenbenchmark.record_duration("restart_with_data"): - pg.create_start("test_startup") - pg.safe_psql("select 1;") + endpoint.create_start("test_startup") + endpoint.safe_psql("select 1;") # Read with zenbenchmark.record_duration("read_after_restart"): - pg.safe_psql("select * from t_0;") + endpoint.safe_psql("select * from t_0;") diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 2406102756..e8c1a2f34c 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -22,8 +22,8 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): pageserver_http.configure_failpoints(("flush-frozen-before-sync", "sleep(10000)")) - pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) - branch0_cur = pg_branch0.connect().cursor() + endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant) + branch0_cur = endpoint_branch0.connect().cursor() branch0_timeline = TimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) log.info(f"b0 timeline {branch0_timeline}") @@ -44,10 +44,10 @@ def 
test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch1. env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) - pg_branch1 = env.postgres.create_start("branch1", tenant_id=tenant) + endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant) log.info("postgres is running on 'branch1' branch") - branch1_cur = pg_branch1.connect().cursor() + branch1_cur = endpoint_branch1.connect().cursor() branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) log.info(f"b1 timeline {branch1_timeline}") @@ -67,9 +67,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch2. env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) - pg_branch2 = env.postgres.create_start("branch2", tenant_id=tenant) + endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant) log.info("postgres is running on 'branch2' branch") - branch2_cur = pg_branch2.connect().cursor() + branch2_cur = endpoint_branch2.connect().cursor() branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) log.info(f"b2 timeline {branch2_timeline}") diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index f7c4736e04..3305869dce 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -64,9 +64,9 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): branch = "test_compute_auth_to_pageserver" env.neon_cli.create_branch(branch) - pg = env.postgres.create_start(branch) + endpoint = env.endpoints.create_start(branch) - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -83,7 +83,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): branch = 
f"test_auth_failures_auth_enabled_{auth_enabled}" timeline_id = env.neon_cli.create_branch(branch) - env.postgres.create_start(branch) + env.endpoints.create_start(branch) tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant) invalid_tenant_token = env.auth_keys.generate_tenant_token(TenantId.generate()) diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index a81fa380a9..352e149171 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -5,7 +5,7 @@ from contextlib import closing, contextmanager import psycopg2.extras import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, Postgres +from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder pytest_plugins = "fixtures.neon_fixtures" @@ -20,10 +20,10 @@ def pg_cur(pg): # Periodically check that all backpressure lags are below the configured threshold, # assert if they are not. # If the check query fails, stop the thread. Main thread should notice that and stop the test. -def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interval=5): +def check_backpressure(endpoint: Endpoint, stop_event: threading.Event, polling_interval=5): log.info("checks started") - with pg_cur(pg) as cur: + with pg_cur(endpoint) as cur: cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures? 
cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))") @@ -41,7 +41,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv max_replication_apply_lag_bytes = res[0] log.info(f"max_replication_apply_lag: {max_replication_apply_lag_bytes} bytes") - with pg_cur(pg) as cur: + with pg_cur(endpoint) as cur: while not stop_event.is_set(): try: cur.execute( @@ -102,14 +102,14 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): # Create a branch for us env.neon_cli.create_branch("test_backpressure") - pg = env.postgres.create_start( + endpoint = env.endpoints.create_start( "test_backpressure", config_lines=["max_replication_write_lag=30MB"] ) log.info("postgres is running on 'test_backpressure' branch") # setup check thread check_stop_event = threading.Event() - check_thread = threading.Thread(target=check_backpressure, args=(pg, check_stop_event)) + check_thread = threading.Thread(target=check_backpressure, args=(endpoint, check_stop_event)) check_thread.start() # Configure failpoint to slow down walreceiver ingest @@ -125,7 +125,7 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): # because of the lag and waiting for lsn to replay to arrive. 
time.sleep(2) - with pg_cur(pg) as cur: + with pg_cur(endpoint) as cur: # Create and initialize test table cur.execute("CREATE TABLE foo(x bigint)") diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 94d3999d17..170b494884 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -15,4 +15,4 @@ def test_basebackup_error(neon_simple_env: NeonEnv): pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): - env.postgres.create_start("test_basebackup_error") + env.endpoints.create_start("test_basebackup_error") diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index cc807b7ff3..4a03421fcf 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -67,9 +67,9 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): ) timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant) - pg_main = env.postgres.create_start("test_main", tenant_id=tenant) + endpoint_main = env.endpoints.create_start("test_main", tenant_id=tenant) - main_cur = pg_main.connect().cursor() + main_cur = endpoint_main.connect().cursor() main_cur.execute( "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" @@ -90,9 +90,9 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): env.neon_cli.create_branch( "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 ) - pg_branch = env.postgres.create_start("test_branch", tenant_id=tenant) + endpoint_branch = env.endpoints.create_start("test_branch", tenant_id=tenant) - branch_cur = pg_branch.connect().cursor() + branch_cur = endpoint_branch.connect().cursor() branch_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") assert query_scalar(branch_cur, "SELECT 
count(*) FROM foo") == 200000 @@ -142,8 +142,8 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): ) b0 = env.neon_cli.create_branch("b0", tenant_id=tenant) - pg0 = env.postgres.create_start("b0", tenant_id=tenant) - res = pg0.safe_psql_many( + endpoint0 = env.endpoints.create_start("b0", tenant_id=tenant) + res = endpoint0.safe_psql_many( queries=[ "CREATE TABLE t(key serial primary key)", "INSERT INTO t SELECT FROM generate_series(1, 100000)", diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index d19f6a7d39..3f7d49ab03 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -18,10 +18,10 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Branch at the point where only 100 rows were inserted env.neon_cli.create_branch("test_branch_behind") - pgmain = env.postgres.create_start("test_branch_behind") + endpoint_main = env.endpoints.create_start("test_branch_behind") log.info("postgres is running on 'test_branch_behind' branch") - main_cur = pgmain.connect().cursor() + main_cur = endpoint_main.connect().cursor() timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) @@ -74,15 +74,15 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): "test_branch_behind_more", "test_branch_behind", ancestor_start_lsn=lsn_b ) - pg_hundred = env.postgres.create_start("test_branch_behind_hundred") - pg_more = env.postgres.create_start("test_branch_behind_more") + endpoint_hundred = env.endpoints.create_start("test_branch_behind_hundred") + endpoint_more = env.endpoints.create_start("test_branch_behind_more") # On the 'hundred' branch, we should see only 100 rows - hundred_cur = pg_hundred.connect().cursor() + hundred_cur = endpoint_hundred.connect().cursor() assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 # On the 'more' branch, we should see 100200 rows - more_cur = pg_more.connect().cursor() + more_cur = 
endpoint_more.connect().cursor() assert query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100 # All the rows are visible on the main branch @@ -94,8 +94,8 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch( "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn=Lsn("0/3000000") ) - pg = env.postgres.create_start("test_branch_segment_boundary") - assert pg.safe_psql("SELECT 1")[0][0] == 1 + endpoint = env.endpoints.create_start("test_branch_segment_boundary") + assert endpoint.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn: .*"): diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 3b78700e9f..31f9df6ebe 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -5,7 +5,7 @@ from typing import List import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres +from fixtures.neon_fixtures import Endpoint, NeonEnv, PgBin from fixtures.types import Lsn from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix @@ -40,20 +40,20 @@ def test_branching_with_pgbench( } ) - def run_pgbench(pg: Postgres): - connstr = pg.connstr() - + def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", "-T15", connstr]) env.neon_cli.create_branch("b0", tenant_id=tenant) - pgs: List[Postgres] = [] - pgs.append(env.postgres.create_start("b0", tenant_id=tenant)) + endpoints: List[Endpoint] = [] + endpoints.append(env.endpoints.create_start("b0", tenant_id=tenant)) threads: List[threading.Thread] = [] - threads.append(threading.Thread(target=run_pgbench, args=(pgs[0],), daemon=True)) + threads.append( + threading.Thread(target=run_pgbench, args=(endpoints[0].connstr(),), 
daemon=True) + ) threads[-1].start() thread_limit = 4 @@ -79,16 +79,18 @@ def test_branching_with_pgbench( else: env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) - pgs.append(env.postgres.create_start("b{}".format(i + 1), tenant_id=tenant)) + endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant)) - threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1],), daemon=True)) + threads.append( + threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True) + ) threads[-1].start() for thread in threads: thread.join() - for pg in pgs: - res = pg.safe_psql("SELECT count(*) from pgbench_accounts") + for ep in endpoints: + res = ep.safe_psql("SELECT count(*) from pgbench_accounts") assert res[0] == (100000 * scale,) @@ -110,11 +112,11 @@ def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBi env = neon_simple_env env.neon_cli.create_branch("b0") - pg0 = env.postgres.create_start("b0") + endpoint0 = env.endpoints.create_start("b0") - pg_bin.run_capture(["pgbench", "-i", pg0.connstr()]) + pg_bin.run_capture(["pgbench", "-i", endpoint0.connstr()]) - with pg0.cursor() as cur: + with endpoint0.cursor() as cur: curr_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # Specify the `start_lsn` as a number that is divided by `XLOG_BLCKSZ` @@ -123,6 +125,6 @@ def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBi log.info(f"Branching b1 from b0 starting at lsn {start_lsn}...") env.neon_cli.create_branch("b1", "b0", ancestor_start_lsn=start_lsn) - pg1 = env.postgres.create_start("b1") + endpoint1 = env.endpoints.create_start("b1") - pg_bin.run_capture(["pgbench", "-i", pg1.connstr()]) + pg_bin.run_capture(["pgbench", "-i", endpoint1.connstr()]) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index d12a0223a1..fb592bfbc3 100644 --- a/test_runner/regress/test_broken_timeline.py +++ 
b/test_runner/regress/test_broken_timeline.py @@ -4,7 +4,7 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder from fixtures.types import TenantId, TimelineId @@ -24,17 +24,17 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): ] ) - tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = [] + tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = [] for n in range(4): tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id) - with pg.cursor() as cur: + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") - pg.stop() - tenant_timelines.append((tenant_id, timeline_id, pg)) + endpoint.stop() + tenant_timelines.append((tenant_id, timeline_id, endpoint)) # Stop the pageserver env.pageserver.stop() diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index f47e4a99bf..f22eca02cc 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -24,14 +24,14 @@ def test_clog_truncate(neon_simple_env: NeonEnv): "autovacuum_freeze_max_age=100000", ] - pg = env.postgres.create_start("test_clog_truncate", config_lines=config) + endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config) log.info("postgres is running on test_clog_truncate branch") # Install extension containing function needed for test - pg.safe_psql("CREATE EXTENSION neon_test_utils") + endpoint.safe_psql("CREATE EXTENSION neon_test_utils") # Consume many xids to advance clog - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("select 
test_consume_xids(1000*1000*10);") log.info("xids consumed") @@ -44,7 +44,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): # wait for autovacuum to truncate the pg_xact # XXX Is it worth to add a timeout here? - pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), "0000") + pg_xact_0000_path = os.path.join(endpoint.pg_xact_dir_path(), "0000") log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") while os.path.isfile(pg_xact_0000_path): @@ -52,7 +52,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): time.sleep(5) # checkpoint to advance latest lsn - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("CHECKPOINT;") lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()") @@ -61,10 +61,10 @@ def test_clog_truncate(neon_simple_env: NeonEnv): env.neon_cli.create_branch( "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation ) - pg2 = env.postgres.create_start("test_clog_truncate_new") + endpoint2 = env.endpoints.create_start("test_clog_truncate_new") log.info("postgres is running on test_clog_truncate_new branch") # check that new node doesn't contain truncated segment - pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), "0000") + pg_xact_0000_path_new = os.path.join(endpoint2.pg_xact_dir_path(), "0000") log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}") assert os.path.isfile(pg_xact_0000_path_new) is False diff --git a/test_runner/regress/test_close_fds.py b/test_runner/regress/test_close_fds.py index 22f245f79b..7059f3360e 100644 --- a/test_runner/regress/test_close_fds.py +++ b/test_runner/regress/test_close_fds.py @@ -24,8 +24,8 @@ def test_lsof_pageserver_pid(neon_simple_env: NeonEnv): def start_workload(): env.neon_cli.create_branch("test_lsof_pageserver_pid") - pg = env.postgres.create_start("test_lsof_pageserver_pid") - with closing(pg.connect()) as conn: + endpoint = env.endpoints.create_start("test_lsof_pageserver_pid") + with closing(endpoint.connect()) 
as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo as SELECT x FROM generate_series(1,100000) x") cur.execute("update foo set x=x+1") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 0cc111bd8c..e262202a73 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,3 +1,4 @@ +import copy import os import shutil import subprocess @@ -55,29 +56,31 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o neon_env_builder.preserve_database_files = True env = neon_env_builder.init_start() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") # FIXME: Is this expected? env.pageserver.allowed_errors.append( ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" ) - pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) - pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) - pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) + pg_bin.run(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) + pg_bin.run(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) + pg_bin.run( + ["pg_dumpall", f"--dbname={endpoint.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] + ) snapshot_config = toml.load(test_output_dir / "repo" / "config") tenant_id = snapshot_config["default_tenant_id"] timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] pageserver_http = env.pageserver.http_client() - lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) - env.postgres.stop_all() + 
env.endpoints.stop_all() for sk in env.safekeepers: sk.stop() env.pageserver.stop() @@ -98,6 +101,9 @@ def test_backward_compatibility( pg_version: str, request: FixtureRequest, ): + """ + Test that the new binaries can read old data + """ compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") assert ( compatibility_snapshot_dir_env is not None @@ -120,6 +126,7 @@ def test_backward_compatibility( check_neon_works( test_output_dir / "compatibility_snapshot" / "repo", neon_binpath, + neon_binpath, pg_distrib_dir, pg_version, port_distributor, @@ -148,7 +155,11 @@ def test_forward_compatibility( port_distributor: PortDistributor, pg_version: str, request: FixtureRequest, + neon_binpath: Path, ): + """ + Test that the old binaries can read new data + """ compatibility_neon_bin_env = os.environ.get("COMPATIBILITY_NEON_BIN") assert compatibility_neon_bin_env is not None, ( "COMPATIBILITY_NEON_BIN is not set. It should be set to a path with Neon binaries " @@ -183,6 +194,7 @@ def test_forward_compatibility( check_neon_works( test_output_dir / "compatibility_snapshot" / "repo", compatibility_neon_bin, + neon_binpath, compatibility_postgres_distrib_dir, pg_version, port_distributor, @@ -223,9 +235,13 @@ def prepare_snapshot( for logfile in repo_dir.glob("**/*.log"): logfile.unlink() - # Remove tenants data for compute - for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"): - shutil.rmtree(tenant) + # Remove old computes in 'endpoints'. Old versions of the control plane used a directory + # called "pgdatadirs". Delete it, too. + if (repo_dir / "endpoints").exists(): + shutil.rmtree(repo_dir / "endpoints") + if (repo_dir / "pgdatadirs").exists(): + shutil.rmtree(repo_dir / "pgdatadirs") + os.mkdir(repo_dir / "endpoints") # Remove wal-redo temp directory if it exists. Newer pageserver versions don't create # them anymore, but old versions did. 
@@ -326,7 +342,8 @@ def get_neon_version(neon_binpath: Path): def check_neon_works( repo_dir: Path, - neon_binpath: Path, + neon_target_binpath: Path, + neon_current_binpath: Path, pg_distrib_dir: Path, pg_version: str, port_distributor: PortDistributor, @@ -336,7 +353,7 @@ def check_neon_works( ): snapshot_config_toml = repo_dir / "config" snapshot_config = toml.load(snapshot_config_toml) - snapshot_config["neon_distrib_dir"] = str(neon_binpath) + snapshot_config["neon_distrib_dir"] = str(neon_target_binpath) snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir) with (snapshot_config_toml).open("w") as f: toml.dump(snapshot_config, f) @@ -347,17 +364,25 @@ def check_neon_works( config.repo_dir = repo_dir config.pg_version = pg_version config.initial_tenant = snapshot_config["default_tenant_id"] - config.neon_binpath = neon_binpath config.pg_distrib_dir = pg_distrib_dir config.preserve_database_files = True - cli = NeonCli(config) - cli.raw_cli(["start"]) - request.addfinalizer(lambda: cli.raw_cli(["stop"])) + # Use the "target" binaries to launch the storage nodes + config_target = config + config_target.neon_binpath = neon_target_binpath + cli_target = NeonCli(config_target) + + # And the current binaries to launch computes + config_current = copy.copy(config) + config_current.neon_binpath = neon_current_binpath + cli_current = NeonCli(config_current) + + cli_target.raw_cli(["start"]) + request.addfinalizer(lambda: cli_target.raw_cli(["stop"])) pg_port = port_distributor.get_port() - cli.pg_start("main", port=pg_port) - request.addfinalizer(lambda: cli.pg_stop("main")) + cli_current.endpoint_start("main", port=pg_port) + request.addfinalizer(lambda: cli_current.endpoint_stop("main")) connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]) diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index 
05ac3841dc..aa99a01c83 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -13,10 +13,10 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ctl = ComputeCtl(env) env.neon_cli.create_branch("test_compute_ctl", "main") - pg = env.postgres.create_start("test_compute_ctl") - pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + endpoint = env.endpoints.create_start("test_compute_ctl") + endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") - with open(pg.config_file_path(), "r") as f: + with open(endpoint.config_file_path(), "r") as f: cfg_lines = f.readlines() cfg_map = {} for line in cfg_lines: @@ -24,10 +24,13 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): k, v = line.split("=") cfg_map[k] = v.strip("\n '\"") log.info(f"postgres config: {cfg_map}") - pgdata = pg.pg_data_dir_path() + pgdata = endpoint.pg_data_dir_path() pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres") - pg.stop_and_destroy() + endpoint.stop_and_destroy() + + # stop_and_destroy removes the whole endpoint directory. Recreate it. 
+ Path(pgdata).mkdir(parents=True) spec = ( """ diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 3477d96b89..0ea5784b67 100755 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -12,10 +12,10 @@ def test_config(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_config", "empty") # change config - pg = env.postgres.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) log.info("postgres is running on test_config branch") - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute( """ diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 9899d424d1..7ec901af34 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -21,11 +21,11 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") - pg = env.postgres.create("test_crafted_wal_end") + endpoint = env.endpoints.create("test_crafted_wal_end") wal_craft = WalCraft(env) - pg.config(wal_craft.postgres_config()) - pg.start() - res = pg.safe_psql_many( + endpoint.config(wal_craft.postgres_config()) + endpoint.start() + res = endpoint.safe_psql_many( queries=[ "CREATE TABLE keys(key int primary key)", "INSERT INTO keys SELECT generate_series(1, 100)", @@ -34,7 +34,7 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): ) assert res[-1][0] == (5050,) - wal_craft.in_existing(wal_type, pg.connstr()) + wal_craft.in_existing(wal_type, endpoint.connstr()) log.info("Restarting all safekeepers and pageservers") env.pageserver.stop() @@ -43,7 +43,7 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): 
env.pageserver.start() log.info("Trying more queries") - res = pg.safe_psql_many( + res = endpoint.safe_psql_many( queries=[ "SELECT SUM(key) FROM keys", "INSERT INTO keys SELECT generate_series(101, 200)", @@ -60,7 +60,7 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env.pageserver.start() log.info("Trying more queries (again)") - res = pg.safe_psql_many( + res = endpoint.safe_psql_many( queries=[ "SELECT SUM(key) FROM keys", "INSERT INTO keys SELECT generate_series(201, 300)", diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index 036e50e6e8..68035b1b14 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -13,10 +13,10 @@ def test_createdb(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_createdb", "empty") - pg = env.postgres.create_start("test_createdb") + endpoint = env.endpoints.create_start("test_createdb") log.info("postgres is running on 'test_createdb' branch") - with pg.cursor() as cur: + with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch cur.execute("VACUUM FULL pg_class") @@ -26,10 +26,10 @@ def test_createdb(neon_simple_env: NeonEnv): # Create a branch env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn) - pg2 = env.postgres.create_start("test_createdb2") + endpoint2 = env.endpoints.create_start("test_createdb2") # Test that you can connect to the new database on both branches - for db in (pg, pg2): + for db in (endpoint, endpoint2): with db.cursor(dbname="foodb") as cur: # Check database size in both branches cur.execute( @@ -55,17 +55,17 @@ def test_createdb(neon_simple_env: NeonEnv): def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env env.neon_cli.create_branch("test_dropdb", "empty") - pg = env.postgres.create_start("test_dropdb") + endpoint = env.endpoints.create_start("test_dropdb") 
log.info("postgres is running on 'test_dropdb' branch") - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") lsn_before_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") dboid = query_scalar(cur, "SELECT oid FROM pg_database WHERE datname='foodb';") - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("DROP DATABASE foodb") cur.execute("CHECKPOINT") @@ -76,29 +76,29 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch( "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop ) - pg_before = env.postgres.create_start("test_before_dropdb") + endpoint_before = env.endpoints.create_start("test_before_dropdb") env.neon_cli.create_branch( "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop ) - pg_after = env.postgres.create_start("test_after_dropdb") + endpoint_after = env.endpoints.create_start("test_after_dropdb") # Test that database exists on the branch before drop - pg_before.connect(dbname="foodb").close() + endpoint_before.connect(dbname="foodb").close() # Test that database subdir exists on the branch before drop - assert pg_before.pgdata_dir - dbpath = pathlib.Path(pg_before.pgdata_dir) / "base" / str(dboid) + assert endpoint_before.pgdata_dir + dbpath = pathlib.Path(endpoint_before.pgdata_dir) / "base" / str(dboid) log.info(dbpath) assert os.path.isdir(dbpath) is True # Test that database subdir doesn't exist on the branch after drop - assert pg_after.pgdata_dir - dbpath = pathlib.Path(pg_after.pgdata_dir) / "base" / str(dboid) + assert endpoint_after.pgdata_dir + dbpath = pathlib.Path(endpoint_after.pgdata_dir) / "base" / str(dboid) log.info(dbpath) assert os.path.isdir(dbpath) is False # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) + check_restored_datadir_content(test_output_dir, env, endpoint) diff --git 
a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index c5f8246f5b..f1bc405287 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -9,10 +9,10 @@ from fixtures.utils import query_scalar def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_createuser", "empty") - pg = env.postgres.create_start("test_createuser") + endpoint = env.endpoints.create_start("test_createuser") log.info("postgres is running on 'test_createuser' branch") - with pg.cursor() as cur: + with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch cur.execute("CREATE USER testuser with password %s", ("testpwd",)) @@ -22,7 +22,7 @@ def test_createuser(neon_simple_env: NeonEnv): # Create a branch env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn) - pg2 = env.postgres.create_start("test_createuser2") + endpoint2 = env.endpoints.create_start("test_createuser2") # Test that you can connect to new branch as a new user - assert pg2.safe_psql("select current_user", user="testuser") == [("testuser",)] + assert endpoint2.safe_psql("select current_user", user="testuser") == [("testuser",)] diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 413d6c9d5a..31c7ef2b17 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -91,8 +91,8 @@ class EvictionEnv: This assumes that the tenant is still at the state after pbench -i. 
""" lsn = self.pgbench_init_lsns[tenant_id] - with self.neon_env.postgres.create_start("main", tenant_id=tenant_id, lsn=lsn) as pg: - self.pg_bin.run(["pgbench", "-S", pg.connstr()]) + with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint: + self.pg_bin.run(["pgbench", "-S", endpoint.connstr()]) def pageserver_start_with_disk_usage_eviction( self, period, max_usage_pct, min_avail_bytes, mock_behavior @@ -168,9 +168,9 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev } ) - with env.postgres.create_start("main", tenant_id=tenant_id) as pg: - pg_bin.run(["pgbench", "-i", f"-s{scale}", pg.connstr()]) - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) timelines.append((tenant_id, timeline_id)) diff --git a/test_runner/regress/test_fsm_truncate.py b/test_runner/regress/test_fsm_truncate.py index 4551ff97e0..80e4da8380 100644 --- a/test_runner/regress/test_fsm_truncate.py +++ b/test_runner/regress/test_fsm_truncate.py @@ -4,7 +4,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_fsm_truncate") - pg = env.postgres.create_start("test_fsm_truncate") - pg.safe_psql( + endpoint = env.endpoints.create_start("test_fsm_truncate") + endpoint.safe_psql( "CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;" ) diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index b3d58edf6b..ece9dccf93 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -24,10 +24,10 @@ def test_fullbackup( env = neon_env_builder.init_start() 
env.neon_cli.create_branch("test_fullbackup") - pgmain = env.postgres.create_start("test_fullbackup") + endpoint_main = env.endpoints.create_start("test_fullbackup") log.info("postgres is running on 'test_fullbackup' branch") - with pgmain.cursor() as cur: + with endpoint_main.cursor() as cur: timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # data loading may take a while, so increase statement timeout diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 702d94c691..d38be057d3 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -5,9 +5,9 @@ import random import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, - Postgres, RemoteStorageKind, wait_for_last_flush_lsn, ) @@ -26,9 +26,9 @@ updates_performed = 0 # Run random UPDATEs on test table -async def update_table(pg: Postgres): +async def update_table(endpoint: Endpoint): global updates_performed - pg_conn = await pg.connect_async() + pg_conn = await endpoint.connect_async() while updates_performed < updates_to_perform: updates_performed += 1 @@ -52,10 +52,10 @@ async def gc(env: NeonEnv, timeline: TimelineId): # At the same time, run UPDATEs and GC -async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId): +async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId): workers = [] for worker_id in range(num_connections): - workers.append(asyncio.create_task(update_table(pg))) + workers.append(asyncio.create_task(update_table(endpoint))) workers.append(asyncio.create_task(gc(env, timeline))) # await all workers @@ -72,10 +72,10 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() env.neon_cli.create_branch("test_gc_aggressive", "main") - pg = 
env.postgres.create_start("test_gc_aggressive") + endpoint = env.endpoints.create_start("test_gc_aggressive") log.info("postgres is running on test_gc_aggressive branch") - with pg.cursor() as cur: + with endpoint.cursor() as cur: timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) # Create table, and insert the first 100 rows @@ -89,7 +89,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): ) cur.execute("CREATE INDEX ON foo(id)") - asyncio.run(update_and_gc(env, pg, timeline)) + asyncio.run(update_and_gc(env, endpoint, timeline)) cur.execute("SELECT COUNT(*), SUM(counter) FROM foo") r = cur.fetchone() @@ -110,11 +110,11 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: env = neon_env_builder.init_start() env.neon_cli.create_branch("test_gc_index_upload", "main") - pg = env.postgres.create_start("test_gc_index_upload") + endpoint = env.endpoints.create_start("test_gc_index_upload") pageserver_http = env.pageserver.http_client() - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id")) @@ -146,7 +146,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: return int(total) # Sanity check that the metric works - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) before = get_num_remote_ops("index", "upload") diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 1b98a414da..79453c1bdc 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -31,8 +31,8 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): "image_creation_threshold": "2", } ) - pg = env.postgres.create_start("main", tenant_id=tenant_id) - connstr = 
pg.connstr(options="-csynchronous_commit=off") + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + connstr = endpoint.connstr(options="-csynchronous_commit=off") pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 774ed98563..137ce457bc 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -9,10 +9,10 @@ from pathlib import Path import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, PgBin, - Postgres, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId @@ -72,7 +72,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] - node_name = "import_from_vanilla" + endpoint_id = "ep-import_from_vanilla" tenant = TenantId.generate() timeline = TimelineId.generate() @@ -113,7 +113,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build "--timeline-id", str(timeline), "--node-name", - node_name, + endpoint_id, "--base-lsn", start_lsn, "--base-tarfile", @@ -153,8 +153,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) # Check it worked - pg = env.postgres.create_start(node_name, tenant_id=tenant) - assert pg.safe_psql("select count(*) from t") == [(300000,)] + endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) + assert endpoint.safe_psql("select count(*) from t") == [(300000,)] @pytest.mark.timeout(600) @@ -168,10 +168,10 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu ) timeline = 
env.neon_cli.create_branch("test_import_from_pageserver_small") - pg = env.postgres.create_start("test_import_from_pageserver_small") + endpoint = env.endpoints.create_start("test_import_from_pageserver_small") num_rows = 3000 - lsn = _generate_data(num_rows, pg) + lsn = _generate_data(num_rows, endpoint) _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) @@ -185,14 +185,14 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne env = neon_env_builder.init_start() timeline = env.neon_cli.create_branch("test_import_from_pageserver_multisegment") - pg = env.postgres.create_start("test_import_from_pageserver_multisegment") + endpoint = env.endpoints.create_start("test_import_from_pageserver_multisegment") # For `test_import_from_pageserver_multisegment`, we want to make sure that the data # is large enough to create multi-segment files. Typically, a segment file's size is # at most 1GB. A large number of inserted rows (`30000000`) is used to increase the # DB size to above 1GB. Related: https://github.com/neondatabase/neon/issues/2097. num_rows = 30000000 - lsn = _generate_data(num_rows, pg) + lsn = _generate_data(num_rows, endpoint) logical_size = env.pageserver.http_client().timeline_detail(env.initial_tenant, timeline)[ "current_logical_size" @@ -213,12 +213,12 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne assert cnt_seg_files > 0 -def _generate_data(num_rows: int, pg: Postgres) -> Lsn: +def _generate_data(num_rows: int, endpoint: Endpoint) -> Lsn: """Generate a table with `num_rows` rows. 
Returns: the latest insert WAL's LSN""" - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") @@ -263,7 +263,7 @@ def _import( tar_output_file = result_basepath + ".stdout" # Stop the first pageserver instance, erase all its data - env.postgres.stop_all() + env.endpoints.stop_all() env.pageserver.stop() dir_to_clear = Path(env.repo_dir) / "tenants" @@ -278,7 +278,7 @@ def _import( tenant = TenantId.generate() # Import to pageserver - node_name = "import_from_pageserver" + endpoint_id = "ep-import_from_pageserver" client = env.pageserver.http_client() client.tenant_create(tenant) env.neon_cli.raw_cli( @@ -290,7 +290,7 @@ def _import( "--timeline-id", str(timeline), "--node-name", - node_name, + endpoint_id, "--base-lsn", str(lsn), "--base-tarfile", @@ -305,8 +305,8 @@ def _import( wait_for_upload(client, tenant, timeline, lsn) # Check it worked - pg = env.postgres.create_start(node_name, tenant_id=tenant) - assert pg.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] + endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) + assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup query = f"fullbackup { tenant} {timeline} {lsn}" diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index f14265f6fd..ac83131ba2 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -15,9 +15,9 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_large_schema(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() tables = 2 # 10 is too much for debug build @@ -27,18 +27,18 @@ def 
test_large_schema(neon_env_builder: NeonEnvBuilder): # Restart compute. Restart is actually not strictly needed. # It is done mostly because this test originally tries to model the problem reported by Ketteq. - pg.stop() + endpoint.stop() # Kill and restart the pageserver. # env.pageserver.stop(immediate=True) # env.pageserver.start() - pg.start() + endpoint.start() retry_sleep = 0.5 max_retries = 200 retries = 0 while True: try: - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() cur.execute(f"CREATE TABLE if not exists t_{i}(pk integer) partition by range (pk)") for j in range(1, partitions + 1): @@ -63,7 +63,7 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): raise break - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() for i in range(1, tables + 1): @@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") # Check layer file sizes - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] + timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id) for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 2d07d02ce7..1ae32fb398 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -27,13 +27,13 @@ def test_basic_eviction( env = neon_env_builder.init_start() client = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = 
TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) # Create a number of layers in the tenant - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") cur.execute( """ @@ -172,15 +172,15 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): env.initial_tenant = tenant_id # update_and_gc relies on this ps_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") log.info("fill with data, creating delta & image layers, some of which are GC'able after") # no particular reason to create the layers like this, but we are sure # not to hit the image_creation_threshold here. - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("create table a (id bigserial primary key, some_value bigint not null)") cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) ps_http.timeline_checkpoint(tenant_id, timeline_id) # Create delta layers, then turn them into image layers. 
@@ -191,19 +191,19 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): for i in range(0, 2): for j in range(0, 3): # create a minimal amount of "delta difficulty" for this table - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("update a set some_value = -some_value + %s", (j,)) - with pg.cursor() as cur: + with endpoint.cursor() as cur: # vacuuming should aid to reuse keys, though it's not really important # with image_creation_threshold=1 which we will use on the last compaction cur.execute("vacuum") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) if i == 1 and j == 2 and k == 1: # last iteration; stop before checkpoint to avoid leaving an inmemory layer - pg.stop_and_destroy() + endpoint.stop_and_destroy() ps_http.timeline_checkpoint(tenant_id, timeline_id) diff --git a/test_runner/regress/test_layer_writers_fail.py b/test_runner/regress/test_layer_writers_fail.py index e8ba0e7d91..d2d85a43e0 100644 --- a/test_runner/regress/test_layer_writers_fail.py +++ b/test_runner/regress/test_layer_writers_fail.py @@ -20,7 +20,7 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): } ) - pg = env.postgres.create_start("main", tenant_id=tenant_id) + pg = env.endpoints.create_start("main", tenant_id=tenant_id) pg.safe_psql_many( [ "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", @@ -64,8 +64,8 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): } ) - pg = env.postgres.create_start("main", tenant_id=tenant_id) - pg.safe_psql_many( + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + endpoint.safe_psql_many( [ "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", """INSERT INTO foo diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index c5a49a6704..8ccfc21cf7 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ 
b/test_runner/regress/test_lsn_mapping.py @@ -12,10 +12,10 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") - pgmain = env.postgres.create_start("test_lsn_mapping") + endpoint_main = env.endpoints.create_start("test_lsn_mapping") log.info("postgres is running on 'test_lsn_mapping' branch") - cur = pgmain.connect().cursor() + cur = endpoint_main.connect().cursor() # Create table, and insert rows, each in a separate transaction # Disable synchronous_commit to make this initialization go faster. # @@ -35,7 +35,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): cur.execute("INSERT INTO foo VALUES (-1)") # Wait until WAL is received by pageserver - wait_for_last_flush_lsn(env, pgmain, env.initial_tenant, new_timeline_id) + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) with env.pageserver.http_client() as client: # Check edge cases: timestamp in the future @@ -61,9 +61,9 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Call get_lsn_by_timestamp to get the LSN # Launch a new read-only node at that LSN, and check that only the rows # that were supposed to be committed at that point in time are visible. 
- pg_here = env.postgres.create_start( - branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn + endpoint_here = env.endpoints.create_start( + branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn ) - assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i + assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i - pg_here.stop_and_destroy() + endpoint_here.stop_and_destroy() diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index a33af9a3b2..ecbce1f8f7 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -123,9 +123,9 @@ def test_metric_collection( # before pageserver, pageserver log might contain such errors in the end. env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") env.neon_cli.create_branch("test_metric_collection") - pg = env.postgres.create_start("test_metric_collection") + endpoint = env.endpoints.create_start("test_metric_collection") - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id")) @@ -158,7 +158,7 @@ def test_metric_collection( # upload some data to remote storage if remote_storage_kind == RemoteStorageKind.LOCAL_FS: - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) pageserver_http = env.pageserver.http_client() pageserver_http.timeline_checkpoint(tenant_id, timeline_id) pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 635beb16b7..fe50969a0a 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -12,10 +12,10 @@ from fixtures.utils import query_scalar def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env = 
neon_simple_env env.neon_cli.create_branch("test_multixact", "empty") - pg = env.postgres.create_start("test_multixact") + endpoint = env.endpoints.create_start("test_multixact") log.info("postgres is running on 'test_multixact' branch") - cur = pg.connect().cursor() + cur = endpoint.connect().cursor() cur.execute( """ CREATE TABLE t1(i int primary key); @@ -32,7 +32,7 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): connections = [] for i in range(nclients): # Do not turn on autocommit. We want to hold the key-share locks. - conn = pg.connect(autocommit=False) + conn = endpoint.connect(autocommit=False) connections.append(conn) # On each iteration, we commit the previous transaction on a connection, @@ -65,10 +65,10 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): # Branch at this point env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) - pg_new = env.postgres.create_start("test_multixact_new") + endpoint_new = env.endpoints.create_start("test_multixact_new") log.info("postgres is running on 'test_multixact_new' branch") - next_multixact_id_new = pg_new.safe_psql( + next_multixact_id_new = endpoint_new.safe_psql( "SELECT next_multixact_id FROM pg_control_checkpoint()" )[0][0] @@ -76,4 +76,4 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): assert next_multixact_id_new == next_multixact_id # Check that we can restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) + check_restored_datadir_content(test_output_dir, env, endpoint) diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index bd0f550ba5..f6629c54f9 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -9,9 +9,11 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por try: env.neon_cli.start() 
env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True) - env.neon_cli.pg_start(node_name="main", port=port_distributor.get_port()) + env.neon_cli.endpoint_start(endpoint_id="ep-main", port=port_distributor.get_port()) env.neon_cli.create_branch(new_branch_name="migration_check") - env.neon_cli.pg_start(node_name="migration_check", port=port_distributor.get_port()) + env.neon_cli.endpoint_start( + endpoint_id="ep-migration_check", port=port_distributor.get_port() + ) finally: env.neon_cli.stop() diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 698ea0e1d3..6e94e15227 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -8,9 +8,9 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_next_xid(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() cur.execute("CREATE TABLE t(x integer)") @@ -19,17 +19,17 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): print(f"iteration {i} / {iterations}") # Kill and restart the pageserver. 
- pg.stop() + endpoint.stop() env.pageserver.stop(immediate=True) env.pageserver.start() - pg.start() + endpoint.start() retry_sleep = 0.5 max_retries = 200 retries = 0 while True: try: - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() cur.execute(f"INSERT INTO t values({i})") conn.close() @@ -48,7 +48,7 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): raise break - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() cur.execute("SELECT count(*) FROM t") assert cur.fetchone() == (iterations,) diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index aa37a2411c..50de99adb5 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -6,9 +6,9 @@ from fixtures.pageserver.http import PageserverHttpClient def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement - res_1 = pg.safe_psql_many( + res_1 = endpoint.safe_psql_many( queries=[ "CREATE TABLE t(key int primary key, value text)", "INSERT INTO t SELECT generate_series(1,100000), 'payload'", @@ -19,14 +19,14 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): assert res_1[-1][0] == (5000050000,) # TODO check detach on live instance log.info("stopping compute") - pg.stop() + endpoint.stop() log.info("compute stopped") - pg.start() - res_2 = pg.safe_psql("SELECT sum(key) FROM t") + endpoint.start() + res_2 = endpoint.safe_psql("SELECT sum(key) FROM t") assert res_2[0] == (5000050000,) - pg.stop() + endpoint.stop() pageserver_http.tenant_detach(tenant_id) diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 9885a811e1..814b9f3de0 100644 --- 
a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -19,10 +19,10 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() env.neon_cli.create_branch("test_old_request_lsn", "main") - pg = env.postgres.create_start("test_old_request_lsn") + endpoint = env.endpoints.create_start("test_old_request_lsn") log.info("postgres is running on test_old_request_lsn branch") - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() # Get the timeline ID of our branch. We need it for the 'do_gc' command diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 07410b64df..cb08b014fd 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -73,17 +73,17 @@ def test_ondemand_download_large_rel( ) env.initial_tenant = tenant - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] + timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] # We want to make sure that the data is large enough that the keyspace is partitioned. 
num_rows = 1000000 - with pg.cursor() as cur: + with endpoint.cursor() as cur: # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") cur.execute( @@ -106,7 +106,7 @@ def test_ondemand_download_large_rel( log.info("uploads have finished") ##### Stop the first pageserver instance, erase all its data - pg.stop() + endpoint.stop() env.pageserver.stop() # remove all the layer files @@ -117,7 +117,7 @@ def test_ondemand_download_large_rel( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - pg.start() + endpoint.start() before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) # Probe in the middle of the table. There's a high chance that the beginning @@ -125,7 +125,7 @@ def test_ondemand_download_large_rel( # from other tables, and with the entry that stores the size of the # relation, so they are likely already downloaded. But the middle of the # table should not have been needed by anything yet. 
- with pg.cursor() as cur: + with endpoint.cursor() as cur: assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1 after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) @@ -167,17 +167,17 @@ def test_ondemand_download_timetravel( ) env.initial_tenant = tenant - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] + timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] lsns = [] table_len = 10000 - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute( f""" CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); @@ -192,7 +192,7 @@ def test_ondemand_download_timetravel( lsns.append((0, current_lsn)) for checkpoint_number in range(1, 20): - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) lsns.append((checkpoint_number, current_lsn)) @@ -204,7 +204,7 @@ def test_ondemand_download_timetravel( client.timeline_checkpoint(tenant_id, timeline_id) ##### Stop the first pageserver instance, erase all its data - env.postgres.stop_all() + env.endpoints.stop_all() # wait until pageserver has successfully uploaded all the data to remote storage wait_for_sk_commit_lsn_to_reach_remote_storage( @@ -251,10 +251,10 @@ def test_ondemand_download_timetravel( num_layers_downloaded = [0] resident_size = [get_resident_physical_size()] for checkpoint_number, lsn in lsns: - pg_old = env.postgres.create_start( - branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn + endpoint_old = env.endpoints.create_start( + branch_name="main", endpoint_id=f"ep-old_lsn_{checkpoint_number}", 
lsn=lsn ) - with pg_old.cursor() as cur: + with endpoint_old.cursor() as cur: # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000 assert ( query_scalar( @@ -331,15 +331,15 @@ def test_download_remote_layers_api( ) env.initial_tenant = tenant - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] + timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] table_len = 10000 - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute( f""" CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); @@ -347,7 +347,7 @@ def test_download_remote_layers_api( """ ) - env.postgres.stop_all() + env.endpoints.stop_all() wait_for_sk_commit_lsn_to_reach_remote_storage( tenant_id, timeline_id, env.safekeepers, env.pageserver @@ -463,8 +463,8 @@ def test_download_remote_layers_api( sk.start() # ensure that all the data is back - pg_old = env.postgres.create_start(branch_name="main") - with pg_old.cursor() as cur: + endpoint_old = env.endpoints.create_start(branch_name="main") + with endpoint_old.cursor() as cur: assert query_scalar(cur, "select count(*) from testtab") == table_len @@ -513,17 +513,17 @@ def test_compaction_downloads_on_demand_without_image_creation( env.initial_tenant = tenant_id pageserver_http = env.pageserver.http_client() - with env.postgres.create_start("main") as pg: + with env.endpoints.create_start("main") as endpoint: # no particular reason to create the layers like this, but we are sure # not to hit the image_creation_threshold here. 
- with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("update a set id = -id") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) @@ -589,32 +589,32 @@ def test_compaction_downloads_on_demand_with_image_creation( env.initial_tenant = tenant_id pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") # no particular reason to create the layers like this, but we are sure # not to hit the image_creation_threshold here. 
- with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("create table a (id bigserial primary key, some_value bigint not null)") cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) for i in range(0, 2): for j in range(0, 3): # create a minimal amount of "delta difficulty" for this table - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("update a set some_value = -some_value + %s", (j,)) - with pg.cursor() as cur: + with endpoint.cursor() as cur: # vacuuming should aid to reuse keys, though it's not really important # with image_creation_threshold=1 which we will use on the last compaction cur.execute("vacuum") - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) if i == 1 and j == 2: # last iteration; stop before checkpoint to avoid leaving an inmemory layer - pg.stop_and_destroy() + endpoint.stop_and_destroy() pageserver_http.timeline_checkpoint(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 5b05989ae4..e86cd18f58 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -150,7 +150,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): env = neon_simple_env with env.pageserver.http_client() as client: tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) + endpoint = env.endpoints.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) # Wait to make sure that we get a latest WAL receiver data. 
# We need to wait here because it's possible that we don't have access to @@ -163,7 +163,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): ) # Make a DB modification then expect getting a new WAL receiver's data. - pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") wait_until( number_of_iterations=5, interval=1, diff --git a/test_runner/regress/test_pageserver_catchup.py b/test_runner/regress/test_pageserver_catchup.py index cba3203591..c16cbcb4ba 100644 --- a/test_runner/regress/test_pageserver_catchup.py +++ b/test_runner/regress/test_pageserver_catchup.py @@ -11,11 +11,11 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) env.neon_cli.create_branch("test_pageserver_catchup_while_compute_down") # Make shared_buffers large to ensure we won't query pageserver while it is down. - pg = env.postgres.create_start( + endpoint = env.endpoints.create_start( "test_pageserver_catchup_while_compute_down", config_lines=["shared_buffers=512MB"] ) - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() # Create table, and insert some rows. 
@@ -59,10 +59,10 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) env.safekeepers[2].start() # restart compute node - pg.stop_and_destroy().create_start("test_pageserver_catchup_while_compute_down") + endpoint.stop_and_destroy().create_start("test_pageserver_catchup_while_compute_down") # Ensure that basebackup went correct and pageserver returned all data - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() cur.execute("SELECT count(*) FROM foo") diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 77db729880..6da5503fb1 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -11,9 +11,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_pageserver_restart") - pg = env.postgres.create_start("test_pageserver_restart") + endpoint = env.endpoints.create_start("test_pageserver_restart") - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() # Create table, and insert some rows. Make it big enough that it doesn't fit in @@ -84,13 +84,13 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): } ) env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant) - pg = env.postgres.create_start("test_pageserver_chaos", tenant_id=tenant) + endpoint = env.endpoints.create_start("test_pageserver_chaos", tenant_id=tenant) # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. 
- with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo (id int, t text, updates int)") cur.execute("CREATE INDEX ON foo (id)") @@ -116,12 +116,12 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): # Update the whole table, then immediately kill and restart the pageserver for i in range(1, 15): - pg.safe_psql("UPDATE foo set updates = updates + 1") + endpoint.safe_psql("UPDATE foo set updates = updates + 1") # This kills the pageserver immediately, to simulate a crash env.pageserver.stop(immediate=True) env.pageserver.start() # Check that all the updates are visible - num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0] + num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] assert num_updates == i * 100000 diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index eab8b112f0..bc3f3f2be4 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -5,7 +5,7 @@ import threading import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres +from fixtures.neon_fixtures import NeonEnv, PgBin # Test restarting page server, while safekeeper and compute node keep @@ -13,7 +13,7 @@ from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgBin): env = neon_simple_env env.neon_cli.create_branch("test_pageserver_restarts") - pg = env.postgres.create_start("test_pageserver_restarts") + endpoint = env.endpoints.create_start("test_pageserver_restarts") n_restarts = 10 scale = 10 @@ -23,13 +23,12 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant" ) - 
def run_pgbench(pg: Postgres): - connstr = pg.connstr() + def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr]) - thread = threading.Thread(target=run_pgbench, args=(pg,), daemon=True) + thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) thread.start() for i in range(n_restarts): diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index 59f19026cc..577bbc21bf 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -2,7 +2,7 @@ import asyncio from io import BytesIO from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, Postgres +from fixtures.neon_fixtures import Endpoint, NeonEnv async def repeat_bytes(buf, repetitions: int): @@ -10,7 +10,7 @@ async def repeat_bytes(buf, repetitions: int): yield buf -async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str): +async def copy_test_data_to_table(endpoint: Endpoint, worker_id: int, table_name: str): buf = BytesIO() for i in range(1000): buf.write( @@ -20,7 +20,7 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) copy_input = repeat_bytes(buf.read(), 5000) - pg_conn = await pg.connect_async() + pg_conn = await endpoint.connect_async() # PgProtocol.connect_async sets statement_timeout to 2 minutes. # That's not enough for this test, on a slow system in debug mode. 
@@ -29,10 +29,10 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) await pg_conn.copy_to_table(table_name, source=copy_input) -async def parallel_load_same_table(pg: Postgres, n_parallel: int): +async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, "copytest") + worker = copy_test_data_to_table(endpoint, worker_id, "copytest") workers.append(asyncio.create_task(worker)) # await all workers @@ -43,13 +43,13 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int): def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env env.neon_cli.create_branch("test_parallel_copy", "empty") - pg = env.postgres.create_start("test_parallel_copy") + endpoint = env.endpoints.create_start("test_parallel_copy") log.info("postgres is running on 'test_parallel_copy' branch") # Create test table - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() cur.execute("CREATE TABLE copytest (i int, t text)") # Run COPY TO to load the table with parallel connections. - asyncio.run(parallel_load_same_table(pg, n_parallel)) + asyncio.run(parallel_load_same_table(endpoint, n_parallel)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 5eb1ebb3de..64625ea4ee 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -24,8 +24,8 @@ def test_pg_regress( env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start("test_pg_regress") - pg.safe_psql("CREATE DATABASE regression") + endpoint = env.endpoints.create_start("test_pg_regress") + endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. 
runpath = test_output_dir / "regress" @@ -49,9 +49,9 @@ def test_pg_regress( ] env_vars = { - "PGPORT": str(pg.default_options["port"]), - "PGUSER": pg.default_options["user"], - "PGHOST": pg.default_options["host"], + "PGPORT": str(endpoint.default_options["port"]), + "PGUSER": endpoint.default_options["user"], + "PGHOST": endpoint.default_options["host"], } # Run the command. @@ -61,10 +61,10 @@ def test_pg_regress( pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql("CHECKPOINT") + endpoint.safe_psql("CHECKPOINT") # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) + check_restored_datadir_content(test_output_dir, env, endpoint) # Run the PostgreSQL "isolation" tests, in src/test/isolation. @@ -85,8 +85,10 @@ def test_isolation( env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - pg = env.postgres.create_start("test_isolation", config_lines=["max_prepared_transactions=100"]) - pg.safe_psql("CREATE DATABASE isolation_regression") + endpoint = env.endpoints.create_start( + "test_isolation", config_lines=["max_prepared_transactions=100"] + ) + endpoint.safe_psql("CREATE DATABASE isolation_regression") # Create some local directories for pg_isolation_regress to run in. runpath = test_output_dir / "regress" @@ -109,9 +111,9 @@ def test_isolation( ] env_vars = { - "PGPORT": str(pg.default_options["port"]), - "PGUSER": pg.default_options["user"], - "PGHOST": pg.default_options["host"], + "PGPORT": str(endpoint.default_options["port"]), + "PGUSER": endpoint.default_options["user"], + "PGHOST": endpoint.default_options["host"], } # Run the command. 
@@ -135,8 +137,8 @@ def test_sql_regress( env.neon_cli.create_branch("test_sql_regress", "empty") # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start("test_sql_regress") - pg.safe_psql("CREATE DATABASE regression") + endpoint = env.endpoints.create_start("test_sql_regress") + endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -160,9 +162,9 @@ def test_sql_regress( ] env_vars = { - "PGPORT": str(pg.default_options["port"]), - "PGUSER": pg.default_options["user"], - "PGHOST": pg.default_options["host"], + "PGPORT": str(endpoint.default_options["port"]), + "PGUSER": endpoint.default_options["user"], + "PGHOST": endpoint.default_options["host"], } # Run the command. @@ -172,8 +174,8 @@ def test_sql_regress( pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql("CHECKPOINT") - pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0] + endpoint.safe_psql("CHECKPOINT") + endpoint.safe_psql("select pg_current_wal_insert_lsn()")[0][0] # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) + check_restored_datadir_content(test_output_dir, env, endpoint) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index fe4fbc0927..c2ea5b332a 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -15,10 +15,10 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() - pgmain = env.postgres.create_start("main") + endpoint_main = env.endpoints.create_start("main") log.info("postgres is running on 'main' branch") - main_pg_conn = pgmain.connect() + main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() timeline = TimelineId(query_scalar(main_cur, "SHOW 
neon.timeline_id")) @@ -62,10 +62,10 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # It must have been preserved by PITR setting env.neon_cli.create_branch("test_pitr_gc_hundred", "main", ancestor_start_lsn=lsn_a) - pg_hundred = env.postgres.create_start("test_pitr_gc_hundred") + endpoint_hundred = env.endpoints.create_start("test_pitr_gc_hundred") # On the 'hundred' branch, we should see only 100 rows - hundred_pg_conn = pg_hundred.connect() + hundred_pg_conn = endpoint_hundred.connect() hundred_cur = hundred_pg_conn.cursor() hundred_cur.execute("SELECT count(*) FROM foo") assert hundred_cur.fetchone() == (100,) diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py index be0eb76ccd..9ebe53fc17 100644 --- a/test_runner/regress/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -21,22 +21,22 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): ) timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant) - pg = env.postgres.create_start("test_trace_replay", "main", tenant) + endpoint = env.endpoints.create_start("test_trace_replay", "main", tenant) - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("create table t (i integer);") cur.execute(f"insert into t values (generate_series(1,{10000}));") cur.execute("select count(*) from t;") - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # wait until pageserver receives that data pageserver_http = env.pageserver.http_client() wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) - # Stop pg so we drop the connection and flush 
the traces - pg.stop() + # Stop postgres so we drop the connection and flush the traces + endpoint.stop() trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline) assert trace_path.exists() diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 47135dc56c..47a06359bb 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -17,10 +17,10 @@ def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation", "empty") - pg = env.postgres.create_start("test_read_validation") + endpoint = env.endpoints.create_start("test_read_validation") log.info("postgres is running on 'test_read_validation' branch") - with closing(pg.connect()) as con: + with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: c.execute("create extension if not exists {};".format(e)) @@ -144,10 +144,10 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - pg = env.postgres.create_start("test_read_validation_neg") + endpoint = env.endpoints.create_start("test_read_validation_neg") log.info("postgres is running on 'test_read_validation_neg' branch") - with closing(pg.connect()) as con: + with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: c.execute("create extension if not exists {};".format(e)) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 69d6e427ce..2d641e36a7 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -15,12 +15,12 @@ from fixtures.utils import query_scalar def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_readonly_node", "empty") - pgmain = env.postgres.create_start("test_readonly_node") + endpoint_main = 
env.endpoints.create_start("test_readonly_node") log.info("postgres is running on 'test_readonly_node' branch") env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*") - main_pg_conn = pgmain.connect() + main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() # Create table, and insert the first 100 rows @@ -61,23 +61,23 @@ def test_readonly_node(neon_simple_env: NeonEnv): log.info("LSN after 400100 rows: " + lsn_c) # Create first read-only node at the point where only 100 rows were inserted - pg_hundred = env.postgres.create_start( - branch_name="test_readonly_node", node_name="test_readonly_node_hundred", lsn=lsn_a + endpoint_hundred = env.endpoints.create_start( + branch_name="test_readonly_node", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a ) # And another at the point where 200100 rows were inserted - pg_more = env.postgres.create_start( - branch_name="test_readonly_node", node_name="test_readonly_node_more", lsn=lsn_b + endpoint_more = env.endpoints.create_start( + branch_name="test_readonly_node", endpoint_id="ep-readonly_node_more", lsn=lsn_b ) # On the 'hundred' node, we should see only 100 rows - hundred_pg_conn = pg_hundred.connect() + hundred_pg_conn = endpoint_hundred.connect() hundred_cur = hundred_pg_conn.cursor() hundred_cur.execute("SELECT count(*) FROM foo") assert hundred_cur.fetchone() == (100,) # On the 'more' node, we should see 100200 rows - more_pg_conn = pg_more.connect() + more_pg_conn = endpoint_more.connect() more_cur = more_pg_conn.cursor() more_cur.execute("SELECT count(*) FROM foo") assert more_cur.fetchone() == (200100,) @@ -87,21 +87,21 @@ def test_readonly_node(neon_simple_env: NeonEnv): assert main_cur.fetchone() == (400100,) # Check creating a node at segment boundary - pg = env.postgres.create_start( + endpoint = env.endpoints.create_start( branch_name="test_readonly_node", - node_name="test_branch_segment_boundary", + endpoint_id="ep-branch_segment_boundary", 
lsn=Lsn("0/3000000"), ) - cur = pg.connect().cursor() + cur = endpoint.connect().cursor() cur.execute("SELECT 1") assert cur.fetchone() == (1,) # Create node at pre-initdb lsn with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail - env.postgres.create_start( + env.endpoints.create_start( branch_name="test_readonly_node", - node_name="test_readonly_node_preinitdb", + endpoint_id="ep-readonly_node_preinitdb", lsn=Lsn("0/42"), ) @@ -111,16 +111,16 @@ def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() env.neon_cli.create_branch("test_timetravel", "empty") - pg = env.postgres.create_start("test_timetravel") + endpoint = env.endpoints.create_start("test_timetravel") client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] + timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] lsns = [] - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute( """ CREATE TABLE testtab(id serial primary key, iteration int, data text); @@ -131,7 +131,7 @@ def test_timetravel(neon_simple_env: NeonEnv): lsns.append((0, current_lsn)) for i in range(1, 5): - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute(f"UPDATE testtab SET iteration = {i}") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) lsns.append((i, current_lsn)) @@ -143,14 +143,14 @@ def test_timetravel(neon_simple_env: NeonEnv): pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver - env.postgres.stop_all() + env.endpoints.stop_all() env.pageserver.stop() env.pageserver.start() for i, lsn in lsns: - pg_old = env.postgres.create_start( - branch_name="test_timetravel", node_name=f"test_old_lsn_{i}", lsn=lsn + endpoint_old = 
env.endpoints.create_start( + branch_name="test_timetravel", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn ) - with pg_old.cursor() as cur: + with endpoint_old.cursor() as cur: assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000 assert query_scalar(cur, f"select count(*) from testtab where iteration<>{i}") == 0 diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 09644eaaa1..76e97a35a4 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -22,10 +22,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") - pg = env.postgres.create_start("test_pageserver_recovery") + endpoint = env.endpoints.create_start("test_pageserver_recovery") log.info("postgres is running on 'test_pageserver_recovery' branch") - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: with env.pageserver.http_client() as pageserver_http: # Create and initialize test table @@ -54,7 +54,7 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("select count(*) from foo") assert cur.fetchone() == (100000,) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 222305f006..6de5f7db04 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -87,17 +87,17 @@ def test_remote_storage_backup_and_restore( env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*") pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() - tenant_id 
= TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) checkpoint_numbers = range(1, 3) for checkpoint_number in checkpoint_numbers: - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute( f""" CREATE TABLE t{checkpoint_number}(id int primary key, data text); @@ -126,7 +126,7 @@ def test_remote_storage_backup_and_restore( ) ##### Stop the first pageserver instance, erase all its data - env.postgres.stop_all() + env.endpoints.stop_all() env.pageserver.stop() dir_to_clear = Path(env.repo_dir) / "tenants" @@ -187,8 +187,8 @@ def test_remote_storage_backup_and_restore( ), "current db Lsn should should not be less than the one stored on remote storage" log.info("select some data, this will cause layers to be downloaded") - pg = env.postgres.create_start("main") - with pg.cursor() as cur: + endpoint = env.endpoints.create_start("main") + with endpoint.cursor() as cur: for checkpoint_number in checkpoint_numbers: assert ( query_scalar(cur, f"SELECT data FROM t{checkpoint_number} WHERE id = {data_id};") @@ -238,9 +238,9 @@ def test_remote_storage_upload_queue_retries( client = env.pageserver.http_client() - pg = env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") def configure_storage_sync_failpoints(action): client.configure_failpoints( @@ -253,7 +253,7 @@ def test_remote_storage_upload_queue_retries( def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data): # create initial set of layers & upload them with failpoints configured - pg.safe_psql_many( + endpoint.safe_psql_many( [ f""" INSERT INTO foo (id, val) @@ 
-266,7 +266,7 @@ def test_remote_storage_upload_queue_retries( "VACUUM foo", ] ) - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) def get_queued_count(file_kind, op_kind): val = client.get_remote_timeline_client_metric( @@ -343,7 +343,7 @@ def test_remote_storage_upload_queue_retries( # but how do we validate the result after restore? env.pageserver.stop(immediate=True) - env.postgres.stop_all() + env.endpoints.stop_all() dir_to_clear = Path(env.repo_dir) / "tenants" shutil.rmtree(dir_to_clear) @@ -357,8 +357,8 @@ def test_remote_storage_upload_queue_retries( wait_until_tenant_active(client, tenant_id) log.info("restarting postgres to validate") - pg = env.postgres.create_start("main", tenant_id=tenant_id) - with pg.cursor() as cur: + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + with endpoint.cursor() as cur: assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000 @@ -394,13 +394,13 @@ def test_remote_timeline_client_calls_started_metric( client = env.pageserver.http_client() - pg = env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data): # create initial set of layers & upload them with failpoints configured - pg.safe_psql_many( + endpoint.safe_psql_many( [ f""" INSERT INTO foo (id, val) @@ -413,7 +413,7 @@ def test_remote_timeline_client_calls_started_metric( "VACUUM foo", ] ) - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) calls_started: Dict[Tuple[str, str], List[int]] = { ("layer", "upload"): [0], @@ -478,7 +478,7 @@ def test_remote_timeline_client_calls_started_metric( ) 
env.pageserver.stop(immediate=True) - env.postgres.stop_all() + env.endpoints.stop_all() dir_to_clear = Path(env.repo_dir) / "tenants" shutil.rmtree(dir_to_clear) @@ -492,8 +492,8 @@ def test_remote_timeline_client_calls_started_metric( wait_until_tenant_active(client, tenant_id) log.info("restarting postgres to validate") - pg = env.postgres.create_start("main", tenant_id=tenant_id) - with pg.cursor() as cur: + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + with endpoint.cursor() as cur: assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000 # ensure that we updated the calls_started download metric @@ -543,17 +543,17 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ) return int(val) if val is not None else val - pg = env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) client.configure_failpoints(("before-upload-layer", "return")) - pg.safe_psql_many( + endpoint.safe_psql_many( [ "CREATE TABLE foo (x INTEGER)", "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g", ] ) - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) # Kick off a checkpoint operation. 
# It will get stuck in remote_client.wait_completion(), since the select query will have @@ -627,8 +627,8 @@ def test_empty_branch_remote_storage_upload( new_branch_name = "new_branch" new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant) - with env.postgres.create_start(new_branch_name, tenant_id=env.initial_tenant) as pg: - wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_branch_timeline_id) + with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint: + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id) wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id) timelines_before_detach = set( @@ -676,8 +676,8 @@ def test_empty_branch_remote_storage_upload_on_restart( new_branch_name = "new_branch" new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant) - with env.postgres.create_start(new_branch_name, tenant_id=env.initial_tenant) as pg: - wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_branch_timeline_id) + with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint: + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id) wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id) env.pageserver.stop() diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 42234bf535..494820ef8e 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -11,10 +11,10 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env env.neon_cli.create_branch("test_subxacts", "empty") - pg = env.postgres.create_start("test_subxacts") + endpoint = env.endpoints.create_start("test_subxacts") log.info("postgres is running on 'test_subxacts' branch") - pg_conn 
= pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() cur.execute( @@ -37,4 +37,4 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): cur.execute("checkpoint") # Check that we can restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) + check_restored_datadir_content(test_output_dir, env, endpoint) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 80d4b99504..28f1a960df 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -43,11 +43,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" tenant, _ = env.neon_cli.create_tenant(conf=new_conf) env.neon_cli.create_timeline("test_tenant_conf", tenant_id=tenant) - env.postgres.create_start( - "test_tenant_conf", - "main", - tenant, - ) + env.endpoints.create_start("test_tenant_conf", "main", tenant) # check the configuration of the default tenant # it should match global configuration diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 58a010951e..847ae4b2b8 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -7,9 +7,9 @@ import asyncpg import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, - Postgres, RemoteStorageKind, available_remote_storages, ) @@ -59,8 +59,8 @@ def test_tenant_reattach( # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() - with env.postgres.create_start("main", tenant_id=tenant_id) as pg: - with pg.cursor() as cur: + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") current_lsn = Lsn(query_scalar(cur, "SELECT 
pg_current_wal_flush_lsn()")) @@ -99,8 +99,8 @@ def test_tenant_reattach( assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn - with env.postgres.create_start("main", tenant_id=tenant_id) as pg: - with pg.cursor() as cur: + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads @@ -157,11 +157,11 @@ async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: T # async guts of test_tenant_reattach_while_bysy test async def reattach_while_busy( - env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId + env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId ): workers = [] for worker_id in range(num_connections): - pg_conn = await pg.connect_async() + pg_conn = await endpoint.connect_async() workers.append(asyncio.create_task(update_table(pg_conn))) workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id))) @@ -238,15 +238,15 @@ def test_tenant_reattach_while_busy( conf={"checkpoint_distance": "100000"} ) - pg = env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - cur = pg.connect().cursor() + cur = endpoint.connect().cursor() cur.execute("CREATE TABLE t(id int primary key, counter int)") cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0") # Run the test - asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id)) + asyncio.run(reattach_while_busy(env, endpoint, pageserver_http, tenant_id)) # Verify table contents assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows @@ -278,9 +278,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() - pg = 
env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement - pg.safe_psql_many( + endpoint.safe_psql_many( queries=[ "CREATE TABLE t(key int primary key, value text)", "INSERT INTO t SELECT generate_series(1,100000), 'payload'", @@ -339,9 +339,9 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() - pg = env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement - pg.safe_psql_many( + endpoint.safe_psql_many( queries=[ "CREATE TABLE t(key int primary key, value text)", "INSERT INTO t SELECT generate_series(1,100000), 'payload'", @@ -388,9 +388,9 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() - pg = env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement - pg.safe_psql_many( + endpoint.safe_psql_many( queries=[ "CREATE TABLE t(key int primary key, value text)", "INSERT INTO t SELECT generate_series(1,100000), 'payload'", @@ -425,18 +425,18 @@ def test_detach_while_attaching( ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show 
neon.timeline_id")[0][0]) # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") cur.execute( """ @@ -477,7 +477,7 @@ def test_detach_while_attaching( # cycle are still running, things could get really confusing.. pageserver_http.tenant_attach(tenant_id) - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("SELECT COUNT(*) FROM foo") @@ -572,14 +572,14 @@ def test_ignored_tenant_download_missing_layers( ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) data_id = 1 data_secret = "very secret secret" - insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg) + insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] tenants_before_ignore.sort() @@ -611,9 +611,9 @@ def test_ignored_tenant_download_missing_layers( ] assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" - pg.stop() - pg.start() - ensure_test_data(data_id, data_secret, pg) + endpoint.stop() + endpoint.start() + ensure_test_data(data_id, data_secret, endpoint) # Tests that it's possible to `load` broken tenants: @@ -631,10 +631,10 @@ def test_ignored_tenant_stays_broken_without_metadata( ) env = 
neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) # ignore the tenant and remove its metadata pageserver_http.tenant_ignore(tenant_id) @@ -666,9 +666,9 @@ def test_load_attach_negatives( ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") with pytest.raises( @@ -707,16 +707,16 @@ def test_ignore_while_attaching( env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") pageserver_http = env.pageserver.http_client() - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) data_id = 1 data_secret = "very secret secret" - insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg) + insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] @@ -754,9 +754,9 @@ def test_ignore_while_attaching( wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) - 
pg.stop() - pg.start() - ensure_test_data(data_id, data_secret, pg) + endpoint.stop() + endpoint.start() + ensure_test_data(data_id, data_secret, endpoint) def insert_test_data( @@ -765,9 +765,9 @@ def insert_test_data( timeline_id: TimelineId, data_id: int, data: str, - pg: Postgres, + endpoint: Endpoint, ): - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute( f""" CREATE TABLE test(id int primary key, secret text); @@ -787,8 +787,8 @@ def insert_test_data( wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) -def ensure_test_data(data_id: int, data: str, pg: Postgres): - with pg.cursor() as cur: +def ensure_test_data(data_id: int, data: str, endpoint: Endpoint): + with endpoint.cursor() as cur: assert ( query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data ), "Should have timeline data back" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 3569ab0c53..180afd88cd 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,11 +7,11 @@ from typing import Any, Dict, Optional, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + Endpoint, NeonBroker, NeonEnv, NeonEnvBuilder, PortDistributor, - Postgres, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -87,20 +87,20 @@ def new_pageserver_service( @contextmanager -def pg_cur(pg): - with closing(pg.connect()) as conn: +def pg_cur(endpoint): + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: yield cur -def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Event): +def load(endpoint: Endpoint, stop_event: threading.Event, load_ok_event: threading.Event): log.info("load started") inserted_ctr = 0 failed = False while not stop_event.is_set(): try: - with pg_cur(pg) as cur: + with pg_cur(endpoint) as cur: 
cur.execute("INSERT INTO load VALUES ('some payload')") inserted_ctr += 1 except: # noqa: E722 @@ -110,7 +110,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve load_ok_event.clear() else: if failed: - with pg_cur(pg) as cur: + with pg_cur(endpoint) as cur: # if we recovered after failure verify that we have correct number of rows log.info("recovering at %s", inserted_ctr) cur.execute("SELECT count(*) FROM load") @@ -124,14 +124,14 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve def populate_branch( - pg: Postgres, + endpoint: Endpoint, tenant_id: TenantId, ps_http: PageserverHttpClient, create_table: bool, expected_sum: Optional[int], ) -> Tuple[TimelineId, Lsn]: # insert some data - with pg_cur(pg) as cur: + with pg_cur(endpoint) as cur: cur.execute("SHOW neon.timeline_id") timeline_id = TimelineId(cur.fetchone()[0]) log.info("timeline to relocate %s", timeline_id) @@ -196,19 +196,19 @@ def check_timeline_attached( def switch_pg_to_new_pageserver( env: NeonEnv, - pg: Postgres, + endpoint: Endpoint, new_pageserver_port: int, tenant_id: TenantId, timeline_id: TimelineId, ) -> Path: - pg.stop() + endpoint.stop() - pg_config_file_path = Path(pg.config_file_path()) + pg_config_file_path = Path(endpoint.config_file_path()) pg_config_file_path.open("a").write( f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'" ) - pg.start() + endpoint.start() timeline_to_detach_local_path = ( env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -226,8 +226,8 @@ def switch_pg_to_new_pageserver( return timeline_to_detach_local_path -def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path: Path): - with pg_cur(pg) as cur: +def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_local_path: Path): + with pg_cur(endpoint) as cur: # check that data is still there cur.execute("SELECT sum(key) FROM t") assert 
cur.fetchone() == (sum_before_migration,) @@ -288,12 +288,12 @@ def test_tenant_relocation( log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) - pg_main = env.postgres.create_start( + ep_main = env.endpoints.create_start( branch_name="test_tenant_relocation_main", tenant_id=tenant_id ) timeline_id_main, current_lsn_main = populate_branch( - pg_main, + ep_main, tenant_id=tenant_id, ps_http=pageserver_http, create_table=True, @@ -306,12 +306,12 @@ def test_tenant_relocation( ancestor_start_lsn=current_lsn_main, tenant_id=tenant_id, ) - pg_second = env.postgres.create_start( + ep_second = env.endpoints.create_start( branch_name="test_tenant_relocation_second", tenant_id=tenant_id ) timeline_id_second, current_lsn_second = populate_branch( - pg_second, + ep_second, tenant_id=tenant_id, ps_http=pageserver_http, create_table=False, @@ -327,14 +327,14 @@ def test_tenant_relocation( if with_load == "with_load": # create load table - with pg_cur(pg_main) as cur: + with pg_cur(ep_main) as cur: cur.execute("CREATE TABLE load(value text)") load_stop_event = threading.Event() load_ok_event = threading.Event() load_thread = threading.Thread( target=load, - args=(pg_main, load_stop_event, load_ok_event), + args=(ep_main, load_stop_event, load_ok_event), daemon=True, # To make sure the child dies when the parent errors ) load_thread.start() @@ -450,7 +450,7 @@ def test_tenant_relocation( old_local_path_main = switch_pg_to_new_pageserver( env, - pg_main, + ep_main, new_pageserver_pg_port, tenant_id, timeline_id_main, @@ -458,7 +458,7 @@ def test_tenant_relocation( old_local_path_second = switch_pg_to_new_pageserver( env, - pg_second, + ep_second, new_pageserver_pg_port, tenant_id, timeline_id_second, @@ -475,11 +475,11 @@ def test_tenant_relocation( interval=1, func=lambda: tenant_exists(pageserver_http, tenant_id), ) - post_migration_check(pg_main, 500500, 
old_local_path_main) - post_migration_check(pg_second, 1001000, old_local_path_second) + post_migration_check(ep_main, 500500, old_local_path_main) + post_migration_check(ep_second, 1001000, old_local_path_second) # ensure that we can successfully read all relations on the new pageserver - with pg_cur(pg_second) as cur: + with pg_cur(ep_second) as cur: cur.execute( """ DO $$ diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 9037fe0045..e8d534142e 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -4,9 +4,9 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, - Postgres, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) @@ -28,12 +28,12 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] assert branch_name == main_branch_name - with env.postgres.create_start( + with env.endpoints.create_start( main_branch_name, tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], - ) as pg: - with pg.cursor() as cur: + ) as endpoint: + with endpoint.cursor() as cur: cur.execute("SELECT 1") row = cur.fetchone() assert row is not None @@ -105,12 +105,12 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: first_branch_timeline_id = env.neon_cli.create_branch("first-branch", tenant_id=tenant_id) - with env.postgres.create_start("first-branch", tenant_id=tenant_id) as pg: - with pg.cursor() as cur: + with env.endpoints.create_start("first-branch", tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: cur.execute( "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" ) - wait_for_last_flush_lsn(env, pg, tenant_id, first_branch_timeline_id) + wait_for_last_flush_lsn(env, endpoint, 
tenant_id, first_branch_timeline_id) size_after_branching = http_client.tenant_size(tenant_id) log.info(f"size_after_branching: {size_after_branching}") @@ -164,12 +164,12 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou assert last_branch is not None - with env.postgres.create_start(last_branch_name, tenant_id=tenant_id) as pg: - with pg.cursor() as cur: + with env.endpoints.create_start(last_branch_name, tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: cur.execute( "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" ) - wait_for_last_flush_lsn(env, pg, tenant_id, last_branch) + wait_for_last_flush_lsn(env, endpoint, tenant_id, last_branch) size_after_writes = http_client.tenant_size(tenant_id) assert size_after_writes > initial_size @@ -194,11 +194,11 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) http_client = env.pageserver.http_client() - with env.postgres.create_start("main", tenant_id=tenant_id) as pg: - initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) - with pg.cursor() as cur: + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + initdb_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id) + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") - flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + flushed_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id) size_before_branching = http_client.tenant_size(tenant_id) @@ -208,10 +208,10 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn ) - with env.postgres.create_start("branch", tenant_id=tenant_id) as pg: - with pg.cursor() as cur: + with 
env.endpoints.create_start("branch", tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") - wait_for_last_flush_lsn(env, pg, tenant_id, branch_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, branch_id) size_after = http_client.tenant_size(tenant_id) @@ -237,17 +237,17 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) http_client = env.pageserver.http_client() - with env.postgres.create_start("main", tenant_id=tenant_id) as pg: - initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) - with pg.cursor() as cur: + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + initdb_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id) + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") - flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + flushed_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id) - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t00 AS SELECT i::bigint n FROM generate_series(0, 2000) s(i)") - wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, main_id) size_before_branching = http_client.tenant_size(tenant_id) @@ -257,10 +257,10 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn ) - with env.postgres.create_start("branch", tenant_id=tenant_id) as pg: - with pg.cursor() as cur: + with env.endpoints.create_start("branch", tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)") - wait_for_last_flush_lsn(env, pg, 
tenant_id, branch_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, branch_id) size_after = http_client.tenant_size(tenant_id) @@ -297,12 +297,12 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa # gc is not expected to change the results for branch_name, amount in [("main", 2000), ("first", 15000), ("second", 3000)]: - with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: - with pg.cursor() as cur: + with env.endpoints.create_start(branch_name, tenant_id=tenant_id) as endpoint: + with endpoint.cursor() as cur: cur.execute( f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {amount}) s(i)" ) - wait_for_last_flush_lsn(env, pg, tenant_id, ids[branch_name]) + wait_for_last_flush_lsn(env, endpoint, tenant_id, ids[branch_name]) size_now = http_client.tenant_size(tenant_id) if latest_size is not None: assert size_now > latest_size @@ -359,7 +359,7 @@ def test_single_branch_get_tenant_size_grows( def get_current_consistent_size( env: NeonEnv, - pg: Postgres, + endpoint: Endpoint, size_debug_file, # apparently there is no public signature for open()... http_client: PageserverHttpClient, tenant_id: TenantId, @@ -368,7 +368,7 @@ def test_single_branch_get_tenant_size_grows( consistent = False size_debug = None - current_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + current_lsn = wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) # We want to make sure we have a self-consistent set of values. 
# Size changes with WAL, so only if both before and after getting # the size of the tenant reports the same WAL insert LSN, we're OK @@ -382,35 +382,35 @@ def test_single_branch_get_tenant_size_grows( size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) size_debug = http_client.tenant_size_debug(tenant_id) - after_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + after_lsn = wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) consistent = current_lsn == after_lsn current_lsn = after_lsn size_debug_file.write(size_debug) return (current_lsn, size) - with env.postgres.create_start( + with env.endpoints.create_start( branch_name, tenant_id=tenant_id, ### autovacuum is disabled to limit WAL logging. config_lines=["autovacuum=off"], - ) as pg: + ) as endpoint: (initdb_lsn, size) = get_current_consistent_size( - env, pg, size_debug_file, http_client, tenant_id, timeline_id + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) collected_responses.append(("INITDB", initdb_lsn, size)) - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL) WITH (fillfactor = 40)") (current_lsn, size) = get_current_consistent_size( - env, pg, size_debug_file, http_client, tenant_id, timeline_id + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) collected_responses.append(("CREATE", current_lsn, size)) batch_size = 100 for i in range(3): - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute( f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)", (i, i), @@ -419,7 +419,7 @@ def test_single_branch_get_tenant_size_grows( i += 1 (current_lsn, size) = get_current_consistent_size( - env, pg, size_debug_file, http_client, tenant_id, timeline_id + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) prev_size = collected_responses[-1][2] @@ -438,7 +438,7 @@ def 
test_single_branch_get_tenant_size_grows( collected_responses.append(("INSERT", current_lsn, size)) while True: - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute( f"UPDATE t0 SET i = -i WHERE i IN (SELECT i FROM t0 WHERE i > 0 LIMIT {batch_size})" ) @@ -448,7 +448,7 @@ def test_single_branch_get_tenant_size_grows( break (current_lsn, size) = get_current_consistent_size( - env, pg, size_debug_file, http_client, tenant_id, timeline_id + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) prev_size = collected_responses[-1][2] @@ -458,7 +458,7 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("UPDATE", current_lsn, size)) while True: - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute(f"DELETE FROM t0 WHERE i IN (SELECT i FROM t0 LIMIT {batch_size})") deleted = cur.rowcount @@ -466,7 +466,7 @@ def test_single_branch_get_tenant_size_grows( break (current_lsn, size) = get_current_consistent_size( - env, pg, size_debug_file, http_client, tenant_id, timeline_id + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) prev_size = collected_responses[-1][2] @@ -475,14 +475,14 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("DELETE", current_lsn, size)) - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") # The size of the tenant should still be as large as before we dropped # the table, because the drop operation can still be undone in the PITR # defined by gc_horizon. 
(current_lsn, size) = get_current_consistent_size( - env, pg, size_debug_file, http_client, tenant_id, timeline_id + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) prev_size = collected_responses[-1][2] @@ -532,16 +532,16 @@ def test_get_tenant_size_with_multiple_branches( http_client = env.pageserver.http_client() - main_pg = env.postgres.create_start(main_branch_name, tenant_id=tenant_id) + main_endpoint = env.endpoints.create_start(main_branch_name, tenant_id=tenant_id) batch_size = 10000 - with main_pg.cursor() as cur: + with main_endpoint.cursor() as cur: cur.execute( f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {batch_size}) s(i)" ) - wait_for_last_flush_lsn(env, main_pg, tenant_id, main_timeline_id) + wait_for_last_flush_lsn(env, main_endpoint, tenant_id, main_timeline_id) size_at_branch = http_client.tenant_size(tenant_id) assert size_at_branch > 0 @@ -552,23 +552,23 @@ def test_get_tenant_size_with_multiple_branches( size_after_first_branch = http_client.tenant_size(tenant_id) assert size_after_first_branch == size_at_branch - first_branch_pg = env.postgres.create_start("first-branch", tenant_id=tenant_id) + first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id) - with first_branch_pg.cursor() as cur: + with first_branch_endpoint.cursor() as cur: cur.execute( f"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, {batch_size}) s(i)" ) - wait_for_last_flush_lsn(env, first_branch_pg, tenant_id, first_branch_timeline_id) + wait_for_last_flush_lsn(env, first_branch_endpoint, tenant_id, first_branch_timeline_id) size_after_growing_first_branch = http_client.tenant_size(tenant_id) assert size_after_growing_first_branch > size_after_first_branch - with main_pg.cursor() as cur: + with main_endpoint.cursor() as cur: cur.execute( f"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 2*{batch_size}) s(i)" ) - wait_for_last_flush_lsn(env, main_pg, tenant_id, main_timeline_id) 
+ wait_for_last_flush_lsn(env, main_endpoint, tenant_id, main_timeline_id) size_after_continuing_on_main = http_client.tenant_size(tenant_id) assert size_after_continuing_on_main > size_after_growing_first_branch @@ -578,31 +578,31 @@ def test_get_tenant_size_with_multiple_branches( size_after_second_branch = http_client.tenant_size(tenant_id) assert size_after_second_branch == size_after_continuing_on_main - second_branch_pg = env.postgres.create_start("second-branch", tenant_id=tenant_id) + second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id) - with second_branch_pg.cursor() as cur: + with second_branch_endpoint.cursor() as cur: cur.execute( f"CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 3*{batch_size}) s(i)" ) - wait_for_last_flush_lsn(env, second_branch_pg, tenant_id, second_branch_timeline_id) + wait_for_last_flush_lsn(env, second_branch_endpoint, tenant_id, second_branch_timeline_id) size_after_growing_second_branch = http_client.tenant_size(tenant_id) assert size_after_growing_second_branch > size_after_second_branch - with second_branch_pg.cursor() as cur: + with second_branch_endpoint.cursor() as cur: cur.execute("DROP TABLE t0") cur.execute("DROP TABLE t1") cur.execute("VACUUM FULL") - wait_for_last_flush_lsn(env, second_branch_pg, tenant_id, second_branch_timeline_id) + wait_for_last_flush_lsn(env, second_branch_endpoint, tenant_id, second_branch_timeline_id) size_after_thinning_branch = http_client.tenant_size(tenant_id) assert ( size_after_thinning_branch > size_after_growing_second_branch ), "tenant_size should grow with dropped tables and full vacuum" - first_branch_pg.stop_and_destroy() - second_branch_pg.stop_and_destroy() - main_pg.stop() + first_branch_endpoint.stop_and_destroy() + second_branch_endpoint.stop_and_destroy() + main_endpoint.stop() env.pageserver.stop() env.pageserver.start() diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 
8c89100745..21e4af4127 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -29,7 +29,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) - pg = env.postgres.create_start(name, tenant_id=tenant) + endpoint = env.endpoints.create_start(name, tenant_id=tenant) assert_tenant_state( client, tenant, @@ -38,7 +38,7 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): ) # Stop compute - pg.stop() + endpoint.stop() # Delete all timelines on all tenants. # diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 2162520217..8026d7f5c6 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -66,17 +66,17 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_1) env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_2) - pg_tenant1 = env.postgres.create_start( + endpoint_tenant1 = env.endpoints.create_start( "test_tenants_normal_work", tenant_id=tenant_1, ) - pg_tenant2 = env.postgres.create_start( + endpoint_tenant2 = env.endpoints.create_start( "test_tenants_normal_work", tenant_id=tenant_2, ) - for pg in [pg_tenant1, pg_tenant2]: - with closing(pg.connect()) as conn: + for endpoint in [endpoint_tenant1, endpoint_tenant2]: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -97,11 +97,11 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): timeline_1 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_1) timeline_2 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_2) - pg_tenant1 = env.postgres.create_start("test_metrics_normal_work", 
tenant_id=tenant_1) - pg_tenant2 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_2) + endpoint_tenant1 = env.endpoints.create_start("test_metrics_normal_work", tenant_id=tenant_1) + endpoint_tenant2 = env.endpoints.create_start("test_metrics_normal_work", tenant_id=tenant_2) - for pg in [pg_tenant1, pg_tenant2]: - with closing(pg.connect()) as conn: + for endpoint in [endpoint_tenant1, endpoint_tenant2]: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") @@ -242,11 +242,15 @@ def test_pageserver_metrics_removed_after_detach( env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1) env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2) - pg_tenant1 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_1) - pg_tenant2 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_2) + endpoint_tenant1 = env.endpoints.create_start( + "test_metrics_removed_after_detach", tenant_id=tenant_1 + ) + endpoint_tenant2 = env.endpoints.create_start( + "test_metrics_removed_after_detach", tenant_id=tenant_2 + ) - for pg in [pg_tenant1, pg_tenant2]: - with closing(pg.connect()) as conn: + for endpoint in [endpoint_tenant1, endpoint_tenant2]: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") @@ -317,7 +321,7 @@ def test_pageserver_with_empty_tenants( ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" # Trigger timeline re-initialization after pageserver restart - env.postgres.stop_all() + env.endpoints.stop_all() env.pageserver.stop() tenant_without_timelines_dir = env.initial_tenant diff --git 
a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 514e2b6fa0..d7c0814570 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -15,10 +15,10 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + Endpoint, LocalFsStorage, NeonEnv, NeonEnvBuilder, - Postgres, RemoteStorageKind, available_remote_storages, wait_for_sk_commit_lsn_to_reach_remote_storage, @@ -32,10 +32,10 @@ from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until -async def tenant_workload(env: NeonEnv, pg: Postgres): +async def tenant_workload(env: NeonEnv, endpoint: Endpoint): await env.pageserver.connect_async() - pg_conn = await pg.connect_async() + pg_conn = await endpoint.connect_async() await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): @@ -49,10 +49,10 @@ async def tenant_workload(env: NeonEnv, pg: Postgres): assert res == i * 1000 -async def all_tenants_workload(env: NeonEnv, tenants_pgs): +async def all_tenants_workload(env: NeonEnv, tenants_endpoints): workers = [] - for _, pg in tenants_pgs: - worker = tenant_workload(env, pg) + for _, endpoint in tenants_endpoints: + worker = tenant_workload(env, endpoint) workers.append(asyncio.create_task(worker)) # await all workers @@ -73,7 +73,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" ) - tenants_pgs: List[Tuple[TenantId, Postgres]] = [] + tenants_endpoints: List[Tuple[TenantId, Endpoint]] = [] for _ in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly @@ -84,18 +84,18 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem ) 
env.neon_cli.create_timeline("test_tenants_many", tenant_id=tenant) - pg = env.postgres.create_start( + endpoint = env.endpoints.create_start( "test_tenants_many", tenant_id=tenant, ) - tenants_pgs.append((tenant, pg)) + tenants_endpoints.append((tenant, endpoint)) - asyncio.run(all_tenants_workload(env, tenants_pgs)) + asyncio.run(all_tenants_workload(env, tenants_endpoints)) # Wait for the remote storage uploads to finish pageserver_http = env.pageserver.http_client() - for tenant, pg in tenants_pgs: - res = pg.safe_psql_many( + for tenant, endpoint in tenants_endpoints: + res = endpoint.safe_psql_many( ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] ) tenant_id = TenantId(res[0][0][0]) @@ -137,15 +137,15 @@ def test_tenants_attached_after_download( ) pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) for checkpoint_number in range(1, 3): - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute( f""" CREATE TABLE t{checkpoint_number}(id int primary key, secret text); @@ -174,7 +174,7 @@ def test_tenants_attached_after_download( ) ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 - env.postgres.stop_all() + env.endpoints.stop_all() wait_for_sk_commit_lsn_to_reach_remote_storage( tenant_id, timeline_id, env.safekeepers, env.pageserver @@ -244,12 +244,12 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.allowed_errors.append(".*No timelines to attach received.*") pageserver_http = env.pageserver.http_client() - pg = env.postgres.create_start("main") + 
endpoint = env.endpoints.create_start("main") - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("CREATE TABLE t1 AS VALUES (123, 'foobar');") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -257,7 +257,7 @@ def test_tenant_redownloads_truncated_file_on_startup( pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) - env.postgres.stop_all() + env.endpoints.stop_all() env.pageserver.stop() timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -313,9 +313,9 @@ def test_tenant_redownloads_truncated_file_on_startup( os.stat(remote_layer_path).st_size == expected_size ), "truncated file should not had been uploaded around re-download" - pg = env.postgres.create_start("main") + endpoint = env.endpoints.create_start("main") - with pg.cursor() as cur: + with endpoint.cursor() as cur: cur.execute("INSERT INTO t1 VALUES (234, 'test data');") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 28da3c5a48..db278d5646 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -12,11 +12,11 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, PgBin, PortDistributor, - Postgres, RemoteStorageKind, VanillaPostgres, wait_for_last_flush_lsn, @@ -38,10 +38,10 @@ def test_timeline_size(neon_simple_env: NeonEnv): client = env.pageserver.http_client() 
wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) - pgmain = env.postgres.create_start("test_timeline_size") + endpoint_main = env.endpoints.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") - with closing(pgmain.connect()) as conn: + with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") cur.execute( @@ -74,10 +74,10 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) - pgmain = env.postgres.create_start("test_timeline_size_createdropdb") + endpoint_main = env.endpoints.create_start("test_timeline_size_createdropdb") log.info("postgres is running on 'test_timeline_size_createdropdb' branch") - with closing(pgmain.connect()) as conn: + with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: res = client.timeline_detail( env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True @@ -89,7 +89,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): ), "no writes should not change the incremental logical size" cur.execute("CREATE DATABASE foodb") - with closing(pgmain.connect(dbname="foodb")) as conn: + with closing(endpoint_main.connect(dbname="foodb")) as conn: with conn.cursor() as cur2: cur2.execute("CREATE TABLE foo (t text)") cur2.execute( @@ -118,7 +118,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): # wait until received_lsn_lag is 0 -def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60): +def wait_for_pageserver_catchup(endpoint_main: Endpoint, polling_interval=1, timeout=60): started_at = time.time() received_lsn_lag = 1 @@ -129,7 +129,7 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 "timed out waiting for pageserver to reach pg_current_wal_flush_lsn()" ) - res = pgmain.safe_psql( + res = 
endpoint_main.safe_psql( """ SELECT pg_size_pretty(pg_cluster_size()), @@ -150,20 +150,20 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) - pgmain = env.postgres.create_start( + endpoint_main = env.endpoints.create_start( "test_timeline_size_quota", # Set small limit for the test config_lines=["neon.max_cluster_size=30MB"], ) log.info("postgres is running on 'test_timeline_size_quota' branch") - with closing(pgmain.connect()) as conn: + with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures? cur.execute("CREATE TABLE foo (t text)") - wait_for_pageserver_catchup(pgmain) + wait_for_pageserver_catchup(endpoint_main) # Insert many rows. This query must fail because of space limit try: @@ -175,7 +175,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): """ ) - wait_for_pageserver_catchup(pgmain) + wait_for_pageserver_catchup(endpoint_main) cur.execute( """ @@ -195,7 +195,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): # drop table to free space cur.execute("DROP TABLE foo") - wait_for_pageserver_catchup(pgmain) + wait_for_pageserver_catchup(endpoint_main) # create it again and insert some rows. 
This query must succeed cur.execute("CREATE TABLE foo (t text)") @@ -207,7 +207,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): """ ) - wait_for_pageserver_catchup(pgmain) + wait_for_pageserver_catchup(endpoint_main) cur.execute("SELECT * from pg_size_pretty(pg_cluster_size())") pg_cluster_size = cur.fetchone() @@ -231,15 +231,15 @@ def test_timeline_initial_logical_size_calculation_cancellation( tenant_id, timeline_id = env.neon_cli.create_tenant() # load in some data - pg = env.postgres.create_start("main", tenant_id=tenant_id) - pg.safe_psql_many( + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + endpoint.safe_psql_many( [ "CREATE TABLE foo (x INTEGER)", "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g", ] ) - wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - pg.stop() + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop() # restart with failpoint inside initial size calculation task env.pageserver.stop() @@ -311,9 +311,9 @@ def test_timeline_physical_size_init( env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") - pg = env.postgres.create_start("test_timeline_physical_size_init") + endpoint = env.endpoints.create_start("test_timeline_physical_size_init") - pg.safe_psql_many( + endpoint.safe_psql_many( [ "CREATE TABLE foo (t text)", """INSERT INTO foo @@ -322,7 +322,7 @@ def test_timeline_physical_size_init( ] ) - wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) # restart the pageserer to force calculating timeline's initial physical size env.pageserver.stop() @@ -355,9 +355,9 @@ def test_timeline_physical_size_post_checkpoint( pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") - pg = 
env.postgres.create_start("test_timeline_physical_size_post_checkpoint") + endpoint = env.endpoints.create_start("test_timeline_physical_size_post_checkpoint") - pg.safe_psql_many( + endpoint.safe_psql_many( [ "CREATE TABLE foo (t text)", """INSERT INTO foo @@ -366,7 +366,7 @@ def test_timeline_physical_size_post_checkpoint( ] ) - wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) assert_physical_size_invariants( @@ -394,7 +394,7 @@ def test_timeline_physical_size_post_compaction( pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") - pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") + endpoint = env.endpoints.create_start("test_timeline_physical_size_post_compaction") # We don't want autovacuum to run on the table, while we are calculating the # physical size, because that could cause a new layer to be created and a @@ -402,7 +402,7 @@ def test_timeline_physical_size_post_compaction( # happens, because of some other background activity or autovacuum on other # tables, we could simply retry the size calculations. It's unlikely that # that would happen more than once.) 
- pg.safe_psql_many( + endpoint.safe_psql_many( [ "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", """INSERT INTO foo @@ -411,7 +411,7 @@ def test_timeline_physical_size_post_compaction( ] ) - wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) # shutdown safekeepers to prevent new data from coming in for sk in env.safekeepers: @@ -446,10 +446,10 @@ def test_timeline_physical_size_post_gc( pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") - pg = env.postgres.create_start("test_timeline_physical_size_post_gc") + endpoint = env.endpoints.create_start("test_timeline_physical_size_post_gc") # Like in test_timeline_physical_size_post_compaction, disable autovacuum - pg.safe_psql_many( + endpoint.safe_psql_many( [ "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", """INSERT INTO foo @@ -458,10 +458,10 @@ def test_timeline_physical_size_post_gc( ] ) - wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - pg.safe_psql( + endpoint.safe_psql( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -469,7 +469,7 @@ def test_timeline_physical_size_post_gc( """ ) - wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) @@ -495,9 +495,9 @@ def test_timeline_size_metrics( pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") - pg = env.postgres.create_start("test_timeline_size_metrics") + 
endpoint = env.endpoints.create_start("test_timeline_size_metrics") - pg.safe_psql_many( + endpoint.safe_psql_many( [ "CREATE TABLE foo (t text)", """INSERT INTO foo @@ -506,7 +506,7 @@ def test_timeline_size_metrics( ] ) - wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) # get the metrics and parse the metric for the current timeline's physical size @@ -558,7 +558,7 @@ def test_timeline_size_metrics( # The sum of the sizes of all databases, as seen by pg_database_size(), should also # be close. Again allow some slack, the logical size metric includes some things like # the SLRUs that are not included in pg_database_size(). - dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0] + dbsize_sum = endpoint.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0] assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024) @@ -592,16 +592,16 @@ def test_tenant_physical_size( n_rows = random.randint(100, 1000) timeline = env.neon_cli.create_branch(f"test_tenant_physical_size_{i}", tenant_id=tenant) - pg = env.postgres.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant) + endpoint = env.endpoints.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant) - pg.safe_psql_many( + endpoint.safe_psql_many( [ "CREATE TABLE foo (t text)", f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g", ] ) - wait_for_last_flush_lsn(env, pg, tenant, timeline) + wait_for_last_flush_lsn(env, endpoint, tenant, timeline) pageserver_http.timeline_checkpoint(tenant, timeline) if remote_storage_kind is not None: @@ -609,7 +609,7 @@ def test_tenant_physical_size( timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) - pg.stop() + endpoint.stop() # ensure that 
tenant_status current_physical size reports sum of timeline current_physical_size tenant_current_physical_size = int( diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py index cfe8a7f067..b1ddd93a40 100644 --- a/test_runner/regress/test_truncate.py +++ b/test_runner/regress/test_truncate.py @@ -27,8 +27,8 @@ def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): ) env.neon_cli.create_timeline("test_truncate", tenant_id=tenant) - pg = env.postgres.create_start("test_truncate", tenant_id=tenant) - cur = pg.connect().cursor() + endpoint = env.endpoints.create_start("test_truncate", tenant_id=tenant) + cur = endpoint.connect().cursor() cur.execute("create table t1(x integer)") cur.execute(f"insert into t1 values (generate_series(1,{n_records}))") cur.execute("vacuum t1") diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index f3b0f9ca06..305271c715 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -10,10 +10,12 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn def test_twophase(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_twophase", "empty") - pg = env.postgres.create_start("test_twophase", config_lines=["max_prepared_transactions=5"]) + endpoint = env.endpoints.create_start( + "test_twophase", config_lines=["max_prepared_transactions=5"] + ) log.info("postgres is running on 'test_twophase' branch") - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() cur.execute("CREATE TABLE foo (t text)") @@ -42,7 +44,7 @@ def test_twophase(neon_simple_env: NeonEnv): # pg_twophase directory and fsynced cur.execute("CHECKPOINT") - twophase_files = os.listdir(pg.pg_twophase_dir_path()) + twophase_files = os.listdir(endpoint.pg_twophase_dir_path()) log.info(twophase_files) assert len(twophase_files) == 4 @@ -50,25 +52,25 @@ def test_twophase(neon_simple_env: NeonEnv): 
cur.execute("ROLLBACK PREPARED 'insert_four'") cur.execute("CHECKPOINT") - twophase_files = os.listdir(pg.pg_twophase_dir_path()) + twophase_files = os.listdir(endpoint.pg_twophase_dir_path()) log.info(twophase_files) assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - fork_at_current_lsn(env, pg, "test_twophase_prepared", "test_twophase") + fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase") # Start compute on the new branch - pg2 = env.postgres.create_start( + endpoint2 = env.endpoints.create_start( "test_twophase_prepared", config_lines=["max_prepared_transactions=5"], ) # Check that we restored only needed twophase files - twophase_files2 = os.listdir(pg2.pg_twophase_dir_path()) + twophase_files2 = os.listdir(endpoint2.pg_twophase_dir_path()) log.info(twophase_files2) assert twophase_files2.sort() == twophase_files.sort() - conn2 = pg2.connect() + conn2 = endpoint2.connect() cur2 = conn2.cursor() # On the new branch, commit one of the prepared transactions, diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index b6b20f1230..708bf0dfeb 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -9,9 +9,9 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn def test_unlogged(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_unlogged", "empty") - pg = env.postgres.create_start("test_unlogged") + endpoint = env.endpoints.create_start("test_unlogged") - conn = pg.connect() + conn = endpoint.connect() cur = conn.cursor() cur.execute("CREATE UNLOGGED TABLE iut (id int);") @@ -20,12 +20,10 @@ def test_unlogged(neon_simple_env: NeonEnv): cur.execute("INSERT INTO iut values (42);") # create another compute to fetch inital empty contents from pageserver - fork_at_current_lsn(env, pg, "test_unlogged_basebackup", "test_unlogged") - pg2 = env.postgres.create_start( - 
"test_unlogged_basebackup", - ) + fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") + endpoint2 = env.endpoints.create_start("test_unlogged_basebackup") - conn2 = pg2.connect() + conn2 = endpoint2.connect() cur2 = conn2.cursor() # after restart table should be empty but valid cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 16a870471b..d8034b31b0 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -10,10 +10,10 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_vm_bit_clear", "empty") - pg = env.postgres.create_start("test_vm_bit_clear") + endpoint = env.endpoints.create_start("test_vm_bit_clear") log.info("postgres is running on 'test_vm_bit_clear' branch") - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() # Install extension containing function needed for test @@ -33,7 +33,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1") # Branch at this point, to test that later - fork_at_current_lsn(env, pg, "test_vm_bit_clear_new", "test_vm_bit_clear") + fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server @@ -63,10 +63,10 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # a dirty VM page is evicted. If the VM bit was not correctly cleared by the # earlier WAL record, the full-page image hides the problem. Starting a new # server at the right point-in-time avoids that full-page image. 
- pg_new = env.postgres.create_start("test_vm_bit_clear_new") + endpoint_new = env.endpoints.create_start("test_vm_bit_clear_new") log.info("postgres is running on 'test_vm_bit_clear_new' branch") - pg_new_conn = pg_new.connect() + pg_new_conn = endpoint_new.connect() cur_new = pg_new_conn.cursor() cur_new.execute( diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index c24c77bb95..77a2987a96 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -16,6 +16,7 @@ from typing import Any, List, Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( + Endpoint, NeonBroker, NeonEnv, NeonEnvBuilder, @@ -23,7 +24,6 @@ from fixtures.neon_fixtures import ( PgBin, PgProtocol, PortDistributor, - Postgres, RemoteStorageKind, RemoteStorageUsers, Safekeeper, @@ -39,11 +39,11 @@ from fixtures.utils import get_dir_size, query_scalar, start_in_background def wait_lsn_force_checkpoint( tenant_id: TenantId, timeline_id: TimelineId, - pg: Postgres, + endpoint: Endpoint, ps: NeonPageserver, pageserver_conn_options={}, ): - lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") auth_token = None @@ -97,10 +97,10 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): branch_names_to_timeline_ids = {} # start postgres on each timeline - pgs = [] + endpoints = [] for branch_name in branch_names: new_timeline_id = env.neon_cli.create_branch(branch_name) - pgs.append(env.postgres.create_start(branch_name)) + endpoints.append(env.endpoints.create_start(branch_name)) branch_names_to_timeline_ids[branch_name] = new_timeline_id tenant_id = env.initial_tenant @@ -160,8 +160,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): # Do everything in different loops to have actions on 
different timelines # interleaved. # create schema - for pg in pgs: - pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + for endpoint in endpoints: + endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") init_m = collect_metrics("after CREATE TABLE") # Populate data for 2/3 timelines @@ -197,16 +197,16 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): metrics_checker = MetricsChecker() metrics_checker.start() - for pg in pgs[:-1]: - pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + for endpoint in endpoints[:-1]: + endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'") metrics_checker.stop() collect_metrics("after INSERT INTO") # Check data for 2/3 timelines - for pg in pgs[:-1]: - res = pg.safe_psql("SELECT sum(key) FROM t") + for endpoint in endpoints[:-1]: + res = endpoint.safe_psql("SELECT sum(key) FROM t") assert res[0] == (5000050000,) final_m = collect_metrics("after SELECT") @@ -233,11 +233,11 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_restarts") - pg = env.postgres.create_start("test_safekeepers_restarts") + endpoint = env.endpoints.create_start("test_safekeepers_restarts") # we rely upon autocommit after each statement # as waiting for acceptors happens there - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() failed_node = None @@ -268,22 +268,22 @@ def test_broker(neon_env_builder: NeonEnvBuilder): ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" ) - pg = env.postgres.create_start("test_broker") - pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + endpoint = env.endpoints.create_start("test_broker") + endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn neon timeline from compute - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = 
TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] log.info(f"statuses is {stat_before}") - pg.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'") + endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'") # force checkpoint in pageserver to advance remote_consistent_lsn - wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver) + wait_lsn_force_checkpoint(tenant_id, timeline_id, endpoint, env.pageserver) # and wait till remote_consistent_lsn propagates to all safekeepers started_at = time.time() @@ -317,26 +317,28 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): ) env.neon_cli.create_branch("test_safekeepers_wal_removal") - pg = env.postgres.create_start("test_safekeepers_wal_removal") + endpoint = env.endpoints.create_start("test_safekeepers_wal_removal") # Note: it is important to insert at least two segments, as currently # control file is synced roughly once in segment range and WAL is not # removed until all horizons are persisted. 
- pg.safe_psql_many( + endpoint.safe_psql_many( [ "CREATE TABLE t(key int primary key, value text)", "INSERT INTO t SELECT generate_series(1,200000), 'payload'", ] ) - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) # force checkpoint to advance remote_consistent_lsn pageserver_conn_options = {} if auth_enabled: pageserver_conn_options["password"] = env.auth_keys.generate_tenant_token(tenant_id) - wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver, pageserver_conn_options) + wait_lsn_force_checkpoint( + tenant_id, timeline_id, endpoint, env.pageserver, pageserver_conn_options + ) # We will wait for first segment removal. Make sure they exist for starter. first_segments = [ @@ -436,13 +438,13 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_wal_backup") - pg = env.postgres.create_start("test_safekeepers_wal_backup") + endpoint = env.endpoints.create_start("test_safekeepers_wal_backup") # learn neon timeline from compute - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() cur.execute("create table t(key int, value text)") @@ -465,9 +467,9 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot # put one of safekeepers down again env.safekeepers[0].stop() # restart postgres - pg.stop_and_destroy().create_start("test_safekeepers_wal_backup") + 
endpoint.stop_and_destroy().create_start("test_safekeepers_wal_backup") # and ensure offloading still works - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("insert into t select generate_series(1,250000), 'payload'") seg_end = Lsn("0/5000000") @@ -491,15 +493,15 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re env = neon_env_builder.init_start() env.neon_cli.create_branch("test_s3_wal_replay") - pg = env.postgres.create_start("test_s3_wal_replay") + endpoint = env.endpoints.create_start("test_s3_wal_replay") # learn neon timeline from compute - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) expected_sum = 0 - with closing(pg.connect()) as conn: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("create table t(key int, value text)") cur.execute("insert into t values (1, 'payload')") @@ -547,7 +549,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) - pg.stop_and_destroy() + endpoint.stop_and_destroy() ps_cli.timeline_delete(tenant_id, timeline_id) # Also delete and manually create timeline on safekeepers -- this tests @@ -609,9 +611,9 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re log.info(f"WAL redo took {elapsed} s") # verify data - pg.create_start("test_s3_wal_replay") + endpoint.create_start("test_s3_wal_replay") - assert pg.safe_psql("select sum(key) from t")[0][0] == expected_sum + assert endpoint.safe_psql("select sum(key) from t")[0][0] == expected_sum class ProposerPostgres(PgProtocol): @@ -762,13 +764,13 @@ 
def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_timeline_status") - pg = env.postgres.create_start("test_timeline_status") + endpoint = env.endpoints.create_start("test_timeline_status") wa = env.safekeepers[0] # learn neon timeline from compute - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) if not auth_enabled: wa_http_cli = wa.http_client() @@ -806,11 +808,11 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert debug_dump_0["timelines_count"] == 1 assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id) - pg.safe_psql("create table t(i int)") + endpoint.safe_psql("create table t(i int)") # ensure epoch goes up after reboot - pg.stop().start() - pg.safe_psql("insert into t values(10)") + endpoint.stop().start() + endpoint.safe_psql("insert into t values(10)") tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) epoch_after_reboot = tli_status.acceptor_epoch @@ -992,8 +994,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) - def execute_payload(pg: Postgres): - with closing(pg.connect()) as conn: + def execute_payload(endpoint: Endpoint): + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -1021,26 +1023,26 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() active_safekeepers = [1, 2, 3] - pg = 
env.postgres.create("test_replace_safekeeper") - pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) - pg.start() + endpoint = env.endpoints.create("test_replace_safekeeper") + endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.start() # learn neon timeline from compute - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) - timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) - execute_payload(pg) + execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Restart all safekeepers to flush everything") env.safekeepers[0].stop(immediate=True) - execute_payload(pg) + execute_payload(endpoint) env.safekeepers[0].start() env.safekeepers[1].stop(immediate=True) - execute_payload(pg) + execute_payload(endpoint) env.safekeepers[1].start() env.safekeepers[2].stop(immediate=True) - execute_payload(pg) + execute_payload(endpoint) env.safekeepers[2].start() env.safekeepers[0].stop(immediate=True) @@ -1050,27 +1052,27 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): env.safekeepers[1].start() env.safekeepers[2].start() - execute_payload(pg) + execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Stop sk1 (simulate failure) and use only quorum of sk2 and sk3") env.safekeepers[0].stop(immediate=True) - execute_payload(pg) + execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Recreate postgres to replace failed sk1 with new sk4") - pg.stop_and_destroy().create("test_replace_safekeeper") + endpoint.stop_and_destroy().create("test_replace_safekeeper") active_safekeepers = [2, 3, 4] env.safekeepers[3].start() - pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) - pg.start() + 
endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.start() - execute_payload(pg) + execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Stop sk2 to require quorum of sk3 and sk4 for normal work") env.safekeepers[1].stop(immediate=True) - execute_payload(pg) + execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -1082,13 +1084,13 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): last_lsn = Lsn(0) # returns pg_wal size in MB - def collect_stats(pg: Postgres, cur, enable_logs=True): + def collect_stats(endpoint: Endpoint, cur, enable_logs=True): nonlocal last_lsn - assert pg.pgdata_dir is not None + assert endpoint.pgdata_dir is not None log.info("executing INSERT to generate WAL") current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - pg_wal_size_mb = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024 + pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024 if enable_logs: lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") @@ -1104,25 +1106,25 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch("test_wal_deleted_after_broadcast") # Adjust checkpoint config to prevent keeping old WAL segments - pg = env.postgres.create_start( + endpoint = env.endpoints.create_start( "test_wal_deleted_after_broadcast", config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], ) - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() cur.execute("CREATE TABLE t(key int, value text)") - collect_stats(pg, cur) + collect_stats(endpoint, cur) # generate WAL to simulate normal workload for i in range(5): generate_wal(cur) - collect_stats(pg, cur) + collect_stats(endpoint, cur) log.info("executing checkpoint") 
cur.execute("CHECKPOINT") - wal_size_after_checkpoint = collect_stats(pg, cur) + wal_size_after_checkpoint = collect_stats(endpoint, cur) # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) assert wal_size_after_checkpoint < 16 * 2.5 @@ -1151,13 +1153,13 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): tenant_id_other, timeline_id_other = env.neon_cli.create_tenant() # Populate branches - pg_1 = env.postgres.create_start("br1") - pg_2 = env.postgres.create_start("br2") - pg_3 = env.postgres.create_start("br3") - pg_4 = env.postgres.create_start("br4") - pg_other = env.postgres.create_start("main", tenant_id=tenant_id_other) - for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]: - with closing(pg.connect()) as conn: + endpoint_1 = env.endpoints.create_start("br1") + endpoint_2 = env.endpoints.create_start("br2") + endpoint_3 = env.endpoints.create_start("br3") + endpoint_4 = env.endpoints.create_start("br4") + endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other) + for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key)") sk = env.safekeepers[0] @@ -1178,14 +1180,14 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. 
- pg_2.stop_and_destroy() - pg_4.stop_and_destroy() + endpoint_2.stop_and_destroy() + endpoint_4.stop_and_destroy() sk.stop() sk.start() # Ensure connections to Safekeeper are established - for pg in [pg_1, pg_3, pg_other]: - with closing(pg.connect()) as conn: + for endpoint in [endpoint_1, endpoint_3, endpoint_other]: + with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("INSERT INTO t (key) VALUES (1)") @@ -1244,6 +1246,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Ensure the other tenant still works sk_http_other.timeline_status(tenant_id_other, timeline_id_other) - with closing(pg_other.connect()) as conn: + with closing(endpoint_other.connect()) as conn: with conn.cursor() as cur: cur.execute("INSERT INTO t (key) VALUES (123)") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index f10a40690e..7debeed140 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -6,7 +6,7 @@ from typing import List, Optional import asyncpg from fixtures.log_helper import getLogger -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") @@ -82,8 +82,10 @@ class WorkerStats(object): log.info("All workers made {} transactions".format(progress)) -async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer): - pg_conn = await pg.connect_async() +async def run_random_worker( + stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer +): + pg_conn = await endpoint.connect_async() log.debug("Started worker {}".format(worker_id)) while stats.running: @@ -141,7 +143,7 @@ async def wait_for_lsn( # consistent. 
async def run_restarts_under_load( env: NeonEnv, - pg: Postgres, + endpoint: Endpoint, acceptors: List[Safekeeper], n_workers=10, n_accounts=100, @@ -154,7 +156,7 @@ async def run_restarts_under_load( # taking into account that this timeout is checked only at the beginning of every iteration. test_timeout_at = time.monotonic() + 5 * 60 - pg_conn = await pg.connect_async() + pg_conn = await endpoint.connect_async() tenant_id = TenantId(await pg_conn.fetchval("show neon.tenant_id")) timeline_id = TimelineId(await pg_conn.fetchval("show neon.timeline_id")) @@ -165,7 +167,7 @@ async def run_restarts_under_load( stats = WorkerStats(n_workers) workers = [] for worker_id in range(n_workers): - worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer) + worker = run_random_worker(stats, endpoint, worker_id, bank.n_accounts, max_transfer) workers.append(asyncio.create_task(worker)) for it in range(iterations): @@ -212,11 +214,11 @@ def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch("test_safekeepers_restarts_under_load") # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long - pg = env.postgres.create_start( + endpoint = env.endpoints.create_start( "test_safekeepers_restarts_under_load", config_lines=["max_replication_write_lag=1MB"] ) - asyncio.run(run_restarts_under_load(env, pg, env.safekeepers)) + asyncio.run(run_restarts_under_load(env, endpoint, env.safekeepers)) # Restart acceptors one by one and test that everything is working as expected @@ -228,7 +230,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch("test_restarts_frequent_checkpoints") # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long - pg = env.postgres.create_start( + endpoint = env.endpoints.create_start( "test_restarts_frequent_checkpoints", config_lines=[ 
"max_replication_write_lag=1MB", @@ -240,11 +242,13 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments # are not removed before broadcasted to all safekeepers, with the help of replication slot - asyncio.run(run_restarts_under_load(env, pg, env.safekeepers, period_time=15, iterations=5)) + asyncio.run( + run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=5) + ) -def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): - pg = Postgres( +def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): + endpoint = Endpoint( env, tenant_id=env.initial_tenant, port=env.port_distributor.get_port(), @@ -253,19 +257,19 @@ def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): check_stop_result=False, ) - # embed current time in node name - node_name = pgdir_name or f"pg_node_{time.time()}" - return pg.create_start( - branch_name=branch, node_name=node_name, config_lines=["log_statement=all"] + # embed current time in endpoint ID + endpoint_id = pgdir_name or f"ep-{time.time()}" + return endpoint.create_start( + branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"] ) async def exec_compute_query( env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None ): - with postgres_create_start(env, branch=branch, pgdir_name=pgdir_name) as pg: + with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint: before_conn = time.time() - conn = await pg.connect_async() + conn = await endpoint.connect_async() res = await conn.fetch(query) await conn.close() after_conn = time.time() @@ -436,8 +440,8 @@ async def check_unavailability( assert bg_query.done() -async def run_unavailability(env: NeonEnv, pg: Postgres): - conn = await pg.connect_async() +async def run_unavailability(env: NeonEnv, endpoint: Endpoint): + conn = 
await endpoint.connect_async() # check basic work with table await conn.execute("CREATE TABLE t(key int primary key, value text)") @@ -462,9 +466,9 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_unavailability") - pg = env.postgres.create_start("test_safekeepers_unavailability") + endpoint = env.endpoints.create_start("test_safekeepers_unavailability") - asyncio.run(run_unavailability(env, pg)) + asyncio.run(run_unavailability(env, endpoint)) @dataclass @@ -493,8 +497,8 @@ async def xmas_garland(safekeepers: List[Safekeeper], data: RaceConditionTest): await asyncio.sleep(1) -async def run_race_conditions(env: NeonEnv, pg: Postgres): - conn = await pg.connect_async() +async def run_race_conditions(env: NeonEnv, endpoint: Endpoint): + conn = await endpoint.connect_async() await conn.execute("CREATE TABLE t(key int primary key, value text)") data = RaceConditionTest(0, False) @@ -525,14 +529,14 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_race_conditions") - pg = env.postgres.create_start("test_safekeepers_race_conditions") + endpoint = env.endpoints.create_start("test_safekeepers_race_conditions") - asyncio.run(run_race_conditions(env, pg)) + asyncio.run(run_race_conditions(env, endpoint)) # Check that pageserver can select safekeeper with largest commit_lsn # and switch if LSN is not updated for some time (NoWalTimeout). 
-async def run_wal_lagging(env: NeonEnv, pg: Postgres): +async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: # use ports 10, 11 and 12 to simulate unavailable safekeepers return ",".join( @@ -542,10 +546,10 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres): ] ) - conn = await pg.connect_async() + conn = await endpoint.connect_async() await conn.execute("CREATE TABLE t(key int primary key, value text)") await conn.close() - pg.stop() + endpoint.stop() n_iterations = 20 n_txes = 10000 @@ -561,11 +565,11 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres): it -= 1 continue - pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) + endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) log.info(f"Iteration {it}: {active_sk}") - pg.start() - conn = await pg.connect_async() + endpoint.start() + conn = await endpoint.connect_async() for _ in range(n_txes): await conn.execute(f"INSERT INTO t values ({i}, 'payload')") @@ -573,11 +577,11 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres): i += 1 await conn.close() - pg.stop() + endpoint.stop() - pg.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers))) - pg.start() - conn = await pg.connect_async() + endpoint.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers))) + endpoint.start() + conn = await endpoint.connect_async() log.info(f"Executed {i-1} queries") @@ -591,6 +595,6 @@ def test_wal_lagging(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_wal_lagging") - pg = env.postgres.create_start("test_wal_lagging") + endpoint = env.endpoints.create_start("test_wal_lagging") - asyncio.run(run_wal_lagging(env, pg)) + asyncio.run(run_wal_lagging(env, endpoint)) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 63d0b46f63..dd944af7eb 100644 --- 
a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -19,9 +19,9 @@ def test_wal_restore( ): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_wal_restore") - pg = env.postgres.create_start("test_wal_restore") - pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + endpoint = env.endpoints.create_start("test_wal_restore") + endpoint.safe_psql("create table t as select generate_series(1,300000)") + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index d6302f8632..7d944bebb3 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -45,9 +45,9 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() - pg = env.postgres.create_start("main", tenant_id=tenant_id) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - pg_conn = pg.connect() + pg_conn = endpoint.connect() cur = pg_conn.cursor() # Create table, and insert some rows. 
Make it big enough that it doesn't fit in diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 0281f4f48b..7e8aef5a5f 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -24,7 +24,7 @@ def test_broken(neon_simple_env: NeonEnv, pg_bin): env = neon_simple_env env.neon_cli.create_branch("test_broken", "empty") - env.postgres.create_start("test_broken") + env.endpoints.create_start("test_broken") log.info("postgres is running") log.info("THIS NEXT COMMAND WILL FAIL:") From 89b5589b1b7c766100491be3d885783f913cdf54 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 13 Apr 2023 14:59:11 +0300 Subject: [PATCH 278/426] Tenant size should never be zero. Simplify test. Looking at the git history of this test, I think "size == 0" used to have a special meaning earlier, but now it should never happen. --- test_runner/regress/test_tenant_size.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index e8d534142e..2d905910f8 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -386,6 +386,7 @@ def test_single_branch_get_tenant_size_grows( consistent = current_lsn == after_lsn current_lsn = after_lsn size_debug_file.write(size_debug) + assert size > 0 return (current_lsn, size) with env.endpoints.create_start( @@ -423,17 +424,15 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - if size == 0: - assert prev_size == 0 - else: - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - # "gc_horizon" is tuned so that it kicks in _after_ the - # insert phase, but before the update phase ends. 
- assert ( - current_lsn - initdb_lsn <= gc_horizon - ), "Tuning of GC window is likely out-of-date" - assert size > prev_size + + # branch start shouldn't be past gc_horizon yet + # thus the size should grow as we insert more data + # "gc_horizon" is tuned so that it kicks in _after_ the + # insert phase, but before the update phase ends. + assert ( + current_lsn - initdb_lsn <= gc_horizon + ), "Tuning of GC window is likely out-of-date" + assert size > prev_size collected_responses.append(("INSERT", current_lsn, size)) @@ -491,6 +490,9 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("DROP", current_lsn, size)) + # Should have gone past gc_horizon, otherwise gc_horizon is too large + assert current_lsn - initdb_lsn > gc_horizon + # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. From 36c20946b44dbd305e9bce3ab0a8bf3e13e4386b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 13 Apr 2023 15:25:09 +0100 Subject: [PATCH 279/426] Verify extensions checksums (#4014) To not be taken by surprise by upstream git re-tag or by malicious activity, let's verify the checksum for extensions we download Also, unify the installation of `pg_graphql` and `pg_tiktoken` with other extensions. 
--- Dockerfile.compute-node | 44 +++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5a223ae432..742f2e18a1 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -60,6 +60,7 @@ RUN apt update && \ # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ + echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ @@ -68,6 +69,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH "/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ + echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ @@ -84,6 +86,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ + echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . 
&& \ mkdir build && \ cd build && \ @@ -104,6 +107,7 @@ RUN apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \ + echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -125,11 +129,13 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ -q -O /tmp/cmake-install.sh \ + && echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \ && chmod u+x /tmp/cmake-install.sh \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ + echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake .. -DCMAKE_BUILD_TYPE=Release && \ @@ -139,6 +145,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz rm -rf build RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \ + echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -156,6 +163,7 @@ FROM build-deps AS unit-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ + echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -176,6 +184,7 @@ FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \ + echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -192,6 +201,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ + echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control @@ -206,6 +216,7 @@ FROM build-deps AS hypopg-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \ + echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -221,6 +232,7 @@ FROM build-deps AS pg-hashids-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ + echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -236,6 +248,7 @@ FROM build-deps AS rum-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ + echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -251,6 +264,7 @@ FROM build-deps AS pgtap-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ + echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -266,6 +280,7 @@ FROM build-deps AS ip4r-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \ + echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -281,6 +296,7 @@ FROM build-deps AS prefix-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \ + echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -296,6 +312,7 @@ FROM build-deps AS hll-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \ + echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -311,6 +328,7 @@ FROM build-deps AS plpgsql-check-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \ + echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -330,6 +348,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH" RUN apt-get update && \ apt-get install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ + echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \ cd build && \ @@ -352,22 +371,25 @@ ENV PATH "/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ + export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ ;; \ "v15") \ export PG_HINT_PLAN_VERSION=15_1_5_0 \ + export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ ;; \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ + echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control ######################################################################################### -# +# # Layer "rust extensions" # This layer is used to build `pgx` deps # @@ -395,7 +417,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux USER root ######################################################################################### -# +# # Layer "pg-jsonschema-pg-build" # Compile "pg_jsonschema" extension # @@ -403,15 +425,17 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build -# there is no release tag yet, but we need it due to the superuser fix in the control file +# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023 +# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5 RUN wget 
https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \ + echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control ######################################################################################### -# +# # Layer "pg-graphql-pg-build" # Compile "pg_graphql" extension # @@ -419,11 +443,13 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421e FROM rust-extensions-build AS pg-graphql-pg-build +# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch) # Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in # pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the # same 1.1 version we've used before. -RUN git clone -b remove-pgx-contrib-spiext --single-branch https://github.com/yrashk/pg_graphql && \ - cd pg_graphql && \ +RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \ + echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \ + mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \ cargo pgx install --release && \ @@ -440,8 +466,10 @@ RUN git clone -b remove-pgx-contrib-spiext --single-branch https://github.com/yr FROM rust-extensions-build AS pg-tiktoken-pg-build -RUN git clone --depth=1 --single-branch https://github.com/kelvich/pg_tiktoken && \ - cd pg_tiktoken && \ +# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023 +RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \ + echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \ + mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ cargo pgx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control From db8dd6f380c097ab03740ed40dccc9e8ab311b4c Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 13 Apr 2023 18:07:29 +0200 Subject: [PATCH 280/426] [compute_ctl] Implement live reconfiguration (#3980) With this commit one can request compute reconfiguration from the running `compute_ctl` with compute in `Running` state by sending a new spec: ```shell curl -d "{\"spec\": $(cat ./compute-spec-new.json)}" http://localhost:3080/configure ``` Internally, we start a separate configurator thread that is waiting on `Condvar` for `ConfigurationPending` compute state in a loop. Then it does reconfiguration, sets compute back to `Running` state and notifies other waiters. It will need some follow-ups, e.g. for retry logic for control-plane requests, but should be useful for testing in the current state. This shouldn't affect any existing environment, since computes are configured in a different way there. 
Resolves neondatabase/cloud#4433 --- compute_tools/src/bin/compute_ctl.rs | 3 ++ compute_tools/src/compute.rs | 42 ++++++++++++++++++++++ compute_tools/src/configurator.rs | 54 ++++++++++++++++++++++++++++ compute_tools/src/http/api.rs | 2 +- compute_tools/src/lib.rs | 1 + compute_tools/src/spec.rs | 17 ++++++--- libs/compute_api/src/responses.rs | 14 +++++++- 7 files changed, 126 insertions(+), 7 deletions(-) create mode 100644 compute_tools/src/configurator.rs diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 633e603f6b..309310407d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -46,6 +46,7 @@ use url::Url; use compute_api::responses::ComputeStatus; use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; +use compute_tools::configurator::launch_configurator; use compute_tools::http::api::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; @@ -175,6 +176,8 @@ fn main() -> Result<()> { // Launch remaining service threads let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); + let _configurator_handle = + launch_configurator(&compute).expect("cannot launch configurator thread"); // Start Postgres let mut delay_exit = false; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 07ede44c9b..6ddfcf86c2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -356,6 +356,48 @@ impl ComputeNode { Ok(()) } + // We could've wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop, so this just seems much easier to do as we already + // have opened connection to Postgres and superuser access. 
+ #[instrument(skip(self, client))] + fn pg_reload_conf(&self, client: &mut Client) -> Result<()> { + client.simple_query("SELECT pg_reload_conf()")?; + Ok(()) + } + + /// Similar to `apply_config()`, but does a bit different sequence of operations, + /// as it's used to reconfigure a previously started and configured Postgres node. + #[instrument(skip(self))] + pub fn reconfigure(&self) -> Result<()> { + let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + + // Write new config + let pgdata_path = Path::new(&self.pgdata); + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?; + + let mut client = Client::connect(self.connstr.as_str(), NoTls)?; + self.pg_reload_conf(&mut client)?; + + // Proceed with post-startup configuration. Note, that order of operations is important. + handle_roles(&spec, &mut client)?; + handle_databases(&spec, &mut client)?; + handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; + handle_grants(&spec, self.connstr.as_str(), &mut client)?; + handle_extensions(&spec, &mut client)?; + + // 'Close' connection + drop(client); + + let unknown_op = "unknown".to_string(); + let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op); + info!( + "finished reconfiguration of compute node for operation {}", + op_id + ); + + Ok(()) + } + #[instrument(skip(self))] pub fn start_compute(&self) -> Result { let compute_state = self.state.lock().unwrap().clone(); diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs new file mode 100644 index 0000000000..a07fd0b8cd --- /dev/null +++ b/compute_tools/src/configurator.rs @@ -0,0 +1,54 @@ +use std::sync::Arc; +use std::thread; + +use anyhow::Result; +use tracing::{error, info, instrument}; + +use compute_api::responses::ComputeStatus; + +use crate::compute::ComputeNode; + +#[instrument(skip(compute))] +fn configurator_main_loop(compute: &Arc) { + info!("waiting for reconfiguration requests"); + loop { + let state = 
compute.state.lock().unwrap(); + let mut state = compute.state_changed.wait(state).unwrap(); + + if state.status == ComputeStatus::ConfigurationPending { + info!("got configuration request"); + state.status = ComputeStatus::Configuration; + compute.state_changed.notify_all(); + drop(state); + + let mut new_status = ComputeStatus::Failed; + if let Err(e) = compute.reconfigure() { + error!("could not configure compute node: {}", e); + } else { + new_status = ComputeStatus::Running; + info!("compute node configured"); + } + + // XXX: used to test that API is blocking + // std::thread::sleep(std::time::Duration::from_millis(10000)); + + compute.set_status(new_status); + } else if state.status == ComputeStatus::Failed { + info!("compute node is now in Failed state, exiting"); + break; + } else { + info!("woken up for compute status: {:?}, sleeping", state.status); + } + } +} + +pub fn launch_configurator(compute: &Arc) -> Result> { + let compute = Arc::clone(compute); + + Ok(thread::Builder::new() + .name("compute-configurator".into()) + .spawn(move || { + configurator_main_loop(&compute); + info!("configurator thread is exited"); + })?) +} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 81d4953345..92d058fbd1 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -155,7 +155,7 @@ async fn handle_configure_request( // ``` { let mut state = compute.state.lock().unwrap(); - if state.status != ComputeStatus::Empty { + if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { let msg = format!( "invalid compute status for configuration request: {:?}", state.status.clone() diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index aee6b53e6a..24811f75ee 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -4,6 +4,7 @@ //! 
pub mod checker; pub mod config; +pub mod configurator; pub mod http; #[macro_use] pub mod logger; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 2350113c39..088f74335a 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,7 +1,7 @@ use std::path::Path; use std::str::FromStr; -use anyhow::Result; +use anyhow::{anyhow, bail, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use tracing::{info, info_span, instrument, span_enabled, warn, Level}; @@ -10,6 +10,7 @@ use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; +use compute_api::responses::ControlPlaneSpecResponse; use compute_api::spec::{ComputeSpec, Database, PgIdent, Role}; /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` @@ -26,13 +27,19 @@ pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result, +} From fd31fafeeeb46ab1f6a68d888dd9b1bf2c1db816 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 13 Apr 2023 09:31:30 -0700 Subject: [PATCH 281/426] Make proxy shutdown when all connections are closed (#3764) ## Describe your changes Makes Proxy start draining connections on SIGTERM. 
## Issue ticket number and link #3333 --- Cargo.lock | 1 + proxy/Cargo.toml | 1 + proxy/src/http/websocket.rs | 3 ++ proxy/src/main.rs | 42 ++++++++++++--------- proxy/src/proxy.rs | 53 +++++++++++++++++++-------- test_runner/fixtures/neon_fixtures.py | 11 ++++++ test_runner/regress/test_proxy.py | 18 +++++++++ 7 files changed, 96 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fc587c57bf..f67311cf09 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2965,6 +2965,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls", + "tokio-util", "tracing", "tracing-opentelemetry", "tracing-subscriber", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index add8b14c95..9d702b29c3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -64,6 +64,7 @@ webpki-roots.workspace = true x509-parser.workspace = true workspace_hack.workspace = true +tokio-util.workspace = true [dev-dependencies] rcgen.workspace = true diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 1757652a90..c7676e8e14 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -22,6 +22,7 @@ use tokio::{ io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}, net::TcpListener, }; +use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; @@ -188,6 +189,7 @@ async fn ws_handler( pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, + cancellation_token: CancellationToken, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("websocket server has shut down"); @@ -231,6 +233,7 @@ pub async fn task_main( hyper::Server::builder(accept::from_stream(tls_listener)) .serve(make_svc) + .with_graceful_shutdown(cancellation_token.cancelled()) .await?; Ok(()) diff --git a/proxy/src/main.rs b/proxy/src/main.rs index c6526e9aff..1fd13c9f68 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -28,6 +28,7 @@ use config::ProxyConfig; use futures::FutureExt; use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; +use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::{project_git_version, sentry_init::init_sentry}; @@ -66,39 +67,48 @@ async fn main() -> anyhow::Result<()> { let proxy_address: SocketAddr = args.get_one::("proxy").unwrap().parse()?; info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; + let cancellation_token = CancellationToken::new(); - let mut tasks = vec![ - tokio::spawn(handle_signals()), - tokio::spawn(http::server::task_main(http_listener)), - tokio::spawn(proxy::task_main(config, proxy_listener)), - tokio::spawn(console::mgmt::task_main(mgmt_listener)), - ]; + let mut client_tasks = vec![tokio::spawn(proxy::task_main( + config, + proxy_listener, + cancellation_token.clone(), + ))]; if let Some(wss_address) = args.get_one::("wss") { let wss_address: SocketAddr = wss_address.parse()?; info!("Starting wss on {wss_address}"); let wss_listener = TcpListener::bind(wss_address).await?; - tasks.push(tokio::spawn(http::websocket::task_main( + client_tasks.push(tokio::spawn(http::websocket::task_main( config, wss_listener, + cancellation_token.clone(), ))); } + let mut tasks = vec![ + tokio::spawn(handle_signals(cancellation_token)), + tokio::spawn(http::server::task_main(http_listener)), + tokio::spawn(console::mgmt::task_main(mgmt_listener)), + ]; + if let Some(metrics_config) = &config.metric_collection { 
tasks.push(tokio::spawn(metrics::task_main(metrics_config))); } - // This combinator will block until either all tasks complete or - // one of them finishes with an error (others will be cancelled). - let tasks = tasks.into_iter().map(flatten_err); - let _: Vec<()> = futures::future::try_join_all(tasks).await?; - + let tasks = futures::future::try_join_all(tasks.into_iter().map(flatten_err)); + let client_tasks = futures::future::try_join_all(client_tasks.into_iter().map(flatten_err)); + tokio::select! { + // We are only expecting an error from these forever tasks + res = tasks => { res?; }, + res = client_tasks => { res?; }, + } Ok(()) } /// Handle unix signals appropriately. -async fn handle_signals() -> anyhow::Result<()> { +async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> { use tokio::signal::unix::{signal, SignalKind}; let mut hangup = signal(SignalKind::hangup())?; @@ -116,11 +126,9 @@ async fn handle_signals() -> anyhow::Result<()> { warn!("received SIGINT, exiting immediately"); bail!("interrupted"); } - // TODO: Don't accept new proxy connections. - // TODO: Shut down once all exisiting connections have been closed. 
_ = terminate.recv() => { - warn!("received SIGTERM, exiting immediately"); - bail!("terminated"); + warn!("received SIGTERM, shutting down once all existing connections have closed"); + token.cancel(); } } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 70fb25474e..9945e3697f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,6 +17,7 @@ use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; @@ -63,6 +64,7 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -72,29 +74,48 @@ pub async fn task_main( // will be inherited by all accepted client sockets. socket2::SockRef::from(&listener).set_keepalive(true)?; + let mut connections = tokio::task::JoinSet::new(); let cancel_map = Arc::new(CancelMap::default()); + loop { - let (socket, peer_addr) = listener.accept().await?; - info!("accepted postgres client connection from {peer_addr}"); + tokio::select! 
{ + accept_result = listener.accept() => { + let (socket, peer_addr) = accept_result?; + info!("accepted postgres client connection from {peer_addr}"); - let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); - tokio::spawn( - async move { - info!("spawned a task for {peer_addr}"); + let session_id = uuid::Uuid::new_v4(); + let cancel_map = Arc::clone(&cancel_map); + connections.spawn( + async move { + info!("spawned a task for {peer_addr}"); - socket - .set_nodelay(true) - .context("failed to set socket option")?; + socket + .set_nodelay(true) + .context("failed to set socket option")?; - handle_client(config, &cancel_map, session_id, socket).await + handle_client(config, &cancel_map, session_id, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. + error!("per-client task finished with an error: {e:#}"); + }), + ); } - .unwrap_or_else(|e| { - // Acknowledge that the task has finished with an error. - error!("per-client task finished with an error: {e:#}"); - }), - ); + _ = cancellation_token.cancelled() => { + drop(listener); + break; + } + } } + // Drain connections + while let Some(res) = connections.join_next().await { + if let Err(e) = res { + if !e.is_panic() && !e.is_cancelled() { + warn!("unexpected error from joined connection task: {e:?}"); + } + } + } + Ok(()) } // TODO(tech debt): unite this with its twin below. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e9f0363843..fb12752d3c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2041,6 +2041,17 @@ class NeonProxy(PgProtocol): self._wait_until_ready() return self + # Sends SIGTERM to the proxy if it has been started + def terminate(self): + if self._popen: + self._popen.terminate() + + # Waits for proxy to exit if it has been opened with a default timeout of + # two seconds. 
Raises subprocess.TimeoutExpired if the proxy does not exit in time. + def wait_for_exit(self, timeout=2): + if self._popen: + self._popen.wait(timeout=2) + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): requests.get(f"http://{self.host}:{self.http_port}/v1/status") diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 51fabdd2a1..ee6349436b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -1,3 +1,5 @@ +import subprocess + import psycopg2 import pytest from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres @@ -134,3 +136,19 @@ def test_forward_params_to_client(static_proxy: NeonProxy): for name, value in cur.fetchall(): # Check that proxy has forwarded this parameter. assert conn.get_parameter_status(name) == value + + +@pytest.mark.timeout(5) +def test_close_on_connections_exit(static_proxy: NeonProxy): + # Open two connections, send SIGTERM, then ensure that proxy doesn't exit + # until after connections close. 
+ with static_proxy.connect(options="project=irrelevant"), static_proxy.connect( + options="project=irrelevant" + ): + static_proxy.terminate() + with pytest.raises(subprocess.TimeoutExpired): + static_proxy.wait_for_exit(timeout=2) + # Ensure we don't accept any more connections + with pytest.raises(psycopg2.OperationalError): + static_proxy.connect(options="project=irrelevant") + static_proxy.wait_for_exit() From b6c7c3290f795d918f726de2df0015bb4a3cd260 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 13 Apr 2023 20:03:24 +0100 Subject: [PATCH 282/426] Bump h2 from 0.3.15 to 0.3.17 (#4020) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f67311cf09..86787b8f6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1572,9 +1572,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" +checksum = "66b91535aa35fea1523ad1b86cb6b53c28e0ae566ba4a460f4457e936cad7c6f" dependencies = [ "bytes", "fnv", From 8895f28dae229d84bf58d3660968b404a3f0c2e0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Apr 2023 12:25:45 +0200 Subject: [PATCH 283/426] make evictions_low_residence_duration_metric_threshold per-tenant (#3949) Before this patch, if a tenant would override its eviction_policy setting to use a lower LayerAccessThreshold::threshold than the `evictions_low_residence_duration_metric_threshold`, the evictions done for that tenant would count towards the `evictions_with_low_residence_duration` metric. That metric is used to identify pre-mature evictions, commonly triggered by disk-usage-based eviction under disk pressure. 
We don't want that to happen for the legitimate evictions of the tenant that overrides its eviction_policy. So, this patch - moves the setting into TenantConf - adds test coverage - updates the staging & prod yamls Forward Compatibility: Software before this patch will ignore the new tenant conf field and use the global one instead. So we can roll back safely. Backward Compatibility: Parsing old configs with software as of this patch will fail in `PageServerConf::parse_and_validate` with error `unrecognized pageserver option 'evictions_low_residence_duration_metric_threshold'` if the option is still present in the global section. We deal with this by updating the configs in Ansible. fixes https://github.com/neondatabase/neon/issues/3940 --- .../ansible/prod.ap-southeast-1.hosts.yaml | 2 +- .github/ansible/prod.eu-central-1.hosts.yaml | 2 +- .github/ansible/prod.us-east-2.hosts.yaml | 2 +- .github/ansible/prod.us-west-2.hosts.yaml | 8 +- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- control_plane/src/pageserver.rs | 6 ++ libs/pageserver_api/src/models.rs | 3 + pageserver/src/config.rs | 40 ++------ pageserver/src/http/routes.rs | 26 +++++ pageserver/src/metrics.rs | 24 ++++- pageserver/src/tenant.rs | 10 ++ pageserver/src/tenant/config.rs | 16 ++++ pageserver/src/tenant/timeline.rs | 41 +++++++- test_runner/fixtures/pageserver/http.py | 7 ++ test_runner/regress/test_tenant_conf.py | 94 ++++++++++++++++++- 16 files changed, 239 insertions(+), 46 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index c185086eef..9c53733491 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "10m" threshold: &default_eviction_threshold "24h" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + 
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 0a0f974ea4..3186519ca8 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "10m" threshold: &default_eviction_threshold "24h" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 4427bb344e..3062475b20 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "10m" threshold: &default_eviction_threshold "24h" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 53626b4f59..9cf847bcb1 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "10m" threshold: &default_eviction_threshold "24h" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" @@ -34,7 +34,7 @@ storage: pageservers: 
hosts: pageserver-0.us-west-2.aws.neon.tech: - ansible_host: i-0d9f6dfae0e1c780d + ansible_host: i-0d9f6dfae0e1c780d pageserver-1.us-west-2.aws.neon.tech: ansible_host: i-0c834be1dddba8b3f pageserver-2.us-west-2.aws.neon.tech: @@ -49,5 +49,5 @@ storage: safekeeper-1.us-west-2.aws.neon.tech: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: - ansible_host: i-042b7efb1729d7966 - + ansible_host: i-042b7efb1729d7966 + diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 34c8e77280..39f5613935 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "20m" threshold: &default_eviction_threshold "20m" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 94f2be83a4..e63ed6e639 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "20m" threshold: &default_eviction_threshold "20m" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 094069e4c0..b700d426ba 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -368,6 +368,9 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'min_resident_size_override' as integer")?, + 
evictions_low_residence_duration_metric_threshold: settings + .remove("evictions_low_residence_duration_metric_threshold") + .map(|x| x.to_string()), }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -445,6 +448,9 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'min_resident_size_override' as an integer")?, + evictions_low_residence_duration_metric_threshold: settings + .get("evictions_low_residence_duration_metric_threshold") + .map(|x| x.to_string()), }) .send()? .error_from_body()?; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a351761f4a..15c37b9453 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -135,6 +135,7 @@ pub struct TenantCreateRequest { // For now, this field is not even documented in the openapi_spec.yml. pub eviction_policy: Option, pub min_resident_size_override: Option, + pub evictions_low_residence_duration_metric_threshold: Option, } #[serde_as] @@ -181,6 +182,7 @@ pub struct TenantConfigRequest { // For now, this field is not even documented in the openapi_spec.yml. 
pub eviction_policy: Option, pub min_resident_size_override: Option, + pub evictions_low_residence_duration_metric_threshold: Option, } impl TenantConfigRequest { @@ -202,6 +204,7 @@ impl TenantConfigRequest { trace_read_requests: None, eviction_policy: None, min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: None, } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 19f0f22815..826cf1aab3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -62,7 +62,6 @@ pub mod defaults { pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; - pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; /// /// Default built-in configuration file. @@ -91,7 +90,6 @@ pub mod defaults { #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' -#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} @@ -108,6 +106,7 @@ pub mod defaults { #pitr_interval = '{DEFAULT_PITR_INTERVAL}' #min_resident_size_override = .. # in bytes +#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' # [remote_storage] @@ -182,9 +181,6 @@ pub struct PageServerConf { pub metric_collection_endpoint: Option, pub synthetic_size_calculation_interval: Duration, - // See the corresponding metric's help string. 
- pub evictions_low_residence_duration_metric_threshold: Duration, - pub disk_usage_based_eviction: Option, pub test_remote_failures: u64, @@ -257,8 +253,6 @@ struct PageServerConfigBuilder { metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, - evictions_low_residence_duration_metric_threshold: BuilderValue, - disk_usage_based_eviction: BuilderValue>, test_remote_failures: BuilderValue, @@ -316,11 +310,6 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), - evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration( - DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")), - disk_usage_based_eviction: Set(None), test_remote_failures: Set(0), @@ -438,10 +427,6 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } - pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) { - self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value); - } - pub fn disk_usage_based_eviction(&mut self, value: Option) { self.disk_usage_based_eviction = BuilderValue::Set(value); } @@ -525,11 +510,6 @@ impl PageServerConfigBuilder { synthetic_size_calculation_interval: self .synthetic_size_calculation_interval .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, - evictions_low_residence_duration_metric_threshold: self - .evictions_low_residence_duration_metric_threshold - .ok_or(anyhow!( - "missing evictions_low_residence_duration_metric_threshold" - ))?, disk_usage_based_eviction: self .disk_usage_based_eviction .ok_or(anyhow!("missing disk_usage_based_eviction"))?, @@ -721,7 +701,6 @@ impl PageServerConf { "synthetic_size_calculation_interval" => 
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), - "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?), "disk_usage_based_eviction" => { tracing::info!("disk_usage_based_eviction: {:#?}", &item); builder.disk_usage_based_eviction( @@ -839,6 +818,13 @@ impl PageServerConf { ); } + if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") { + t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration( + "evictions_low_residence_duration_metric_threshold", + item, + )?); + } + Ok(t_conf) } @@ -877,10 +863,6 @@ impl PageServerConf { cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .unwrap(), disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, @@ -1029,8 +1011,6 @@ cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' -evictions_low_residence_duration_metric_threshold = '444 s' - log_format = 'json' "#; @@ -1087,9 +1067,6 @@ log_format = 'json' synthetic_size_calculation_interval: humantime::parse_duration( defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD - )?, disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, @@ -1144,7 +1121,6 @@ log_format 
= 'json' cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), - evictions_low_residence_duration_metric_threshold: Duration::from_secs(444), disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e7a86e4822..06a97f6dff 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -781,6 +781,19 @@ async fn tenant_create_handler(mut request: Request) -> Result, } impl TimelineMetrics { @@ -656,7 +672,9 @@ impl TimelineMetrics { num_persistent_files_created, persistent_bytes_written, evictions, - evictions_with_low_residence_duration, + evictions_with_low_residence_duration: std::sync::RwLock::new( + evictions_with_low_residence_duration, + ), } } } @@ -675,6 +693,8 @@ impl Drop for TimelineMetrics { let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); self.evictions_with_low_residence_duration + .write() + .unwrap() .remove(tenant_id, timeline_id); for op in STORAGE_TIME_OPERATIONS { let _ = diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d98aa5c566..18a4d7617b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1735,6 +1735,13 @@ impl Tenant { pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { *self.tenant_conf.write().unwrap() = new_tenant_conf; + // Don't hold self.timelines.lock() during the notifies. + // There's no risk of deadlock right now, but there could be if we consolidate + // mutexes in struct Timeline in the future. 
+ let timelines = self.list_timelines(); + for timeline in timelines { + timeline.tenant_conf_updated(); + } } fn create_timeline_data( @@ -2815,6 +2822,9 @@ pub mod harness { trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), min_resident_size_override: tenant_conf.min_resident_size_override, + evictions_low_residence_duration_metric_threshold: Some( + tenant_conf.evictions_low_residence_duration_metric_threshold, + ), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index cdabb23a7b..c01a8aa8c0 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -39,6 +39,7 @@ pub mod defaults { pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; } /// Per-tenant configuration options @@ -93,6 +94,9 @@ pub struct TenantConf { pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, pub min_resident_size_override: Option, + // See the corresponding metric's help string. 
+ #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Duration, } /// Same as TenantConf, but this struct preserves the information about @@ -164,6 +168,11 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub min_resident_size_override: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + #[serde(default)] + pub evictions_low_residence_duration_metric_threshold: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -228,6 +237,9 @@ impl TenantConfOpt { min_resident_size_override: self .min_resident_size_override .or(global_conf.min_resident_size_override), + evictions_low_residence_duration_metric_threshold: self + .evictions_low_residence_duration_metric_threshold + .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), } } } @@ -260,6 +272,10 @@ impl Default for TenantConf { trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 29d8b544cc..b8b1f963e5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -77,6 +77,7 @@ pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; use self::walreceiver::{WalReceiver, WalReceiverConf}; +use super::config::TenantConf; use super::layer_map::BatchedUpdates; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -161,7 +162,7 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - metrics: TimelineMetrics, + 
pub(super) metrics: TimelineMetrics, /// Ensures layers aren't frozen by checkpointer between /// [`Timeline::get_layer_for_write`] and layer reads. @@ -1136,6 +1137,8 @@ impl Timeline { if let Some(delta) = local_layer_residence_duration { self.metrics .evictions_with_low_residence_duration + .read() + .unwrap() .observe(delta); info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period"); } else { @@ -1209,6 +1212,35 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.eviction_policy) } + fn get_evictions_low_residence_duration_metric_threshold( + tenant_conf: &TenantConfOpt, + default_tenant_conf: &TenantConf, + ) -> Duration { + tenant_conf + .evictions_low_residence_duration_metric_threshold + .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) + } + + pub(super) fn tenant_conf_updated(&self) { + // NB: Most tenant conf options are read by background loops, so, + // changes will automatically be picked up. + + // The threshold is embedded in the metric. So, we need to update it. + { + let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( + &self.tenant_conf.read().unwrap(), + &self.conf.default_tenant_conf, + ); + let tenant_id_str = self.tenant_id.to_string(); + let timeline_id_str = self.timeline_id.to_string(); + self.metrics + .evictions_with_low_residence_duration + .write() + .unwrap() + .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold); + } + } + /// Open a Timeline handle. /// /// Loads the metadata for the timeline into memory, but not the layer map. 
@@ -1240,6 +1272,11 @@ impl Timeline { let max_lsn_wal_lag = tenant_conf_guard .max_lsn_wal_lag .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag); + let evictions_low_residence_duration_metric_threshold = + Self::get_evictions_low_residence_duration_metric_threshold( + &tenant_conf_guard, + &conf.default_tenant_conf, + ); drop(tenant_conf_guard); Arc::new_cyclic(|myself| { @@ -1287,7 +1324,7 @@ impl Timeline { &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", - conf.evictions_low_residence_duration_metric_threshold, + evictions_low_residence_duration_metric_threshold, ), ), diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 1e1effe295..69042478c7 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -519,6 +519,13 @@ class PageserverHttpClient(requests.Session): assert res.status_code == 200 + def download_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): + info = self.layer_map_info(tenant_id, timeline_id) + for layer in info.historic_layers: + if not layer.remote: + continue + self.download_layer(tenant_id, timeline_id, layer.layer_file_name) + def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 28f1a960df..1ed86d19a2 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -18,7 +18,11 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = """ page_cache_size=444; wait_lsn_timeout='111 s'; -tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" +[tenant_config] +checkpoint_distance = 10000 +compaction_target_size = 1048576 
+evictions_low_residence_duration_metric_threshold = "2 days" +""" env = neon_env_builder.init_start() http_client = env.pageserver.http_client() @@ -39,6 +43,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" new_conf = { "checkpoint_distance": "20000", "gc_period": "30sec", + "evictions_low_residence_duration_metric_threshold": "42s", } tenant, _ = env.neon_cli.create_tenant(conf=new_conf) @@ -78,6 +83,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert effective_config["gc_period"] == "1h" assert effective_config["image_creation_threshold"] == 3 assert effective_config["pitr_interval"] == "7days" + assert effective_config["evictions_low_residence_duration_metric_threshold"] == "2days" # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: @@ -112,6 +118,9 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert ( new_effective_config["gc_period"] == "30s" ), "Specific 'gc_period' config should override the default value" + assert ( + new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s" + ), "Should override default value" assert new_effective_config["compaction_target_size"] == 1048576 assert new_effective_config["compaction_period"] == "20s" assert new_effective_config["compaction_threshold"] == 10 @@ -125,6 +134,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "gc_period": "80sec", "compaction_period": "80sec", "image_creation_threshold": "2", + "evictions_low_residence_duration_metric_threshold": "23h", } env.neon_cli.config_tenant( tenant_id=tenant, @@ -167,6 +177,9 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert ( updated_effective_config["compaction_period"] == "1m 20s" ), "Specific 'compaction_period' config should override the default value" + assert ( + 
updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h" + ), "Should override default value" assert updated_effective_config["compaction_target_size"] == 1048576 assert updated_effective_config["compaction_threshold"] == 10 assert updated_effective_config["gc_horizon"] == 67108864 @@ -225,6 +238,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert final_effective_config["gc_horizon"] == 67108864 assert final_effective_config["gc_period"] == "1h" assert final_effective_config["image_creation_threshold"] == 3 + assert final_effective_config["evictions_low_residence_duration_metric_threshold"] == "2days" # restart the pageserver and ensure that the config is still correct env.pageserver.stop() @@ -285,3 +299,81 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): # dont test applying the setting here, we have that another test case to show it # we just care about being able to create the file assert len(contents_first) > len(contents_later) + + +def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_live_reconfig_get_evictions_low_residence_duration_metric_threshold", + ) + + env = neon_env_builder.init_start() + assert isinstance(env.remote_storage, LocalFsStorage) + + (tenant_id, timeline_id) = env.neon_cli.create_tenant() + ps_http = env.pageserver.http_client() + + def get_metric(): + metrics = ps_http.get_metrics() + metric = metrics.query_one( + "pageserver_evictions_with_low_residence_duration_total", + { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ) + return metric + + default_value = ps_http.tenant_config(tenant_id).effective_config[ + "evictions_low_residence_duration_metric_threshold" + ] + metric = get_metric() + assert int(metric.value) == 0, "metric is 
present with default value" + + assert default_value == "1day" + + ps_http.download_all_layers(tenant_id, timeline_id) + ps_http.evict_all_layers(tenant_id, timeline_id) + metric = get_metric() + assert int(metric.value) > 0, "metric is updated" + + env.neon_cli.config_tenant( + tenant_id, {"evictions_low_residence_duration_metric_threshold": default_value} + ) + updated_metric = get_metric() + assert int(updated_metric.value) == int( + metric.value + ), "metric is unchanged when setting same value" + + env.neon_cli.config_tenant( + tenant_id, {"evictions_low_residence_duration_metric_threshold": "2day"} + ) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 + assert int(metric.value) == 0 + + ps_http.download_all_layers(tenant_id, timeline_id) + ps_http.evict_all_layers(tenant_id, timeline_id) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 + assert int(metric.value) > 0 + + env.neon_cli.config_tenant( + tenant_id, {"evictions_low_residence_duration_metric_threshold": "2h"} + ) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 + assert int(metric.value) == 0, "value resets if label changes" + + ps_http.download_all_layers(tenant_id, timeline_id) + ps_http.evict_all_layers(tenant_id, timeline_id) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 + assert int(metric.value) > 0, "set a non-zero value for next step" + + env.neon_cli.config_tenant(tenant_id, {}) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default" + assert int(metric.value) == 0, "value resets to default" From 0c82ff3d989e592f9c6ea848e2d3538c42feac7a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 14 Apr 2023 11:46:47 +0100 Subject: [PATCH 284/426] test_runner: add Timeline Inspector to Grafana links (#4021) --- test_runner/fixtures/neon_fixtures.py | 17 
++++++++++++--- test_runner/fixtures/utils.py | 30 ++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fb12752d3c..c6610ba062 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1913,15 +1913,26 @@ def remote_pg( connstr = os.getenv("BENCHMARK_CONNSTR") if connstr is None: raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") + + host = parse_dsn(connstr).get("host", "") + is_neon = host.endswith(".neon.build") + start_ms = int(datetime.utcnow().timestamp() * 1000) with RemotePostgres(pg_bin, connstr) as remote_pg: + if is_neon: + timeline_id = TimelineId(remote_pg.safe_psql("SHOW neon.timeline_id")[0][0]) + yield remote_pg end_ms = int(datetime.utcnow().timestamp() * 1000) - host = parse_dsn(connstr).get("host", "") - if host.endswith(".neon.build"): + if is_neon: # Add 10s margin to the start and end times - allure_add_grafana_links(host, start_ms - 10_000, end_ms + 10_000) + allure_add_grafana_links( + host, + timeline_id, + start_ms - 10_000, + end_ms + 10_000, + ) class PSQL: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 71df74dfba..30acd3f637 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -13,6 +13,7 @@ import allure from psycopg2.extensions import cursor from fixtures.log_helper import log +from fixtures.types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -186,11 +187,15 @@ def allure_attach_from_dir(dir: Path): allure.attach.file(source, name, attachment_type, extension) -DATASOURCE_ID = "xHHYY0dVz" +GRAFANA_URL = "https://neonprod.grafana.net" +GRAFANA_EXPLORE_URL = f"{GRAFANA_URL}/explore" +GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL = f"{GRAFANA_URL}/d/8G011dlnk/timeline-inspector" +LOGS_STAGING_DATASOURCE_ID = "xHHYY0dVz" -def allure_add_grafana_links(host: str, start_ms: 
int, end_ms: int): +def allure_add_grafana_links(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): """Add links to server logs in Grafana to Allure report""" + links = {} # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) @@ -202,12 +207,12 @@ def allure_add_grafana_links(host: str, start_ms: int, end_ms: int): } params: Dict[str, Any] = { - "datasource": DATASOURCE_ID, + "datasource": LOGS_STAGING_DATASOURCE_ID, "queries": [ { "expr": "", "refId": "A", - "datasource": {"type": "loki", "uid": DATASOURCE_ID}, + "datasource": {"type": "loki", "uid": LOGS_STAGING_DATASOURCE_ID}, "editorMode": "code", "queryType": "range", } @@ -220,8 +225,23 @@ def allure_add_grafana_links(host: str, start_ms: int, end_ms: int): for name, expr in expressions.items(): params["queries"][0]["expr"] = expr query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) - link = f"https://neonprod.grafana.net/explore?{query_string}" + links[name] = f"{GRAFANA_EXPLORE_URL}?{query_string}" + timeline_qs = urlencode( + { + "orgId": 1, + "var-environment": "victoria-metrics-aws-dev", + "var-timeline_id": timeline_id, + "var-endpoint_id": endpoint_id, + "var-log_datasource": "grafanacloud-neonstaging-logs", + "from": start_ms, + "to": end_ms, + } + ) + link = f"{GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL}?{timeline_qs}" + links["Timeline Inspector"] = link + + for name, link in links.items(): allure.dynamic.link(link, name=name) log.info(f"{name}: {link}") From 589cf1ed21148035d033701ee911fee79a4cea6f Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 14 Apr 2023 13:05:07 +0200 Subject: [PATCH 285/426] [compute_ctl] Do not create availability checker data on each start (#4019) Initially, the idea was to ensure that when we come and check data availability, a special service table already contains one row. So if we lose it for some reason, we will error out.
Yet, to do availability check we anyway start compute first! So it doesn't really add some value, but we affect each compute start as we update at least one row in the database. Also this writes some WAL, so if timeline is close to `neon.max_cluster_size` it could prevent compute from starting up. That said, do CREATE TABLE IF NOT EXISTS + UPSERT right in the `/check_writability` handler. --- compute_tools/src/checker.rs | 54 +++++++++++++++++------------------ compute_tools/src/compute.rs | 2 -- compute_tools/src/http/api.rs | 5 +++- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index b8413de516..b6a287bdeb 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,12 +1,28 @@ use anyhow::{anyhow, Result}; -use postgres::Client; use tokio_postgres::NoTls; use tracing::{error, instrument}; use crate::compute::ComputeNode; +/// Update timestamp in a row in a special service table to check +/// that we can actually write some data in this particular timeline. +/// Create table if it's missing. #[instrument(skip_all)] -pub fn create_writability_check_data(client: &mut Client) -> Result<()> { +pub async fn check_writability(compute: &ComputeNode) -> Result<()> { + // Connect to the database. + let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; + if client.is_closed() { + return Err(anyhow!("connection to postgres closed")); + } + + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + error!("connection error: {}", e); + } + }); + let query = " CREATE TABLE IF NOT EXISTS health_check ( id serial primary key, @@ -15,31 +31,15 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> { INSERT INTO health_check VALUES (1, now()) ON CONFLICT (id) DO UPDATE SET updated_at = now();"; - let result = client.simple_query(query)?; - if result.len() < 2 { - return Err(anyhow::format_err!("executed {} queries", result.len())); - } - Ok(()) -} - -#[instrument(skip_all)] -pub async fn check_writability(compute: &ComputeNode) -> Result<()> { - let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; - if client.is_closed() { - return Err(anyhow!("connection to postgres closed")); - } - tokio::spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - let result = client - .simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;") - .await?; - - if result.len() != 1 { - return Err(anyhow!("statement can't be executed")); + + let result = client.simple_query(query).await?; + + if result.len() != 2 { + return Err(anyhow::format_err!( + "expected 2 query results, but got {}", + result.len() + )); } + Ok(()) } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 6ddfcf86c2..51de2b6e0a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -32,7 +32,6 @@ use utils::lsn::Lsn; use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::ComputeSpec; -use crate::checker::create_writability_check_data; use crate::config; use crate::pg_helpers::*; use crate::spec::*; @@ -342,7 +341,6 @@ impl ComputeNode { handle_databases(spec, &mut client)?; handle_role_deletions(spec, self.connstr.as_str(), &mut client)?; handle_grants(spec, self.connstr.as_str(), &mut client)?; - create_writability_check_data(&mut client)?; 
handle_extensions(spec, &mut client)?; // 'Close' connection diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 92d058fbd1..3ca688de69 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -85,7 +85,10 @@ async fn routes(req: Request, compute: &Arc) -> Response Response::new(Body::from("true")), - Err(e) => Response::new(Body::from(e.to_string())), + Err(e) => { + error!("check_writability failed: {}", e); + Response::new(Body::from(e.to_string())) + } } } From 017d3a390dd17313612d23eb7d757635d42f6365 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 14 Apr 2023 14:00:13 +0300 Subject: [PATCH 286/426] Compile postgres with lz4 and zstd support --- Dockerfile.compute-node | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 742f2e18a1..2b1d8d63ae 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -12,7 +12,7 @@ FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev libxslt1-dev + libicu-dev libxslt1-dev liblz4-dev libzstd-dev ######################################################################################### # @@ -24,8 +24,13 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \ - --with-libxml --with-libxslt && \ + export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ + --with-icu --with-libxml --with-libxslt --with-lz4" && \ + if [ "${PG_VERSION}" != "v14" ]; then \ + # zstd is available only from PG15 + export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \ + fi && \ + eval $CONFIGURE_CMD && \ make 
MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers @@ -565,13 +570,17 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # Install: # libreadline8 for psql # libicu67, locales for collations (including ICU and plpgsql_check) +# liblz4-1 for lz4 # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS # libxml2, libxslt1.1 for xml2 +# libzstd1 for zstd RUN apt update && \ apt install --no-install-recommends -y \ + gdb \ locales \ libicu67 \ + liblz4-1 \ libreadline8 \ libossp-uuid16 \ libgeos-c1v5 \ @@ -581,7 +590,7 @@ RUN apt update && \ libsfcgal1 \ libxml2 \ libxslt1.1 \ - gdb && \ + libzstd1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 From 75ea8106ece7c7a8f1abedbeb382f322e070c686 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 14 Apr 2023 14:01:24 +0300 Subject: [PATCH 287/426] Add `procps` into compute containers --- Dockerfile.compute-node | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 2b1d8d63ae..229e09aa98 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -590,7 +590,8 @@ RUN apt update && \ libsfcgal1 \ libxml2 \ libxslt1.1 \ - libzstd1 && \ + libzstd1 \ + procps && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 From 5ffa20dd822f989019df7013db5459e8238ab1a2 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 14 Apr 2023 13:54:34 +0300 Subject: [PATCH 288/426] [proxy] adjust proxy sleep timeout --- .../dev-eu-west-1-zeta.neon-proxy-scram.yaml | 4 ++-- ...v-us-east-2-beta.neon-proxy-scram-legacy.yaml | 16 ++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-scram.yaml | 5 +++-- 
...-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 4 ++-- ...prod-eu-central-1-gamma.neon-proxy-scram.yaml | 4 ++-- .../prod-us-east-2-delta.neon-proxy-scram.yaml | 4 ++-- ...od-us-west-2-eta.neon-proxy-scram-legacy.yaml | 4 ++-- .../prod-us-west-2-eta.neon-proxy-scram.yaml | 4 ++-- 8 files changed, 31 insertions(+), 14 deletions(-) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index 2307856464..a8567665d3 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -7,13 +7,13 @@ deploymentStrategy: maxSurge: 100% maxUnavailable: 50% -# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# Delay the kill signal by 5 minutes (5 * 60) # The pod(s) will stay in Terminating, keeps the existing connections # but doesn't receive new ones containerLifecycle: preStop: exec: - command: ["/bin/sh", "-c", "sleep 604800"] + command: ["/bin/sh", "-c", "sleep 300"] terminationGracePeriodSeconds: 604800 image: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index feee1b369a..46cfdd2e69 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. 
+deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 5 minutes (5 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 300"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 2a8f028f3b..fdd869c122 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -7,15 +7,16 @@ deploymentStrategy: maxSurge: 100% maxUnavailable: 50% -# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# Delay the kill signal by 5 minutes (5 * 60) # The pod(s) will stay in Terminating, keeps the existing connections # but doesn't receive new ones containerLifecycle: preStop: exec: - command: ["/bin/sh", "-c", "sleep 604800"] + command: ["/bin/sh", "-c", "sleep 300"] terminationGracePeriodSeconds: 604800 + image: repository: neondatabase/neon diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 5a98217bae..6088d62fba 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -7,13 +7,13 @@ deploymentStrategy: maxSurge: 100% maxUnavailable: 50% -# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# Delay the kill signal by 5 minutes (5 * 60) # The pod(s) will stay in Terminating, keeps the existing connections # but doesn't receive new ones containerLifecycle: preStop: exec: - command: ["/bin/sh", "-c", "sleep 604800"] + command: ["/bin/sh", "-c", "sleep 300"] terminationGracePeriodSeconds: 604800 diff --git 
a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index a9ee49d82f..7d26f2e02f 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -7,13 +7,13 @@ deploymentStrategy: maxSurge: 100% maxUnavailable: 50% -# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# Delay the kill signal by 5 minutes (5 * 60) # The pod(s) will stay in Terminating, keeps the existing connections # but doesn't receive new ones containerLifecycle: preStop: exec: - command: ["/bin/sh", "-c", "sleep 604800"] + command: ["/bin/sh", "-c", "sleep 300"] terminationGracePeriodSeconds: 604800 diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 239a9911c7..ae239fd3c1 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -7,13 +7,13 @@ deploymentStrategy: maxSurge: 100% maxUnavailable: 50% -# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# Delay the kill signal by 5 minutes (5 * 60) # The pod(s) will stay in Terminating, keeps the existing connections # but doesn't receive new ones containerLifecycle: preStop: exec: - command: ["/bin/sh", "-c", "sleep 604800"] + command: ["/bin/sh", "-c", "sleep 300"] terminationGracePeriodSeconds: 604800 diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index a186fb833f..7378e8abda 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -7,13 +7,13 @@ deploymentStrategy: maxSurge: 100% maxUnavailable: 50% -# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# Delay the kill signal by 5 
minutes (5 * 60) # The pod(s) will stay in Terminating, keeps the existing connections # but doesn't receive new ones containerLifecycle: preStop: exec: - command: ["/bin/sh", "-c", "sleep 604800"] + command: ["/bin/sh", "-c", "sleep 300"] terminationGracePeriodSeconds: 604800 diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index c987ae236a..d9d458f081 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -7,13 +7,13 @@ deploymentStrategy: maxSurge: 100% maxUnavailable: 50% -# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# Delay the kill signal by 5 minutes (5 * 60) # The pod(s) will stay in Terminating, keeps the existing connections # but doesn't receive new ones containerLifecycle: preStop: exec: - command: ["/bin/sh", "-c", "sleep 604800"] + command: ["/bin/sh", "-c", "sleep 300"] terminationGracePeriodSeconds: 604800 From ebea29841517cd189c800e09e67a85202376dcdc Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 14 Apr 2023 18:28:54 +0300 Subject: [PATCH 289/426] Update most of the dependencies to their latest versions (#4026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/neondatabase/neon/pull/3991 Brings the changes back with the right way to use new `toml_edit` to deserialize values and a unit test for this. All non-trivial updates extracted into separate commits, also `cargo hakari` data and its manifest format were updated. 3 sets of crates remain unupdated: * `base64` — touches proxy in a lot of places and changed its api (by 0.21 version) quite strongly since our version (0.13).
* `opentelemetry` and `opentelemetry-*` crates ``` error[E0308]: mismatched types --> libs/tracing-utils/src/http.rs:65:21 | 65 | span.set_parent(parent_ctx); | ---------- ^^^^^^^^^^ expected struct `opentelemetry_api::context::Context`, found struct `opentelemetry::Context` | | | arguments to this method are incorrect | = note: struct `opentelemetry::Context` and struct `opentelemetry_api::context::Context` have similar names, but are actually distinct types note: struct `opentelemetry::Context` is defined in crate `opentelemetry_api` --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/opentelemetry_api-0.19.0/src/context.rs:77:1 | 77 | pub struct Context { | ^^^^^^^^^^^^^^^^^^ note: struct `opentelemetry_api::context::Context` is defined in crate `opentelemetry_api` --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/opentelemetry_api-0.18.0/src/context.rs:77:1 | 77 | pub struct Context { | ^^^^^^^^^^^^^^^^^^ = note: perhaps two different versions of crate `opentelemetry_api` are being used? note: associated function defined here --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-opentelemetry-0.18.0/src/span_ext.rs:43:8 | 43 | fn set_parent(&self, cx: Context); | ^^^^^^^^^^ For more information about this error, try `rustc --explain E0308`. error: could not compile `tracing-utils` due to previous error warning: build failed, waiting for other jobs to finish... error: could not compile `tracing-utils` due to previous error ``` `tracing-opentelemetry` of version `0.19` is not yet released, that is supposed to have the update we need. 
* similarly, `rustls`, `tokio-rustls`, `rustls-*` and `tls-listener` crates have similar issue: ``` error[E0308]: mismatched types --> libs/postgres_backend/tests/simple_select.rs:112:78 | 112 | let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); | --------------------------------------------- ^^^^^^^^^^ expected struct `rustls::client::client_conn::ClientConfig`, found struct `ClientConfig` | | | arguments to this function are incorrect | = note: struct `ClientConfig` and struct `rustls::client::client_conn::ClientConfig` have similar names, but are actually distinct types note: struct `ClientConfig` is defined in crate `rustls` --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/rustls-0.21.0/src/client/client_conn.rs:125:1 | 125 | pub struct ClientConfig { | ^^^^^^^^^^^^^^^^^^^^^^^ note: struct `rustls::client::client_conn::ClientConfig` is defined in crate `rustls` --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/rustls-0.20.8/src/client/client_conn.rs:91:1 | 91 | pub struct ClientConfig { | ^^^^^^^^^^^^^^^^^^^^^^^ = note: perhaps two different versions of crate `rustls` are being used? note: associated function defined here --> /Users/someonetoignore/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-postgres-rustls-0.9.0/src/lib.rs:23:12 | 23 | pub fn new(config: ClientConfig) -> Self { | ^^^ For more information about this error, try `rustc --explain E0308`. error: could not compile `postgres_backend` due to previous error warning: build failed, waiting for other jobs to finish... ``` * aws crates: I could not make new API to work with bucket endpoint overload, and console e2e tests failed. 
Our other tests passed; further investigation should be done in https://github.com/neondatabase/neon/issues/4008 --- .config/hakari.toml | 2 +- Cargo.lock | 1406 +++++++++++------ Cargo.toml | 26 +- libs/consumption_metrics/Cargo.toml | 17 +- libs/postgres_ffi/build.rs | 6 +- libs/remote_storage/tests/pagination_tests.rs | 7 +- libs/tracing-utils/Cargo.toml | 3 +- libs/utils/Cargo.toml | 2 +- pageserver/src/config.rs | 90 +- pageserver/src/page_service.rs | 2 +- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/config.rs | 4 +- .../tenant/remote_timeline_client/upload.rs | 2 +- storage_broker/src/bin/storage_broker.rs | 3 +- trace/Cargo.toml | 2 - workspace_hack/Cargo.toml | 9 +- 16 files changed, 1023 insertions(+), 560 deletions(-) diff --git a/.config/hakari.toml b/.config/hakari.toml index 12d2d1bf9c..15b939e86f 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -4,7 +4,7 @@ hakari-package = "workspace_hack" # Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above. -dep-format-version = "3" +dep-format-version = "4" # Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended. # Hakari works much better with the new feature resolver.
diff --git a/Cargo.lock b/Cargo.lock index 86787b8f6a..a18f4490da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -64,28 +64,77 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] -name = "anyhow" -version = "1.0.68" +name = "anstream" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61" +checksum = "9e579a7752471abc2a8268df8b20005e3eadd975f585398f17efcfd8d4927371" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is-terminal", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" + +[[package]] +name = "anstyle-parse" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcd8291a340dd8ac70e18878bc4501dd7b4ff970cfa21c207d36ece51ea88fd" +dependencies = [ + "anstyle", + "windows-sys 0.48.0", +] + +[[package]] +name = "anyhow" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" dependencies = [ "backtrace", ] [[package]] name = "archery" -version = "0.4.0" +version = "0.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +checksum = "b6cd774058b1b415c4855d8b86436c04bf050c003156fe24bc326fb3fe75c343" dependencies = [ "static_assertions", ] [[package]] name = "asn1-rs" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf6690c370453db30743b373a60ba498fc0d6d83b11f4abfd87a84a075db5dd4" +checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -105,7 +154,7 @@ checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "synstructure", ] @@ -117,46 +166,47 @@ checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] name = "async-stream" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" dependencies = [ "async-stream-impl", "futures-core", + "pin-project-lite", ] [[package]] name = "async-stream-impl" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "async-trait" -version = "0.1.64" +version = "0.1.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2" +checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = 
[ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "atomic-polyfill" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d299f547288d6db8d5c3a2916f7b2f66134b15b8c1ac1c4357dd3b8752af7bb2" +checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289" dependencies = [ "critical-section", ] @@ -187,13 +237,13 @@ dependencies = [ "aws-http", "aws-sdk-sso", "aws-sdk-sts", - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-http-tower", + "aws-smithy-async 0.51.0", + "aws-smithy-client 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", "aws-smithy-json", - "aws-smithy-types", - "aws-types", + "aws-smithy-types 0.51.0", + "aws-types 0.51.0", "bytes", "hex", "http", @@ -206,15 +256,29 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-credential-types" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4232d3729eefc287adc0d5a8adc97b7d94eefffe6bbe94312cc86c7ab6b06ce" +dependencies = [ + "aws-smithy-async 0.55.1", + "aws-smithy-types 0.55.1", + "fastrand", + "tokio", + "tracing", + "zeroize", +] + [[package]] name = "aws-endpoint" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ca8f374874f6459aaa88dc861d7f5d834ca1ff97668eae190e97266b5f6c3fb" dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "aws-types", + "aws-smithy-http 0.51.0", + "aws-smithy-types 0.51.0", + "aws-types 0.51.0", "http", "regex", "tracing", @@ -226,9 +290,9 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78d41e19e779b73463f5f0c21b3aacc995f4ba783ab13a7ae9f5dfb159a551b4" dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "aws-types", + "aws-smithy-http 0.51.0", + "aws-smithy-types 0.51.0", + "aws-types 0.51.0", "bytes", "http", "http-body", @@ -248,15 +312,15 @@ dependencies = [ 
"aws-http", "aws-sig-auth", "aws-sigv4", - "aws-smithy-async", + "aws-smithy-async 0.51.0", "aws-smithy-checksums", - "aws-smithy-client", + "aws-smithy-client 0.51.0", "aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-http-tower", - "aws-smithy-types", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", + "aws-smithy-types 0.51.0", "aws-smithy-xml", - "aws-types", + "aws-types 0.51.0", "bytes", "bytes-utils", "http", @@ -275,13 +339,13 @@ dependencies = [ "aws-endpoint", "aws-http", "aws-sig-auth", - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-http-tower", + "aws-smithy-async 0.51.0", + "aws-smithy-client 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", "aws-smithy-json", - "aws-smithy-types", - "aws-types", + "aws-smithy-types 0.51.0", + "aws-types 0.51.0", "bytes", "http", "tokio-stream", @@ -297,14 +361,14 @@ dependencies = [ "aws-endpoint", "aws-http", "aws-sig-auth", - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-http-tower", + "aws-smithy-async 0.51.0", + "aws-smithy-client 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", "aws-smithy-query", - "aws-smithy-types", + "aws-smithy-types 0.51.0", "aws-smithy-xml", - "aws-types", + "aws-types 0.51.0", "bytes", "http", "tower", @@ -318,20 +382,20 @@ checksum = "12cbe7b2be9e185c1fbce27fc9c41c66b195b32d89aa099f98768d9544221308" dependencies = [ "aws-sigv4", "aws-smithy-eventstream", - "aws-smithy-http", - "aws-types", + "aws-smithy-http 0.51.0", + "aws-types 0.51.0", "http", "tracing", ] [[package]] name = "aws-sigv4" -version = "0.51.0" +version = "0.51.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ff4cff8c4a101962d593ba94e72cd83891aecd423f0c6e3146bff6fb92c9e3" +checksum = "5c0b2658d2cb66dbf02f0e8dee80810ef1e0ca3530ede463e0ef994c301087d1" dependencies = [ "aws-smithy-eventstream", - "aws-smithy-http", + "aws-smithy-http 0.51.0", "bytes", 
"form_urlencoded", "hex", @@ -356,14 +420,26 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "aws-smithy-async" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88573bcfbe1dcfd54d4912846df028b42d6255cbf9ce07be216b1bbfd11fc4b9" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", + "tokio-stream", +] + [[package]] name = "aws-smithy-checksums" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc227e36e346f45298288359f37123e1a92628d1cec6b11b5eb335553278bd9e" dependencies = [ - "aws-smithy-http", - "aws-smithy-types", + "aws-smithy-http 0.51.0", + "aws-smithy-types 0.51.0", "bytes", "crc32c", "crc32fast", @@ -383,10 +459,10 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff28d553714f8f54cd921227934fc13a536a1c03f106e56b362fd57e16d450ad" dependencies = [ - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-http-tower", - "aws-smithy-types", + "aws-smithy-async 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-http-tower 0.51.0", + "aws-smithy-types 0.51.0", "bytes", "fastrand", "http", @@ -400,13 +476,33 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-smithy-client" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2f52352bae50d3337d5d6151b695d31a8c10ebea113eca5bead531f8301b067" +dependencies = [ + "aws-smithy-async 0.55.1", + "aws-smithy-http 0.55.1", + "aws-smithy-http-tower 0.55.1", + "aws-smithy-types 0.55.1", + "bytes", + "fastrand", + "http", + "http-body", + "pin-project-lite", + "tokio", + "tower", + "tracing", +] + [[package]] name = "aws-smithy-eventstream" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7ea0df7161ce65b5c8ca6eb709a1a907376fa18226976e41c748ce02ccccf24" dependencies = [ - "aws-smithy-types", + "aws-smithy-types 0.51.0", "bytes", "crc32fast", ] @@ -418,7 +514,7 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf58ed4fefa61dbf038e5421a521cbc2c448ef69deff0ab1d915d8a10eda5664" dependencies = [ "aws-smithy-eventstream", - "aws-smithy-types", + "aws-smithy-types 0.51.0", "bytes", "bytes-utils", "futures-core", @@ -434,13 +530,49 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-smithy-http" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03bcc02d7ed9649d855c8ce4a735e9848d7b8f7568aad0504c158e3baa955df8" +dependencies = [ + "aws-smithy-types 0.55.1", + "bytes", + "bytes-utils", + "futures-core", + "http", + "http-body", + "hyper", + "once_cell", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + [[package]] name = "aws-smithy-http-tower" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20c96d7bd35e7cf96aca1134b2f81b1b59ffe493f7c6539c051791cbbf7a42d3" dependencies = [ - "aws-smithy-http", + "aws-smithy-http 0.51.0", + "bytes", + "http", + "http-body", + "pin-project-lite", + "tower", + "tracing", +] + +[[package]] +name = "aws-smithy-http-tower" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da88b3a860f65505996c29192d800f1aeb9480440f56d63aad33a3c12045017a" +dependencies = [ + "aws-smithy-http 0.55.1", + "aws-smithy-types 0.55.1", "bytes", "http", "http-body", @@ -455,7 +587,7 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8324ba98c8a94187723cc16c37aefa09504646ee65c3d2c3af495bab5ea701b" dependencies = [ - "aws-smithy-types", + "aws-smithy-types 0.51.0", ] [[package]] @@ -464,7 +596,7 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83834ed2ff69ea6f6657baf205267dc2c0abe940703503a3e5d60ce23be3d306" dependencies = [ - "aws-smithy-types", + "aws-smithy-types 0.51.0", "urlencoding", ] @@ -480,6 +612,19 @@ dependencies = [ 
"time", ] +[[package]] +name = "aws-smithy-types" +version = "0.55.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd0afc731fd1417d791f9145a1e0c30e23ae0beaab9b4814017708ead2fc20f1" +dependencies = [ + "base64-simd", + "itoa", + "num-integer", + "ryu", + "time", +] + [[package]] name = "aws-smithy-xml" version = "0.51.0" @@ -495,10 +640,10 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05701d32da168b44f7ee63147781aed8723e792cc131cb9b18363b5393f17f70" dependencies = [ - "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", - "aws-smithy-types", + "aws-smithy-async 0.51.0", + "aws-smithy-client 0.51.0", + "aws-smithy-http 0.51.0", + "aws-smithy-types 0.51.0", "http", "rustc_version", "tracing", @@ -506,10 +651,26 @@ dependencies = [ ] [[package]] -name = "axum" -version = "0.6.4" +name = "aws-types" +version = "0.55.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5694b64066a2459918d8074c2ce0d5a88f409431994c2356617c8ae0c4721fc" +checksum = "b9b082e329d9a304d39e193ad5c7ab363a0d6507aca6965e0673a746686fb0cc" +dependencies = [ + "aws-credential-types", + "aws-smithy-async 0.55.1", + "aws-smithy-client 0.55.1", + "aws-smithy-http 0.55.1", + "aws-smithy-types 0.55.1", + "http", + "rustc_version", + "tracing", +] + +[[package]] +name = "axum" +version = "0.6.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62" dependencies = [ "async-trait", "axum-core", @@ -529,16 +690,15 @@ dependencies = [ "serde", "sync_wrapper", "tower", - "tower-http", "tower-layer", "tower-service", ] [[package]] name = "axum-core" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cae3e661676ffbacb30f1a824089a8c9150e71017f7e1e38f2aa32009188d34" +checksum = 
"759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" dependencies = [ "async-trait", "bytes", @@ -584,6 +744,16 @@ version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + [[package]] name = "bincode" version = "1.3.3" @@ -595,9 +765,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.61.0" +version = "0.65.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a" +checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" dependencies = [ "bitflags", "cexpr", @@ -606,12 +776,13 @@ dependencies = [ "lazycell", "log", "peeking_take_while", + "prettyplease 0.2.4", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn", + "syn 2.0.15", "which", ] @@ -623,18 +794,18 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "block-buffer" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] [[package]] name = "bstr" -version = "1.2.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f0778972c64420fdedc63f09919c8a88bda7b25135357fd25a5d9f3257e832" +checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" dependencies = [ "memchr", "once_cell", @@ -702,9 +873,9 @@ checksum = 
"baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.23" +version = "0.4.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" dependencies = [ "iana-time-zone", "num-integer", @@ -742,9 +913,9 @@ dependencies = [ [[package]] name = "clang-sys" -version = "1.4.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" dependencies = [ "glob", "libc", @@ -765,30 +936,38 @@ dependencies = [ [[package]] name = "clap" -version = "4.1.4" +version = "4.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76" +checksum = "9b802d85aaf3a1cdb02b224ba472ebdea62014fccfcb269b95a4d76443b5ee5a" dependencies = [ - "bitflags", + "clap_builder", "clap_derive", - "clap_lex 0.3.1", - "is-terminal", "once_cell", +] + +[[package]] +name = "clap_builder" +version = "4.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14a1a858f532119338887a4b8e1af9c60de8249cd7bafd68036a489e261e37b6" +dependencies = [ + "anstream", + "anstyle", + "bitflags", + "clap_lex 0.4.1", "strsim", - "termcolor", ] [[package]] name = "clap_derive" -version = "4.1.0" +version = "4.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" +checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" dependencies = [ "heck", - "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] @@ -802,12 +981,9 @@ dependencies = [ [[package]] name = "clap_lex" 
-version = "0.3.1" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" -dependencies = [ - "os_str_bytes", -] +checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" [[package]] name = "close_fds" @@ -829,6 +1005,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + [[package]] name = "comfy-table" version = "6.1.4" @@ -859,7 +1041,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.1.4", + "clap 4.2.2", "compute_api", "futures", "hyper", @@ -921,7 +1103,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.2", "comfy-table", "git-version", "nix", @@ -957,15 +1139,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" dependencies = [ "libc", ] @@ -1032,9 +1214,9 @@ checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" [[package]] name = "crossbeam-channel" -version = "0.5.6" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +checksum = 
"a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1042,9 +1224,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -1053,22 +1235,22 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.13" +version = "0.9.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" +checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.7.1", + "memoffset 0.8.0", "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.14" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" +checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" dependencies = [ "cfg-if", ] @@ -1110,9 +1292,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9" +checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" dependencies = [ "cc", "cxxbridge-flags", @@ -1122,9 +1304,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d" +checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" dependencies = [ 
"cc", "codespan-reporting", @@ -1132,31 +1314,31 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn", + "syn 2.0.15", ] [[package]] name = "cxxbridge-flags" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a" +checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" [[package]] name = "cxxbridge-macro" -version = "1.0.89" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2" +checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "darling" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" dependencies = [ "darling_core", "darling_macro", @@ -1164,27 +1346,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn", + "syn 1.0.109", ] [[package]] name = "darling_macro" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" dependencies = [ "darling_core", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1218,9 +1400,9 @@ dependencies = [ 
[[package]] name = "der-parser" -version = "8.1.0" +version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d4bc9b0db0a0df9ae64634ac5bdefb7afcb534e182275ca0beadbe486701c1" +checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e" dependencies = [ "asn1-rs", "displaydoc", @@ -1249,7 +1431,7 @@ checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1269,9 +1451,9 @@ dependencies = [ [[package]] name = "enum-map" -version = "2.4.2" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c25992259941eb7e57b936157961b217a4fc8597829ddef0596d6c3cd86e1a" +checksum = "988f0d17a0fa38291e5f41f71ea8d46a5d5497b9054d5a759fae2cbb819f2356" dependencies = [ "enum-map-derive", ] @@ -1284,7 +1466,7 @@ checksum = "2a4da76b3b6116d758c7ba93f7ec6a35d2e2cf24feda76c6e38a375f4d5c59f2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1305,7 +1487,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1323,13 +1505,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.2.8" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -1361,23 +1543,23 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" 
dependencies = [ "instant", ] [[package]] name = "filetime" -version = "0.2.19" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e884668cd0c7480504233e951174ddc3b382f7c2666e3b7310b5c4e7b0c37f9" +checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" dependencies = [ "cfg-if", "libc", - "redox_syscall", - "windows-sys 0.42.0", + "redox_syscall 0.2.16", + "windows-sys 0.48.0", ] [[package]] @@ -1422,9 +1604,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" dependencies = [ "futures-channel", "futures-core", @@ -1437,9 +1619,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", "futures-sink", @@ -1447,15 +1629,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" [[package]] name = "futures-executor" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" +checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" dependencies = [ "futures-core", "futures-task", @@ -1464,32 +1646,32 @@ dependencies = [ [[package]] name = "futures-io" 
-version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" [[package]] name = "futures-macro" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "futures-sink" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" [[package]] name = "futures-task" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" [[package]] name = "futures-timer" @@ -1499,9 +1681,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-channel", "futures-core", @@ -1517,9 +1699,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.6" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +checksum = 
"85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -1527,20 +1709,22 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] name = "gimli" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "221996f774192f0f718773def8201c4ae31f02616a54ccfc2d358bb0e5cefdec" +checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" [[package]] name = "git-version" @@ -1561,7 +1745,7 @@ dependencies = [ "proc-macro-hack", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1639,7 +1823,7 @@ dependencies = [ "atomic-polyfill", "hash32", "rustc_version", - "spin 0.9.4", + "spin 0.9.8", "stable_deref_trait", ] @@ -1667,6 +1851,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + [[package]] name = "hex" version = "0.4.3" @@ -1678,9 +1868,9 @@ dependencies = [ [[package]] name = "hex-literal" -version = "0.3.4" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" +checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" [[package]] name = "hmac" @@ -1704,9 +1894,9 @@ dependencies = [ [[package]] name = "http" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" +checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" dependencies = [ "bytes", "fnv", @@ -1724,12 +1914,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "http-range-header" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" - [[package]] name = "httparse" version = "1.8.0" @@ -1760,9 +1944,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.23" +version = "0.14.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" +checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4" dependencies = [ "bytes", "futures-channel", @@ -1775,7 +1959,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2", + "socket2 0.4.9", "tokio", "tower-service", "tracing", @@ -1791,10 +1975,10 @@ dependencies = [ "http", "hyper", "log", - "rustls", + "rustls 0.20.8", "rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] @@ -1824,16 +2008,16 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.53" +version = "0.1.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "winapi", + "windows", ] [[package]] @@ -1864,9 +2048,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.2" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" +checksum = 
"bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown 0.12.3", @@ -1904,30 +2088,31 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.4" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" +checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" dependencies = [ + "hermit-abi 0.3.1", "libc", - "windows-sys 0.42.0", + "windows-sys 0.48.0", ] [[package]] name = "ipnet" -version = "2.7.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" +checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" [[package]] name = "is-terminal" -version = "0.4.2" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi 0.3.1", "io-lifetimes", - "rustix", - "windows-sys 0.42.0", + "rustix 0.37.11", + "windows-sys 0.48.0", ] [[package]] @@ -1941,9 +2126,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" +checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "js-sys" @@ -1956,11 +2141,11 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.2.0" +version = "8.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" +checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" 
dependencies = [ - "base64 0.13.1", + "base64 0.21.0", "pem", "ring", "serde", @@ -2002,9 +2187,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" [[package]] name = "libloading" @@ -2031,6 +2216,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +[[package]] +name = "linux-raw-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f" + [[package]] name = "lock_api" version = "0.4.9" @@ -2123,9 +2314,9 @@ dependencies = [ [[package]] name = "mime" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "mime_guess" @@ -2145,23 +2336,23 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.6.4" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2e212582ede878b109755efd0773a4f0f4ec851584cf0aefbeb4d9ecc114822" +checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" +checksum = 
"5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" dependencies = [ "libc", "log", "wasi", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -2194,15 +2385,6 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "nom8" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae01545c9c7fc4486ab7debaf2aad7003ac19431791868fb2e8066df97fad2f8" -dependencies = [ - "memchr", -] - [[package]] name = "notify" version = "5.1.0" @@ -2291,9 +2473,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.0" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "oorandom" @@ -2358,8 +2540,8 @@ dependencies = [ "futures-util", "opentelemetry", "prost", - "tonic", - "tonic-build", + "tonic 0.8.3", + "tonic-build 0.8.4", ] [[package]] @@ -2411,9 +2593,9 @@ dependencies = [ [[package]] name = "os_info" -version = "3.6.0" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c424bc68d15e0778838ac013b5b3449544d8133633d8016319e7e05a820b8c0" +checksum = "006e42d5b888366f1880eda20371fedde764ed2213dc8496f49622fa0c99cd5e" dependencies = [ "log", "serde", @@ -2422,9 +2604,15 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.4.1" +version = "6.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" + +[[package]] +name = "outref" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" [[package]] name = "overload" @@ -2442,7 +2630,7 @@ dependencies = [ 
"byteorder", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.2", "close_fds", "const_format", "consumption_metrics", @@ -2541,7 +2729,7 @@ checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", "windows-sys 0.45.0", ] @@ -2569,9 +2757,9 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "petgraph" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" +checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" dependencies = [ "fixedbitset", "indexmap", @@ -2612,7 +2800,7 @@ checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -2706,14 +2894,14 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls", + "rustls 0.20.8", "rustls-pemfile", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.23.4", "tracing", "workspace_hack", ] @@ -2779,36 +2967,22 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.23" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e97e3215779627f01ee256d2fad52f3d95e8e1c11e9fc6fd08f7cd455d5d5c78" +checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" dependencies = [ "proc-macro2", - "syn", + "syn 1.0.109", ] [[package]] -name = "proc-macro-error" -version = "1.0.4" +name = "prettyplease" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn", - "version_check", -] - -[[package]] -name = 
"proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" dependencies = [ "proc-macro2", - "quote", - "version_check", + "syn 2.0.15", ] [[package]] @@ -2819,9 +2993,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.50" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" +checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" dependencies = [ "unicode-ident", ] @@ -2836,7 +3010,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix", + "rustix 0.36.12", ] [[package]] @@ -2857,9 +3031,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.11.6" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21dc42e00223fc37204bd4aa177e69420c604ca4a183209a8f9de30c6d934698" +checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" dependencies = [ "bytes", "prost-derive", @@ -2867,9 +3041,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.11.6" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f8ad728fb08fe212df3c05169e940fbb6d9d16a877ddde14644a983ba2012e" +checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck", @@ -2878,35 +3052,34 @@ dependencies = [ "log", "multimap", "petgraph", - "prettyplease", + "prettyplease 0.1.25", "prost", "prost-types", "regex", - "syn", + "syn 1.0.109", "tempfile", "which", ] [[package]] name = "prost-derive" -version = "0.11.6" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8bda8c0881ea9f722eb9629376db3d0b903b462477c1aafcb0566610ac28ac5d" +checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", "itertools", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] name = "prost-types" -version = "0.11.6" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e0526209433e96d83d750dd81a99118edbc55739e7e61a46764fd2ad537788" +checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" dependencies = [ - "bytes", "prost", ] @@ -2921,7 +3094,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.2", "consumption_metrics", "futures", "git-version", @@ -2951,20 +3124,20 @@ dependencies = [ "reqwest-tracing", "routerify", "rstest", - "rustls", + "rustls 0.20.8", "rustls-pemfile", "scopeguard", "serde", "serde_json", "sha2", - "socket2", + "socket2 0.5.2", "sync_wrapper", "thiserror", "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.23.4", "tokio-util", "tracing", "tracing-opentelemetry", @@ -2973,16 +3146,16 @@ dependencies = [ "url", "utils", "uuid", - "webpki-roots", + "webpki-roots 0.23.0", "workspace_hack", "x509-parser", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" dependencies = [ "proc-macro2", ] @@ -3019,9 +3192,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" dependencies = [ "either", "rayon-core", @@ -3029,9 +3202,9 @@ dependencies = [ 
[[package]] name = "rayon-core" -version = "1.10.2" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ -3061,10 +3234,19 @@ dependencies = [ ] [[package]] -name = "regex" -version = "1.7.1" +name = "redox_syscall" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" dependencies = [ "aho-corasick", "memchr", @@ -3082,9 +3264,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "remote_storage" @@ -3094,8 +3276,8 @@ dependencies = [ "async-trait", "aws-config", "aws-sdk-s3", - "aws-smithy-http", - "aws-types", + "aws-smithy-http 0.51.0", + "aws-types 0.55.1", "hyper", "metrics", "once_cell", @@ -3114,9 +3296,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.14" +version = "0.11.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21eed90ec8570952d53b772ecf8f206aa1ec9a3d76b2521c56c42973f2d91ee9" +checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" dependencies = [ "base64 0.21.0", "bytes", @@ -3136,27 +3318,27 @@ dependencies = [ "once_cell", 
"percent-encoding", "pin-project-lite", - "rustls", + "rustls 0.20.8", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "webpki-roots", + "webpki-roots 0.22.6", "winreg", ] [[package]] name = "reqwest-middleware" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894" +checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b" dependencies = [ "anyhow", "async-trait", @@ -3169,11 +3351,12 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce" +checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8" dependencies = [ "async-trait", + "getrandom", "opentelemetry", "reqwest", "reqwest-middleware", @@ -3212,18 +3395,18 @@ dependencies = [ [[package]] name = "rpds" -version = "0.12.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" +checksum = "9bd6ce569b15c331b1e5fd8cf6adb0bf240678b5f0cdc4d0f41e11683f6feba9" dependencies = [ "archery", ] [[package]] name = "rstest" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07f2d176c472198ec1e6551dc7da28f1c089652f66a7b722676c2238ebc0edf" +checksum = "de1bb486a691878cd320c2f0d319ba91eeaa2e894066d8b5f8f117c000e9d962" dependencies = [ "futures", "futures-timer", @@ -3233,23 +3416,23 @@ dependencies = [ [[package]] name = "rstest_macros" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7229b505ae0706e64f37ffc54a9c163e11022a6636d58fe1f3f52018257ff9f7" +checksum = "290ca1a1c8ca7edb7c3283bd44dc35dd54fdec6253a3912e201ba1072018fca8" dependencies = [ "cfg-if", "proc-macro2", "quote", "rustc_version", - "syn", + "syn 1.0.109", "unicode-ident", ] [[package]] name = "rustc-demangle" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" +checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b" [[package]] name = "rustc-hash" @@ -3277,16 +3460,30 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.7" +version = "0.36.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" +checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", - "linux-raw-sys", - "windows-sys 0.42.0", + "linux-raw-sys 0.1.4", + "windows-sys 0.45.0", +] + +[[package]] +name = "rustix" +version = "0.37.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys 0.3.1", + "windows-sys 0.48.0", ] [[package]] @@ -3301,6 +3498,18 @@ dependencies = [ "webpki", ] +[[package]] +name = "rustls" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -3323,16 +3532,26 @@ dependencies = [ ] [[package]] -name = "rustversion" -version = "1.0.11" +name = "rustls-webpki" +version = "0.100.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70" +checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" [[package]] name = "ryu" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" +checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" [[package]] name = "safekeeper" @@ -3344,7 +3563,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.2", "const_format", "crc32c", "fs2", @@ -3417,9 +3636,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "scratch" -version = "1.0.3" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" +checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" [[package]] name = "sct" @@ -3456,33 +3675,33 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" +checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6097dc270a9c4555c5d6222ed243eaa97ff38e29299ed7c5cb36099033c604e" +checksum = "b5ce6d3512e2617c209ec1e86b0ca2fea06454cd34653c91092bf0f3ec41f8e3" dependencies = [ "httpdate", "reqwest", - "rustls", + "rustls 0.20.8", "sentry-backtrace", 
"sentry-contexts", "sentry-core", "sentry-panic", "tokio", "ureq", - "webpki-roots", + "webpki-roots 0.22.6", ] [[package]] name = "sentry-backtrace" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d92d1e4d591534ae4f872d6142f3b500f4ffc179a6aed8a3e86c7cc96d10a6a" +checksum = "0e7fe408d4d1f8de188a9309916e02e129cbe51ca19e55badea5a64899399b1a" dependencies = [ "backtrace", "once_cell", @@ -3492,9 +3711,9 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3afa877b1898ff67dd9878cf4bec4e53cef7d3be9f14b1fc9e4fcdf36f8e4259" +checksum = "5695096a059a89973ec541062d331ff4c9aeef9c2951416c894f0fff76340e7d" dependencies = [ "hostname", "libc", @@ -3506,9 +3725,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc43eb7e4e3a444151a0fe8a0e9ce60eabd905dae33d66e257fa26f1b509c1bd" +checksum = "5b22828bfd118a7b660cf7a155002a494755c0424cebb7061e4743ecde9c7dbc" dependencies = [ "once_cell", "rand", @@ -3519,9 +3738,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccab4fab11e3e63c45f4524bee2e75cde39cdf164cb0b0cbe6ccd1948ceddf66" +checksum = "1f4ced2a7a8c14899d58eec402d946f69d5ed26a3fc363a7e8b1e5cb88473a01" dependencies = [ "sentry-backtrace", "sentry-core", @@ -3529,9 +3748,9 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.29.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63708ec450b6bdcb657af760c447416d69c38ce421f34e5e2e9ce8118410bc7" +checksum = "360ee3270f7a4a1eee6c667f7d38360b995431598a73b740dfe420da548d9cc9" dependencies = [ "debugid", "getrandom", @@ -3546,35 +3765,44 @@ 
dependencies = [ [[package]] name = "serde" -version = "1.0.152" +version = "1.0.160" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" +checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.152" +version = "1.0.160" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" +checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "serde_json" -version = "1.0.91" +version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" +checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" dependencies = [ "itoa", "ryu", "serde", ] +[[package]] +name = "serde_spanned" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -3589,9 +3817,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.2.0" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30d904179146de381af4c93d3af6ca4984b3152db687dacb9c3c35e86f39809c" +checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0" dependencies = [ "base64 0.13.1", "chrono", @@ -3605,14 +3833,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.2.0" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1966009f3c05f095697c537312f5415d1e3ed31ce0a56942bac4c771c5c335e" +checksum = 
"859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c" dependencies = [ "darling", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -3654,9 +3882,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "signal-hook" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" +checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" dependencies = [ "libc", "signal-hook-registry", @@ -3675,9 +3903,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" dependencies = [ "libc", ] @@ -3702,9 +3930,9 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" dependencies = [ "autocfg", ] @@ -3717,14 +3945,24 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" dependencies = [ "libc", "winapi", ] +[[package]] +name = "socket2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + [[package]] name = "spin" version = "0.5.2" @@ -3733,9 +3971,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "spin" -version = "0.9.4" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" dependencies = [ "lock_api", ] @@ -3759,7 +3997,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.1.4", + "clap 4.2.2", "const_format", "futures", "futures-core", @@ -3773,8 +4011,8 @@ dependencies = [ "prost", "tokio", "tokio-stream", - "tonic", - "tonic-build", + "tonic 0.9.1", + "tonic-build 0.9.1", "tracing", "utils", "workspace_hack", @@ -3812,7 +4050,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 1.0.109", ] [[package]] @@ -3829,9 +4067,20 @@ checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" [[package]] name = "syn" -version = "1.0.107" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" dependencies = [ "proc-macro2", "quote", @@ -3852,7 +4101,7 @@ checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "unicode-xid", ] @@ -3869,24 +4118,24 @@ dependencies = [ [[package]] name = "task-local-extensions" 
-version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4" +checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8" dependencies = [ - "tokio", + "pin-utils", ] [[package]] name = "tempfile" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", - "rustix", - "windows-sys 0.42.0", + "redox_syscall 0.3.5", + "rustix 0.37.11", + "windows-sys 0.45.0", ] [[package]] @@ -3926,7 +4175,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" dependencies = [ "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -3937,38 +4186,39 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] name = "thread_local" -version = "1.1.4" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ + "cfg-if", "once_cell", ] [[package]] name = "time" -version = "0.3.17" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" +checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" dependencies = [ "itoa", "serde", @@ -3984,9 +4234,9 @@ checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" [[package]] name = "time-macros" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" +checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" dependencies = [ "time-core", ] @@ -4012,9 +4262,9 @@ dependencies = [ [[package]] name = "tinyvec_macros" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tls-listener" @@ -4027,26 +4277,25 @@ dependencies = [ "pin-project-lite", "thiserror", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] name = "tokio" -version = "1.25.0" +version = "1.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e00990ebabbe4c14c08aca901caed183ecd5c09562a12c824bb53d3c3fd3af" +checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001" dependencies = [ "autocfg", "bytes", "libc", - "memchr", "mio", "num_cpus", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.4.9", "tokio-macros", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -4061,13 +4310,13 @@ dependencies = [ 
[[package]] name = "tokio-macros" -version = "1.8.2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.15", ] [[package]] @@ -4088,7 +4337,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "socket2", + "socket2 0.4.9", "tokio", "tokio-util", ] @@ -4101,10 +4350,10 @@ checksum = "606f2b73660439474394432239c82249c0d45eb5f23d91f401be1e33590444a7" dependencies = [ "futures", "ring", - "rustls", + "rustls 0.20.8", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.23.4", ] [[package]] @@ -4113,16 +4362,26 @@ version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" dependencies = [ - "rustls", + "rustls 0.20.8", "tokio", "webpki", ] [[package]] -name = "tokio-stream" -version = "0.1.11" +name = "tokio-rustls" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" +checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" +dependencies = [ + "rustls 0.21.0", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" dependencies = [ "futures-core", "pin-project-lite", @@ -4137,7 +4396,7 @@ dependencies = [ "filetime", "futures-core", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "tokio", "tokio-stream", "xattr", @@ -4157,9 +4416,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.4" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740" +checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" dependencies = [ "bytes", "futures-core", @@ -4171,33 +4430,36 @@ dependencies = [ [[package]] name = "toml" -version = "0.5.11" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" dependencies = [ "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", ] [[package]] name = "toml_datetime" -version = "0.5.1" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4553f467ac8e3d374bc9a177a26801e5d0f9b211aa1673fb137a403afd1c9cf5" +checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.17.1" +version = "0.19.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34cc558345efd7e88b9eda9626df2138b80bb46a7606f695e751c892bc7dac6" +checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13" dependencies = [ "indexmap", - "itertools", - "nom8", "serde", + "serde_spanned", "toml_datetime", + "winnow", ] [[package]] @@ -4222,10 +4484,7 @@ dependencies = [ "pin-project", "prost", "prost-derive", - "rustls-native-certs", - "rustls-pemfile", "tokio", - "tokio-rustls", "tokio-stream", "tokio-util", "tower", @@ -4235,17 +4494,62 @@ dependencies = [ "tracing-futures", ] +[[package]] +name = "tonic" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64 0.21.0", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + 
"percent-encoding", + "pin-project", + "prost", + "rustls-native-certs", + "rustls-pemfile", + "tokio", + "tokio-rustls 0.24.0", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic-build" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" dependencies = [ - "prettyplease", + "prettyplease 0.1.25", "proc-macro2", "prost-build", "quote", - "syn", + "syn 1.0.109", +] + +[[package]] +name = "tonic-build" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7" +dependencies = [ + "prettyplease 0.1.25", + "proc-macro2", + "prost-build", + "quote", + "syn 1.0.109", ] [[package]] @@ -4268,25 +4572,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "tower-http" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858" -dependencies = [ - "bitflags", - "bytes", - "futures-core", - "futures-util", - "http", - "http-body", - "http-range-header", - "pin-project-lite", - "tower", - "tower-layer", - "tower-service", -] - [[package]] name = "tower-layer" version = "0.3.2" @@ -4304,7 +4589,7 @@ name = "trace" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.2", "pageserver_api", "utils", "workspace_hack", @@ -4331,7 +4616,7 @@ checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -4477,15 +4762,15 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.10" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" +checksum = 
"92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.6" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" [[package]] name = "unicode-normalization" @@ -4523,10 +4808,10 @@ dependencies = [ "base64 0.13.1", "log", "once_cell", - "rustls", + "rustls 0.20.8", "url", "webpki", - "webpki-roots", + "webpki-roots 0.22.6", ] [[package]] @@ -4553,6 +4838,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + [[package]] name = "utils" version = "0.1.0" @@ -4596,9 +4887,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" +checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb" dependencies = [ "getrandom", "serde", @@ -4616,12 +4907,18 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.1.4", + "clap 4.2.2", "env_logger", "log", "once_cell", @@ -4633,12 +4930,11 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.2" 
+version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" dependencies = [ "same-file", - "winapi", "winapi-util", ] @@ -4679,7 +4975,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 1.0.109", "wasm-bindgen-shared", ] @@ -4713,7 +5009,7 @@ checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4753,6 +5049,15 @@ dependencies = [ "webpki", ] +[[package]] +name = "webpki-roots" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa54963694b65584e170cf5dc46aeb4dcaa5584e652ff5f3952e56d66aff0125" +dependencies = [ + "rustls-webpki", +] + [[package]] name = "which" version = "4.4.0" @@ -4795,19 +5100,28 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.0", +] + [[package]] name = "windows-sys" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", 
+ "windows_x86_64_msvc 0.42.2", ] [[package]] @@ -4816,65 +5130,140 @@ version = "0.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" dependencies = [ - "windows-targets", + "windows-targets 0.42.2", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.0", ] [[package]] name = "windows-targets" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" +checksum = 
"597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_i686_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_x86_64_gnu" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_msvc" -version = "0.42.1" +version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "winnow" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28" +dependencies = [ + "memchr", +] [[package]] name = "winreg" @@ -4893,7 +5282,8 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.1.4", + "clap 4.2.2", + "clap_builder", "crossbeam-utils", "digest", "either", @@ -4905,7 +5295,6 @@ dependencies = [ "futures-sink", "futures-util", "hashbrown 0.12.3", - "indexmap", "itertools", "libc", "log", @@ -4920,16 +5309,18 @@ dependencies = [ "regex-syntax", 
"reqwest", "ring", - "rustls", + "rustls 0.20.8", "scopeguard", "serde", "serde_json", - "socket2", - "syn", + "socket2 0.4.9", + "syn 1.0.109", + "syn 2.0.15", "tokio", - "tokio-rustls", + "tokio-rustls 0.23.4", "tokio-util", - "tonic", + "toml_datetime", + "toml_edit", "tower", "tracing", "tracing-core", @@ -4939,12 +5330,11 @@ dependencies = [ [[package]] name = "x509-parser" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" +checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634" dependencies = [ "asn1-rs", - "base64 0.13.1", "data-encoding", "der-parser", "lazy_static", @@ -4972,15 +5362,15 @@ checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" [[package]] name = "yasna" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed2e7a52e3744ab4d0c05c20aa065258e84c49fd4226f5191b2ed29712710b4" +checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd" dependencies = [ "time", ] [[package]] name = "zeroize" -version = "1.5.7" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f" +checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" diff --git a/Cargo.toml b/Cargo.toml index 679605dc1d..0b545e6190 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ atty = "0.2.14" aws-config = { version = "0.51.0", default-features = false, features=["rustls"] } aws-sdk-s3 = "0.21.0" aws-smithy-http = "0.51.0" -aws-types = "0.51.0" +aws-types = "0.55" base64 = "0.13.0" bincode = "1.3" -bindgen = "0.61" +bindgen = "0.65" bstr = "1.0" byteorder = "1.4" bytes = "1.0" @@ -50,7 +50,7 @@ git-version = "0.3" hashbrown = "0.13" hashlink = "0.8.1" hex = "0.4" -hex-literal = "0.3" +hex-literal 
= "0.4" hmac = "0.12.1" hostname = "0.3.1" humantime = "2.1" @@ -80,18 +80,18 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls" reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } reqwest-middleware = "0.2.0" routerify = "3" -rpds = "0.12.0" +rpds = "0.13" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" scopeguard = "1.1" -sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "2.0" sha2 = "0.10.2" signal-hook = "0.3" -socket2 = "0.4.4" +socket2 = "0.5" strum = "0.24" strum_macros = "0.24" svg_fmt = "0.4.1" @@ -106,17 +106,17 @@ tokio-postgres-rustls = "0.9.0" tokio-rustls = "0.23" tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["io"] } -toml = "0.5" -toml_edit = { version = "0.17", features = ["easy"] } -tonic = {version = "0.8", features = ["tls", "tls-roots"]} +toml = "0.7" +toml_edit = "0.19" +tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2" uuid = { version = "1.2", features = ["v4", "serde"] } walkdir = "2.3.2" -webpki-roots = "0.22.5" -x509-parser = "0.14" +webpki-roots = "0.23" +x509-parser = "0.15" ## TODO replace this with tracing env_logger = "0.10" @@ -154,9 +154,9 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.4" rcgen = "0.10" -rstest = "0.16" +rstest = "0.17" tempfile = "3.4" -tonic-build = "0.8" +tonic-build = "0.9" # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index f26aa2fbc5..3f290821c2 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -4,13 +4,12 @@ version = "0.1.0" edition = "2021" license = "Apache-2.0" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] -anyhow = "1.0.68" -chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } -rand = "0.8.3" -serde = "1.0.152" -serde_with = "2.1.0" -utils = { version = "0.1.0", path = "../utils" } -workspace_hack = { version = "0.1.0", path = "../../workspace_hack" } +anyhow.workspace = true +chrono.workspace = true +rand.workspace = true +serde.workspace = true +serde_with.workspace = true +utils.workspace = true + +workspace_hack.workspace = true diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 66221af522..f7e39751ef 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use std::process::Command; use anyhow::{anyhow, Context}; -use bindgen::callbacks::ParseCallbacks; +use bindgen::callbacks::{DeriveInfo, ParseCallbacks}; #[derive(Debug)] struct PostgresFfiCallbacks; @@ -20,7 +20,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { // Add any custom #[derive] attributes to the data structures that bindgen // creates. - fn add_derives(&self, name: &str) -> Vec { + fn add_derives(&self, derive_info: &DeriveInfo) -> Vec { // This is the list of data structures that we want to serialize/deserialize. let serde_list = [ "XLogRecord", @@ -31,7 +31,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { "ControlFileData", ]; - if serde_list.contains(&name) { + if serde_list.contains(&derive_info.name) { vec![ "Default".into(), // Default allows us to easily fill the padding fields with 0. 
"Serialize".into(), diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs index eb52409c44..048e99d841 100644 --- a/libs/remote_storage/tests/pagination_tests.rs +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -204,12 +204,7 @@ async fn upload_s3_data( let data = format!("remote blob data {i}").into_bytes(); let data_len = data.len(); task_client - .upload( - Box::new(std::io::Cursor::new(data)), - data_len, - &blob_path, - None, - ) + .upload(std::io::Cursor::new(data), data_len, &blob_path, None) .await?; Ok::<_, anyhow::Error>((blob_prefix, blob_path)) diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 8c3d3f9063..b285c9b5b0 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -14,4 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true -workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +workspace_hack.workspace = true diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 391bc52a80..dc6326e73e 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -33,7 +33,7 @@ serde_with.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true -uuid = { version = "1.2", features = ["v4", "serde"] } +uuid.workspace = true metrics.workspace = true workspace_hack.workspace = true diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 826cf1aab3..9e341230cf 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -6,6 +6,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use remote_storage::{RemotePath, RemoteStorageConfig}; +use serde::de::IntoDeserializer; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -704,8 +705,9 @@ impl PageServerConf { "disk_usage_based_eviction" => { 
tracing::info!("disk_usage_based_eviction: {:#?}", &item); builder.disk_usage_based_eviction( - toml_edit::de::from_item(item.clone()) - .context("parse disk_usage_based_eviction")?) + deserialize_from_item("disk_usage_based_eviction", item) + .context("parse disk_usage_based_eviction")? + ) }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), @@ -806,14 +808,14 @@ impl PageServerConf { if let Some(eviction_policy) = item.get("eviction_policy") { t_conf.eviction_policy = Some( - toml_edit::de::from_item(eviction_policy.clone()) + deserialize_from_item("eviction_policy", eviction_policy) .context("parse eviction_policy")?, ); } if let Some(item) = item.get("min_resident_size_override") { t_conf.min_resident_size_override = Some( - toml_edit::de::from_item(item.clone()) + deserialize_from_item("min_resident_size_override", item) .context("parse min_resident_size_override")?, ); } @@ -920,6 +922,18 @@ where }) } +fn deserialize_from_item(name: &str, item: &Item) -> anyhow::Result +where + T: serde::de::DeserializeOwned, +{ + // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way + let deserializer = match item.clone().into_value() { + Ok(value) => value.into_deserializer(), + Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"), + }; + T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) +} + /// Configurable semaphore permits setting. 
/// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty @@ -986,9 +1000,10 @@ mod tests { use remote_storage::{RemoteStorageKind, S3Config}; use tempfile::{tempdir, TempDir}; + use utils::serde_percent::Percent; use super::*; - use crate::DEFAULT_PG_VERSION; + use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION}; const ALL_BASE_VALUES_TOML: &str = r#" # Initial configuration file created by 'pageserver --init' @@ -1286,6 +1301,71 @@ trace_read_requests = {trace_read_requests}"#, Ok(()) } + #[test] + fn eviction_pageserver_config_parse() -> anyhow::Result<()> { + let tempdir = tempdir()?; + let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + + let pageserver_conf_toml = format!( + r#"pg_distrib_dir = "{}" +metric_collection_endpoint = "http://sample.url" +metric_collection_interval = "10min" +id = 222 + +[disk_usage_based_eviction] +max_usage_pct = 80 +min_avail_bytes = 0 +period = "10s" + +[tenant_config] +evictions_low_residence_duration_metric_threshold = "20m" + +[tenant_config.eviction_policy] +kind = "LayerAccessThreshold" +period = "20m" +threshold = "20m" +"#, + pg_distrib_dir.display(), + ); + let toml: Document = pageserver_conf_toml.parse()?; + let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; + + assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); + assert_eq!( + conf.metric_collection_endpoint, + Some("http://sample.url".parse().unwrap()) + ); + assert_eq!( + conf.metric_collection_interval, + Duration::from_secs(10 * 60) + ); + assert_eq!( + conf.default_tenant_conf + .evictions_low_residence_duration_metric_threshold, + Duration::from_secs(20 * 60) + ); + assert_eq!(conf.id, NodeId(222)); + assert_eq!( + conf.disk_usage_based_eviction, + Some(DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 0, + period: Duration::from_secs(10), + #[cfg(feature = "testing")] + mock_statvfs: None, + }) + ); + match 
&conf.default_tenant_conf.eviction_policy { + EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"), + EvictionPolicy::LayerAccessThreshold(eviction_thresold) => { + assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60)); + assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60)); + } + } + + Ok(()) + } + fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c0e4a2a9cf..bd38a7a2f3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -65,7 +65,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream { // We were requested to shut down. - let msg = format!("pageserver is shutting down"); + let msg = "pageserver is shutting down".to_string(); let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)); Err(QueryError::Other(anyhow::anyhow!(msg))) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 18a4d7617b..11415b47c4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1894,7 +1894,7 @@ impl Tenant { .to_string(); // Convert the config to a toml file. 
- conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + conf_content += &toml_edit::ser::to_string(&tenant_conf)?; let mut target_config_file = VirtualFile::open_with_options( target_config_path, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c01a8aa8c0..34f57840fb 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -291,9 +291,9 @@ mod tests { ..TenantConfOpt::default() }; - let toml_form = toml_edit::easy::to_string(&small_conf).unwrap(); + let toml_form = toml_edit::ser::to_string(&small_conf).unwrap(); assert_eq!(toml_form, "gc_horizon = 42\n"); - assert_eq!(small_conf, toml_edit::easy::from_str(&toml_form).unwrap()); + assert_eq!(small_conf, toml_edit::de::from_str(&toml_form).unwrap()); let json_form = serde_json::to_string(&small_conf).unwrap(); assert_eq!(json_form, "{\"gc_horizon\":42}"); diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index ce9f4d9bf8..699121ccd9 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -74,7 +74,7 @@ pub(super) async fn upload_timeline_layer<'a>( })?; storage - .upload(Box::new(source_file), fs_size, &storage_path, None) + .upload(source_file, fs_size, &storage_path, None) .await .with_context(|| { format!( diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index d7ace28426..de7b634ba0 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -23,7 +23,6 @@ use std::convert::Infallible; use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; -use std::task::Poll; use std::time::Duration; use tokio::sync::broadcast; use tokio::sync::broadcast::error::RecvError; @@ -374,7 +373,7 @@ impl BrokerService for Broker { Ok(info) => yield info, Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += 
skipped_msg; - if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { + if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); missed_msgs = 0; diff --git a/trace/Cargo.toml b/trace/Cargo.toml index 6ced992d4c..d6eed3f49c 100644 --- a/trace/Cargo.toml +++ b/trace/Cargo.toml @@ -4,8 +4,6 @@ version = "0.1.0" edition.workspace = true license.workspace = true -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] clap.workspace = true anyhow.workspace = true diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f885f4a94d..f735ffed4c 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -18,6 +18,7 @@ byteorder = { version = "1" } bytes = { version = "1", features = ["serde"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } clap = { version = "4", features = ["derive", "string"] } +clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } digest = { version = "0.10", features = ["mac", "std"] } either = { version = "1" } @@ -29,7 +30,6 @@ futures-executor = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } hashbrown = { version = "0.12", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -52,7 +52,8 @@ socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", 
"process", "rt-multi-thread", "signal", "sync", "time"] } tokio-rustls = { version = "0.23" } tokio-util = { version = "0.7", features = ["codec", "io"] } -tonic = { version = "0.8", features = ["tls-roots"] } +toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } +toml_edit = { version = "0.19", features = ["serde"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } @@ -64,7 +65,6 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } either = { version = "1" } hashbrown = { version = "0.12", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -74,6 +74,7 @@ prost = { version = "0.11" } regex = { version = "1" } regex-syntax = { version = "0.6" } serde = { version = "1", features = ["alloc", "derive"] } -syn = { version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] } ### END HAKARI SECTION From c2496c7ef261150d5c79c46c5846a83e78d3e226 Mon Sep 17 00:00:00 2001 From: Matt Nappo Date: Fri, 14 Apr 2023 12:22:43 -0400 Subject: [PATCH 290/426] Added black_box in layer_map benches (fix #3396) --- pageserver/benches/bench_layer_map.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 4882fc518f..8f139a6596 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ 
-13,7 +13,7 @@ use std::time::Instant; use utils::lsn::Lsn; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut layer_map = LayerMap::::default(); @@ -114,7 +114,7 @@ fn bench_from_captest_env(c: &mut Criterion) { c.bench_function("captest_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + black_box(layer_map.search(q.0, q.1)); } }); }); @@ -122,11 +122,11 @@ fn bench_from_captest_env(c: &mut Criterion) { // test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs. c.bench_function("captest_rel_dir_query", |b| { b.iter(|| { - let result = layer_map.search( + let result = black_box(layer_map.search( Key::from_hex("000000067F00008000000000000000000001").unwrap(), // This LSN is higher than any of the LSNs in the tree Lsn::from_str("D0/80208AE1").unwrap(), - ); + )); result.unwrap(); }); }); @@ -183,7 +183,7 @@ fn bench_from_real_project(c: &mut Criterion) { group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + black_box(layer_map.search(q.0, q.1)); } }); }); @@ -232,7 +232,7 @@ fn bench_sequential(c: &mut Criterion) { group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1); + black_box(layer_map.search(q.0, q.1)); } }); }); From 73f34eaa5e3632f978a19e3db85e555124920651 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 17 Apr 2023 11:24:57 +0300 Subject: [PATCH 291/426] Send AppendResponse keepalive once per second (#4036) Walproposer sends AppendRequest at least once per second. This patch adds a response to these requests once per second. 
Fixes https://github.com/neondatabase/neon/issues/4017 --- safekeeper/src/receive_wal.rs | 42 +++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 61e4c5f0fa..195470e3ca 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -27,6 +27,8 @@ use tokio::sync::mpsc::error::TryRecvError; use tokio::sync::mpsc::Receiver; use tokio::sync::mpsc::Sender; use tokio::task::spawn_blocking; +use tokio::time::Duration; +use tokio::time::Instant; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -206,6 +208,10 @@ async fn network_write( } } +// Send keepalive messages to walproposer, to make sure it receives updates +// even when it writes a steady stream of messages. +const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); + /// Takes messages from msg_rx, processes and pushes replies to reply_tx. struct WalAcceptor { tli: Arc, @@ -253,18 +259,25 @@ impl WalAcceptor { timeline: Arc::clone(&self.tli), }; - let mut next_msg: ProposerAcceptorMessage; + // After this timestamp we will stop processing AppendRequests and send a response + // to the walproposer. walproposer sends at least one AppendRequest per second, + // we will send keepalives by replying to these requests once per second. + let mut next_keepalive = Instant::now(); loop { let opt_msg = self.msg_rx.recv().await; if opt_msg.is_none() { return Ok(()); // chan closed, streaming terminated } - next_msg = opt_msg.unwrap(); + let mut next_msg = opt_msg.unwrap(); - if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { + let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { // loop through AppendRequest's while it's readily available to // write as many WAL as possible without fsyncing + // + // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. 
+ // Otherwise, we might end up in a situation where we read a message, but don't + // process it. while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg { let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); @@ -274,6 +287,11 @@ impl WalAcceptor { } } + // get out of this loop if keepalive time is reached + if Instant::now() >= next_keepalive { + break; + } + match self.msg_rx.try_recv() { Ok(msg) => next_msg = msg, Err(TryRecvError::Empty) => break, @@ -282,18 +300,18 @@ impl WalAcceptor { } // flush all written WAL to the disk - if let Some(reply) = self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)? { - if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated - } - } + self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)? } else { // process message other than AppendRequest - if let Some(reply) = self.tli.process_msg(&next_msg)? { - if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated - } + self.tli.process_msg(&next_msg)? 
+ }; + + if let Some(reply) = reply_msg { + if self.reply_tx.send(reply).await.is_err() { + return Ok(()); // chan closed, streaming terminated } + // reset keepalive time + next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } } } From d8dd60dc811eade6fbf89b4416f1860f0000fb3d Mon Sep 17 00:00:00 2001 From: fcdm <128653800+fcdm@users.noreply.github.com> Date: Mon, 17 Apr 2023 10:58:53 +0100 Subject: [PATCH 292/426] Add helm values for us-east-1 --- ...prod-us-east-1-theta.neon-proxy-scram.yaml | 69 +++++++++++++++++++ ...d-us-east-1-theta.neon-storage-broker.yaml | 52 ++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 .github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml create mode 100644 .github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml diff --git a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml new file mode 100644 index 0000000000..f113d1f861 --- /dev/null +++ b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml @@ -0,0 +1,69 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. + +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 5 minutes (5 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 300"] +terminationGracePeriodSeconds: 604800 + +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" + domain: "*.us-east-1.aws.neon.tech" + # These domains haven't been delegated yet. 
+ # extraDomains: ["*.us-east-1.retooldb.com", "*.us-east-1.postgres.vercel-storage.com"] + sentryEnvironment: "production" + wssPort: 8443 + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" + metricCollectionInterval: "10min" + +podLabels: + neon_service: proxy-scram + neon_env: prod + neon_region: us-east-1 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: us-east-1.aws.neon.tech + httpsPort: 443 + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml new file mode 100644 index 0000000000..7c16911b5e --- /dev/null +++ b/.github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml @@ -0,0 +1,52 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: production + neon_service: storage-broker + +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + 
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.theta.us-east-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 + +ingress: + enabled: false + +metrics: + enabled: false + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . }}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + +settings: + sentryEnvironment: "production" From 0c083564ce7f526d7950be757d2d0c6f84afd096 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Mon, 17 Apr 2023 13:25:27 +0100 Subject: [PATCH 293/426] Add us-east-1 hosts file and update regions (#4042) ## Describe your changes ## Issue ticket number and link ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .github/ansible/prod.us-east-1.hosts.yaml | 50 +++++++++++++++++++++++ .github/workflows/deploy-prod.yml | 6 ++- 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 .github/ansible/prod.us-east-1.hosts.yaml diff --git a/.github/ansible/prod.us-east-1.hosts.yaml b/.github/ansible/prod.us-east-1.hosts.yaml new file mode 100644 index 0000000000..fcf472432b --- /dev/null +++ b/.github/ansible/prod.us-east-1.hosts.yaml @@ -0,0 +1,50 @@ +storage: + vars: + bucket_name: neon-prod-storage-us-east-1 + bucket_region: us-east-1 + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech + broker_endpoint: http://storage-broker-lb.theta.us-east-1.internal.aws.neon.tech:50051 + pageserver_config_stub: + pg_distrib_dir: /usr/local + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events + metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: us-east-1 + ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-1 + console_region_id: aws-us-east-1 + sentry_environment: production + + children: + pageservers: + hosts: + pageserver-0.us-east-1.aws.neon.tech: + ansible_host: i-0f58137883429f55a + pageserver-1.us-east-1.aws.neon.tech: + ansible_host: i-08e7ee6190a099019 + pageserver-2.us-east-1.aws.neon.tech: + ansible_host: i-0686a4e5e208e31a1 + + 
safekeepers: + hosts: + safekeeper-0.us-east-1.aws.neon.tech: + ansible_host: i-04ce739e88793d864 + safekeeper-1.us-east-1.aws.neon.tech: + ansible_host: i-0e9e6c9227fb81410 + safekeeper-2.us-east-1.aws.neon.tech: + ansible_host: i-072f4dd86a327d52f diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index 6096ac8ab9..92c7eb2492 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -49,7 +49,7 @@ jobs: shell: bash strategy: matrix: - target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] + target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1, us-east-1 ] environment: name: prod-${{ matrix.target_region }} steps: @@ -97,6 +97,10 @@ jobs: target_cluster: prod-ap-southeast-1-epsilon deploy_link_proxy: false deploy_legacy_scram_proxy: false + - target_region: us-east-1 + target_cluster: prod-us-east-1-theta + deploy_link_proxy: false + deploy_legacy_scram_proxy: false environment: name: prod-${{ matrix.target_region }} steps: From e2a5177e8915db126ae5d033dd83bfb1c7458fc4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 18 Apr 2023 16:04:10 +0300 Subject: [PATCH 294/426] Bump h2 from 0.3.17 to 0.3.18 (#4045) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a18f4490da..ce24bbcee8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1756,9 +1756,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66b91535aa35fea1523ad1b86cb6b53c28e0ae566ba4a460f4457e936cad7c6f" +checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" dependencies = [ "bytes", "fnv", From f1b7dc40649a044cb614a08de57258fba73d6aa4 Mon Sep 17 00:00:00 2001 From: fcdm <128653800+fcdm@users.noreply.github.com> Date: Tue, 18 Apr 2023 13:25:27 +0100 Subject: 
[PATCH 295/426] Update pageserver instances in us-east-1 --- .github/ansible/prod.us-east-1.hosts.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/ansible/prod.us-east-1.hosts.yaml b/.github/ansible/prod.us-east-1.hosts.yaml index fcf472432b..b5b2b076bb 100644 --- a/.github/ansible/prod.us-east-1.hosts.yaml +++ b/.github/ansible/prod.us-east-1.hosts.yaml @@ -34,11 +34,11 @@ storage: pageservers: hosts: pageserver-0.us-east-1.aws.neon.tech: - ansible_host: i-0f58137883429f55a + ansible_host: i-085222088b0d2e0c7 pageserver-1.us-east-1.aws.neon.tech: - ansible_host: i-08e7ee6190a099019 + ansible_host: i-0969d4f684d23a21e pageserver-2.us-east-1.aws.neon.tech: - ansible_host: i-0686a4e5e208e31a1 + ansible_host: i-05dee87895da58dad safekeepers: hosts: From 0bfbae2d7302cf8753a999ecde4da9f921668832 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Tue, 18 Apr 2023 16:41:09 +0100 Subject: [PATCH 296/426] Add storage broker deployment to us-east-1 (#4048) --- .github/workflows/deploy-prod.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index 92c7eb2492..9fa31b3225 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -151,6 +151,8 @@ jobs: target_cluster: prod-eu-central-1-gamma - target_region: ap-southeast-1 target_cluster: prod-ap-southeast-1-epsilon + - target_region: us-east-1 + target_cluster: prod-us-east-1-theta environment: name: prod-${{ matrix.target_region }} steps: From 02b28ae0b107f0fa2cc5b650d36de04bd7bc78e7 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Tue, 18 Apr 2023 18:54:32 +0300 Subject: [PATCH 297/426] fix vm-informant dbname: "neondb" -> "postgres" (#4046) Changes the vm-informant's postgres connection string's dbname from "neondb" (which sometimes doesn't exist) to "postgres" (which _hopefully_ should exist more often?). 
Currently there are a handful of VMs in prod that aren't working with autoscaling because they don't have the "neondb" database. The vm-informant doesn't require any database in particular; it's just connecting as `cloud_admin` to be able to adjust the file cache settings. --- Dockerfile.vm-compute-node | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node index 957166ecd1..aabb3c9953 100644 --- a/Dockerfile.vm-compute-node +++ b/Dockerfile.vm-compute-node @@ -54,7 +54,7 @@ RUN set -e \ RUN set -e \ && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ - && CONNSTR="dbname=neondb user=cloud_admin sslmode=disable" \ + && CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \ && ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \ && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab From 7ba5c286b7c023e39162c6c6bcdad9353a3b5194 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 21 Apr 2023 11:10:48 +0200 Subject: [PATCH 298/426] [compute_ctl] Improve 'empty' compute startup sequence (#4034) Do several attempts to get spec from the control-plane and retry network errors and all reasonable HTTP response codes. Do not hang waiting for spec without confirmation from the control-plane that compute is known and is in the `Empty` state. Adjust the way we track `total_startup_ms` metric, it should be calculated since the moment we received spec, not from the moment `compute_ctl` started. Also introduce a new `wait_for_spec_ms` metric to track the time spent sleeping and waiting for spec to be delivered from control-plane. 
Part of neondatabase/cloud#3533 --- compute_tools/src/bin/compute_ctl.rs | 24 ++++- compute_tools/src/compute.rs | 5 +- compute_tools/src/http/api.rs | 1 + compute_tools/src/http/openapi_spec.yaml | 10 ++ compute_tools/src/spec.rs | 113 +++++++++++++++++++---- libs/compute_api/src/responses.rs | 14 +++ 6 files changed, 141 insertions(+), 26 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 309310407d..36dbc382b5 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -73,7 +73,7 @@ fn main() -> Result<()> { // Try to use just 'postgres' if no path is provided let pgbin = matches.get_one::("pgbin").unwrap(); - let mut spec = None; + let spec; let mut live_config_allowed = false; match spec_json { // First, try to get cluster spec from the cli argument @@ -89,9 +89,13 @@ fn main() -> Result<()> { } else if let Some(id) = compute_id { if let Some(cp_base) = control_plane_uri { live_config_allowed = true; - if let Ok(s) = get_spec_from_control_plane(cp_base, id) { - spec = Some(s); - } + spec = match get_spec_from_control_plane(cp_base, id) { + Ok(s) => s, + Err(e) => { + error!("cannot get response from control plane: {}", e); + panic!("neither spec nor confirmation that compute is in the Empty state was received"); + } + }; } else { panic!("must specify both --control-plane-uri and --compute-id or none"); } @@ -114,7 +118,6 @@ fn main() -> Result<()> { spec_set = false; } let compute_node = ComputeNode { - start_time: Utc::now(), connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), @@ -147,6 +150,17 @@ fn main() -> Result<()> { let mut state = compute.state.lock().unwrap(); let pspec = state.pspec.as_ref().expect("spec must be set"); let startup_tracing_context = pspec.spec.startup_tracing_context.clone(); + + // Record for how long we slept waiting for the spec. 
+ state.metrics.wait_for_spec_ms = Utc::now() + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + // Reset start time to the actual start of the configuration, so that + // total startup time was properly measured at the end. + state.start_time = Utc::now(); + state.status = ComputeStatus::Init; compute.state_changed.notify_all(); drop(state); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 51de2b6e0a..507dac9c0d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -38,7 +38,6 @@ use crate::spec::*; /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { - pub start_time: DateTime, // Url type maintains proper escaping pub connstr: url::Url, pub pgdata: String, @@ -66,6 +65,7 @@ pub struct ComputeNode { #[derive(Clone, Debug)] pub struct ComputeState { + pub start_time: DateTime, pub status: ComputeStatus, /// Timestamp of the last Postgres activity pub last_active: DateTime, @@ -77,6 +77,7 @@ pub struct ComputeState { impl ComputeState { pub fn new() -> Self { Self { + start_time: Utc::now(), status: ComputeStatus::Empty, last_active: Utc::now(), error: None, @@ -425,7 +426,7 @@ impl ComputeNode { .unwrap() .as_millis() as u64; state.metrics.total_startup_ms = startup_end_time - .signed_duration_since(self.start_time) + .signed_duration_since(compute_state.start_time) .to_std() .unwrap() .as_millis() as u64; diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 3ca688de69..4468f6f5e4 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -18,6 +18,7 @@ use tracing_utils::http::OtelName; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { ComputeStatusResponse { + start_time: state.start_time, tenant: state .pspec .as_ref() diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index bdb09d4a6b..cc8f074a50 100644 
--- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -152,11 +152,14 @@ components: type: object description: Compute startup metrics. required: + - wait_for_spec_ms - sync_safekeepers_ms - basebackup_ms - config_ms - total_startup_ms properties: + wait_for_spec_ms: + type: integer sync_safekeepers_ms: type: integer basebackup_ms: @@ -181,6 +184,13 @@ components: - status - last_active properties: + start_time: + type: string + description: | + Time when compute was started. If initially compute was started in the `empty` + state and then provided with valid spec, `start_time` will be reset to the + moment, when spec was received. + example: "2022-10-12T07:20:50.52Z" status: $ref: '#/components/schemas/ComputeStatus' last_active: diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 088f74335a..28e0ef41b7 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -4,42 +4,117 @@ use std::str::FromStr; use anyhow::{anyhow, bail, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; -use tracing::{info, info_span, instrument, span_enabled, warn, Level}; +use reqwest::StatusCode; +use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use compute_api::responses::ControlPlaneSpecResponse; +use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; use compute_api::spec::{ComputeSpec, Database, PgIdent, Role}; +// Do control plane request and return response if any. In case of error it +// returns a bool flag indicating whether it makes sense to retry the request +// and a string with error message. 
+fn do_control_plane_request( + uri: &str, + jwt: &str, +) -> Result { + let resp = reqwest::blocking::Client::new() + .get(uri) + .header("Authorization", jwt) + .send() + .map_err(|e| { + ( + true, + format!("could not perform spec request to control plane: {}", e), + ) + })?; + + match resp.status() { + StatusCode::OK => match resp.json::() { + Ok(spec_resp) => Ok(spec_resp), + Err(e) => Err(( + true, + format!("could not deserialize control plane response: {}", e), + )), + }, + StatusCode::SERVICE_UNAVAILABLE => { + Err((true, "control plane is temporarily unavailable".to_string())) + } + StatusCode::BAD_GATEWAY => { + // We have a problem with intermittent 502 errors now + // https://github.com/neondatabase/cloud/issues/2353 + // It's fine to retry GET request in this case. + Err((true, "control plane request failed with 502".to_string())) + } + // Another code, likely 500 or 404, means that compute is unknown to the control plane + // or some internal failure happened. Doesn't make much sense to retry in this case. + _ => Err(( + false, + format!( + "unexpected control plane response status code: {}", + resp.status() + ), + )), + } +} + /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` /// env variable is set, it will be used for authorization. -pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result { +pub fn get_spec_from_control_plane( + base_uri: &str, + compute_id: &str, +) -> Result> { let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec"); - let jwt: String = match std::env::var("NEON_CONSOLE_JWT") { + let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { Ok(v) => v, Err(_) => "".to_string(), }; + let mut attempt = 1; + let mut spec: Result> = Ok(None); + info!("getting spec from control plane: {}", cp_uri); - // TODO: check the response. 
We should distinguish cases when it's - // - network error, then retry - // - no spec for compute yet, then wait - // - compute id is unknown or any other error, then bail out - let resp: ControlPlaneSpecResponse = reqwest::blocking::Client::new() - .get(cp_uri) - .header("Authorization", jwt) - .send() - .map_err(|e| anyhow!("could not send spec request to control plane: {}", e))? - .json() - .map_err(|e| anyhow!("could not get compute spec from control plane: {}", e))?; + // Do 3 attempts to get spec from the control plane using the following logic: + // - network error -> then retry + // - compute id is unknown or any other error -> bail out + // - no spec for compute yet (Empty state) -> return Ok(None) + // - got spec -> return Ok(Some(spec)) + while attempt < 4 { + spec = match do_control_plane_request(&cp_uri, &jwt) { + Ok(spec_resp) => match spec_resp.status { + ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Attached => { + if let Some(spec) = spec_resp.spec { + Ok(Some(spec)) + } else { + bail!("compute is attached, but spec is empty") + } + } + }, + Err((retry, msg)) => { + if retry { + Err(anyhow!(msg)) + } else { + bail!(msg); + } + } + }; - if let Some(spec) = resp.spec { - Ok(spec) - } else { - bail!("could not get compute spec from control plane") + if let Err(e) = &spec { + error!("attempt {} to get spec failed with: {}", attempt, e); + } else { + return spec; + } + + attempt += 1; + std::thread::sleep(std::time::Duration::from_millis(100)); } + + // All attempts failed, return error. 
+ spec } /// It takes cluster specification and does the following: diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 370b2c5626..c409563b56 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -14,6 +14,7 @@ pub struct GenericAPIError { #[derive(Serialize, Debug)] #[serde(rename_all = "snake_case")] pub struct ComputeStatusResponse { + pub start_time: DateTime, pub tenant: Option, pub timeline: Option, pub status: ComputeStatus, @@ -63,6 +64,7 @@ where /// Response of the /metrics.json API #[derive(Clone, Debug, Default, Serialize)] pub struct ComputeMetrics { + pub wait_for_spec_ms: u64, pub sync_safekeepers_ms: u64, pub basebackup_ms: u64, pub config_ms: u64, @@ -75,4 +77,16 @@ pub struct ComputeMetrics { #[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { pub spec: Option, + pub status: ControlPlaneComputeStatus, +} + +#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ControlPlaneComputeStatus { + // Compute is known to control-plane, but it's not + // yet attached to any timeline / endpoint. + Empty, + // Compute is attached to some timeline / endpoint and + // should be able to start with provided spec. + Attached, } From afbbc6103612819058db63dc24829ec5eccccef7 Mon Sep 17 00:00:00 2001 From: Eduard Dyckman Date: Mon, 24 Apr 2023 22:19:25 +0900 Subject: [PATCH 299/426] Adding synthetic size to pageserver swagger (#4049) ## Describe your changes I added synthetic size response to the console swagger. 
Now I am syncing it back to neon --- pageserver/src/http/openapi_spec.yml | 115 +++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b0e4e1ca85..95f6e96a5b 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -520,6 +520,43 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/synthetic_size: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + get: + description: | + Calculate tenant's synthetic size + responses: + "200": + description: Tenant's synthetic size + content: + application/json: + schema: + $ref: "#/components/schemas/SyntheticSizeResponse" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -948,6 +985,84 @@ components: latest_gc_cutoff_lsn: type: string format: hex + + SyntheticSizeResponse: + type: object + required: + - id + - size + - segment_sizes + - inputs + properties: + id: + type: string + format: hex + size: + type: integer + segment_sizes: + type: array + items: + $ref: "#/components/schemas/SegmentSize" + inputs: + type: object + properties: + segments: + type: array + items: + $ref: "#/components/schemas/SegmentData" + timeline_inputs: + type: array + items: + $ref: "#/components/schemas/TimelineInput" + + SegmentSize: + type: object + required: + - method + - accum_size + properties: + method: + type: string + accum_size: + type: integer + + SegmentData: + type: object + required: + - segment + properties: + segment: + type: 
object + required: + - lsn + properties: + parent: + type: integer + lsn: + type: integer + size: + type: integer + needed: + type: boolean + timeline_id: + type: string + format: hex + kind: + type: string + + TimelineInput: + type: object + required: + - timeline_id + properties: + ancestor_id: + type: string + ancestor_lsn: + type: string + timeline_id: + type: string + format: hex + Error: type: object required: From e83684b8683847a5f467809cd7dd8e2ccdc9bffa Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Apr 2023 14:10:18 +0200 Subject: [PATCH 300/426] add libmetric metric for each logged log message (#4055) This patch extends the libmetrics logging setup functionality with a `tracing` layer that increments a Prometheus counter each time we log a log message. We have the counter per tracing event level. This allows for monitoring WARN and ERR log volume without parsing the log. Also, it would allow cross-checking whether logs got dropped on the way into Loki. It would be nicer if we could hook deeper into the tracing logging layer, to avoid evaluating the filter twice. But I don't know how to do it. 
--- libs/utils/src/logging.rs | 100 ++++++++++++++++++++---- pageserver/src/http/routes.rs | 34 ++++++++ test_runner/fixtures/metrics.py | 1 + test_runner/fixtures/pageserver/http.py | 10 +++ test_runner/regress/test_logging.py | 49 ++++++++++++ 5 files changed, 179 insertions(+), 15 deletions(-) create mode 100644 test_runner/regress/test_logging.py diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f770622a60..ed856b6804 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use anyhow::Context; +use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] @@ -23,25 +24,64 @@ impl LogFormat { } } -pub fn init(log_format: LogFormat) -> anyhow::Result<()> { - let default_filter_str = "info"; +static TRACING_EVENT_COUNT: Lazy = Lazy::new(|| { + metrics::register_int_counter_vec!( + "libmetrics_tracing_event_count", + "Number of tracing events, by level", + &["level"] + ) + .expect("failed to define metric") +}); +struct TracingEventCountLayer(&'static metrics::IntCounterVec); + +impl tracing_subscriber::layer::Layer for TracingEventCountLayer +where + S: tracing::Subscriber, +{ + fn on_event( + &self, + event: &tracing::Event<'_>, + _ctx: tracing_subscriber::layer::Context<'_, S>, + ) { + let level = event.metadata().level(); + let level = match *level { + tracing::Level::ERROR => "error", + tracing::Level::WARN => "warn", + tracing::Level::INFO => "info", + tracing::Level::DEBUG => "debug", + tracing::Level::TRACE => "trace", + }; + self.0.with_label_values(&[level]).inc(); + } +} + +pub fn init(log_format: LogFormat) -> anyhow::Result<()> { // We fall back to printing all spans at info-level or above if // the RUST_LOG environment variable is not set. 
- let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str)); + let rust_log_env_filter = || { + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")) + }; - let base_logger = tracing_subscriber::fmt() - .with_env_filter(env_filter) - .with_target(false) - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_writer(std::io::stdout); - - match log_format { - LogFormat::Json => base_logger.json().init(), - LogFormat::Plain => base_logger.init(), - LogFormat::Test => base_logger.with_test_writer().init(), - } + // NB: the order of the with() calls does not matter. + // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering + use tracing_subscriber::prelude::*; + tracing_subscriber::registry() + .with({ + let log_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_ansi(atty::is(atty::Stream::Stdout)) + .with_writer(std::io::stdout); + let log_layer = match log_format { + LogFormat::Json => log_layer.json().boxed(), + LogFormat::Plain => log_layer.boxed(), + LogFormat::Test => log_layer.with_test_writer().boxed(), + }; + log_layer.with_filter(rust_log_env_filter()) + }) + .with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())) + .init(); Ok(()) } @@ -157,3 +197,33 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> { ::fmt(self, f) } } + +#[cfg(test)] +mod tests { + use metrics::{core::Opts, IntCounterVec}; + + use super::TracingEventCountLayer; + + #[test] + fn tracing_event_count_metric() { + let counter_vec = + IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap(); + let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static + let layer = TracingEventCountLayer(counter_vec); + use tracing_subscriber::prelude::*; + + 
tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || { + tracing::trace!("foo"); + tracing::debug!("foo"); + tracing::info!("foo"); + tracing::warn!("foo"); + tracing::error!("foo"); + }); + + assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1); + assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1); + } +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 06a97f6dff..3318e5263c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1201,6 +1201,36 @@ async fn handler_404(_: Request) -> Result, ApiError> { ) } +async fn post_tracing_event_handler(mut r: Request) -> Result, ApiError> { + #[derive(Debug, serde::Deserialize)] + #[serde(rename_all = "lowercase")] + enum Level { + Error, + Warn, + Info, + Debug, + Trace, + } + #[derive(Debug, serde::Deserialize)] + struct Request { + level: Level, + message: String, + } + let body: Request = json_request(&mut r) + .await + .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + + match body.level { + Level::Error => tracing::error!(?body.message), + Level::Warn => tracing::warn!(?body.message), + Level::Info => tracing::info!(?body.message), + Level::Debug => tracing::debug!(?body.message), + Level::Trace => tracing::trace!(?body.message), + } + + json_response(StatusCode::OK, ()) +} + pub fn make_router( conf: &'static PageServerConf, launch_ts: &'static LaunchTimestamp, @@ -1341,5 +1371,9 @@ pub fn make_router( testing_api!("set tenant state to broken", handle_tenant_break), ) .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) + .post( + "/v1/tracing/event", + testing_api!("emit a tracing event", post_tracing_event_handler), + ) .any(handler_404)) } diff --git 
a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 2984f2c7d3..c88b985c8e 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -53,6 +53,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_storage_operations_seconds_global_bucket", "libmetrics_launch_timestamp", "libmetrics_build_info", + "libmetrics_tracing_event_count_total", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 69042478c7..cf92aeb6c0 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -550,3 +550,13 @@ class PageserverHttpClient(requests.Session): def tenant_break(self, tenant_id: TenantId): res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") self.verbose_error(res) + + def post_tracing_event(self, level: str, message: str): + res = self.post( + f"http://localhost:{self.port}/v1/tracing/event", + json={ + "level": level, + "message": message, + }, + ) + self.verbose_error(res) diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py new file mode 100644 index 0000000000..d559be0a8f --- /dev/null +++ b/test_runner/regress/test_logging.py @@ -0,0 +1,49 @@ +import uuid + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import wait_until + + +@pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) +def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): + # self-test: make sure the event is logged (i.e., our testing endpoint works) + log_expected = { + "trace": False, + "debug": False, + "info": True, + "warn": True, + "error": True, + }[level] + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + msg_id = uuid.uuid4().hex + + # NB: the _total suffix is added by our prometheus client + 
before = ps_http.get_metric_value("libmetrics_tracing_event_count_total", {"level": level}) + + # post the event + ps_http.post_tracing_event(level, msg_id) + if log_expected: + env.pageserver.allowed_errors.append(f".*{msg_id}.*") + + def assert_logged(): + if not log_expected: + return + assert env.pageserver.log_contains(f".*{msg_id}.*") + + wait_until(10, 0.5, assert_logged) + + # make sure it's counted + def assert_metric_value(): + if not log_expected: + return + # NB: the _total suffix is added by our prometheus client + val = ps_http.get_metric_value("libmetrics_tracing_event_count_total", {"level": level}) + val = val or 0.0 + log.info("libmetrics_tracing_event_count: %s", val) + assert val > (before or 0.0) + + wait_until(10, 1, assert_metric_value) From 4911d7ce6f6ab1f89ac1b026add8514e8e84979d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 25 Apr 2023 15:22:23 +0300 Subject: [PATCH 301/426] feat: warn when requests get cancelled (#4064) Add a simple disarmable dropguard to log if request is cancelled before it is completed. We currently don't have this, and it makes for difficult to know when the request was dropped. --- libs/utils/src/http/endpoint.rs | 39 ++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 616f2b8468..b11aef9892 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -76,6 +76,7 @@ where let log_quietly = method == Method::GET; async move { + let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); if log_quietly { debug!("Handling request"); } else { @@ -87,7 +88,11 @@ where // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call. // // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally. 
- match (self.0)(request).await { + let res = (self.0)(request).await; + + cancellation_guard.disarm(); + + match res { Ok(response) => { let response_status = response.status(); if log_quietly && response_status.is_success() { @@ -105,6 +110,38 @@ where } } +/// Drop guard to WARN in case the request was dropped before completion. +struct RequestCancelled { + warn: Option, +} + +impl RequestCancelled { + /// Create the drop guard using the [`tracing::Span::current`] as the span. + fn warn_when_dropped_without_responding() -> Self { + RequestCancelled { + warn: Some(tracing::Span::current()), + } + } + + /// Consume the drop guard without logging anything. + fn disarm(mut self) { + self.warn = None; + } +} + +impl Drop for RequestCancelled { + fn drop(&mut self) { + if let Some(span) = self.warn.take() { + // the span has all of the info already, but the outer `.instrument(span)` has already + // been dropped, so we need to manually re-enter it for this message. + // + // this is what the instrument would do before polling so it is fine. + let _g = span.entered(); + warn!("request was dropped before completing"); + } + } +} + async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); From fa20e3757432a0b900f33a89441f7fee02fc06c9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Apr 2023 14:22:48 +0200 Subject: [PATCH 302/426] add gauge for in-flight layer uploads (#3951) For the "worst-case /storage usage panel", we need to compute ``` remote size + local-only size ``` We currently don't have a metric for local-only layers. The number of in-flight layers in the upload queue is just that, so, let Prometheus scrape it. The metric is two counters (started and finished). The delta is the amount of in-flight uploads in the queue. The metrics are incremented in the respective `call_unfinished_metric_*` functions. These track ongoing operations by file_kind and op_kind. 
We only need this metric for layer uploads, so, there's the new RemoteTimelineClientMetricsCallTrackSize type that forces all call sites to decide whether they want the size tracked or not. If we find that other file_kinds or op_kinds are interesting (metadata uploads, layer downloads, layer deletes) are interesting, we can just enable them, and they'll be just another label combination within the metrics that this PR adds. fixes https://github.com/neondatabase/neon/issues/3922 --- pageserver/src/metrics.rs | 195 +++++++++++- .../src/tenant/remote_timeline_client.rs | 285 ++++++++++++++---- test_runner/fixtures/metrics.py | 2 + 3 files changed, 405 insertions(+), 77 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c075315683..cf60a1a404 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -385,6 +385,26 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy = Lazy::new .expect("failed to define a metric") }); +static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_timeline_client_bytes_started", + "Incremented by the number of bytes associated with a remote timeline client operation. \ + The increment happens when the operation is scheduled.", + &["tenant_id", "timeline_id", "file_kind", "op_kind"], + ) + .expect("failed to define a metric") +}); + +static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_timeline_client_bytes_finished", + "Incremented by the number of bytes associated with a remote timeline client operation. 
\ + The increment happens when the operation finishes (regardless of success/failure/shutdown).", + &["tenant_id", "timeline_id", "file_kind", "op_kind"], + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -739,6 +759,8 @@ pub struct RemoteTimelineClientMetrics { remote_operation_time: Mutex>, calls_unfinished_gauge: Mutex>, calls_started_hist: Mutex>, + bytes_started_counter: Mutex>, + bytes_finished_counter: Mutex>, } impl RemoteTimelineClientMetrics { @@ -749,6 +771,8 @@ impl RemoteTimelineClientMetrics { remote_operation_time: Mutex::new(HashMap::default()), calls_unfinished_gauge: Mutex::new(HashMap::default()), calls_started_hist: Mutex::new(HashMap::default()), + bytes_started_counter: Mutex::new(HashMap::default()), + bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge: Mutex::new(None), } } @@ -787,6 +811,7 @@ impl RemoteTimelineClientMetrics { }); metric.clone() } + fn calls_unfinished_gauge( &self, file_kind: &RemoteOpFileKind, @@ -828,32 +853,125 @@ impl RemoteTimelineClientMetrics { }); metric.clone() } + + fn bytes_started_counter( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntCounter { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.bytes_started_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } + + fn bytes_finished_counter( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntCounter { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.bytes_finished_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = 
guard.entry(key).or_insert_with(move || { + REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } +} + +#[cfg(test)] +impl RemoteTimelineClientMetrics { + pub fn get_bytes_started_counter_value( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> Option { + let guard = self.bytes_started_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + guard.get(&key).map(|counter| counter.get()) + } + + pub fn get_bytes_finished_counter_value( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> Option { + let guard = self.bytes_finished_counter.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + guard.get(&key).map(|counter| counter.get()) + } } /// See [`RemoteTimelineClientMetrics::call_begin`]. #[must_use] -pub(crate) struct RemoteTimelineClientCallMetricGuard(Option); +pub(crate) struct RemoteTimelineClientCallMetricGuard { + /// Decremented on drop. + calls_unfinished_metric: Option, + /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop. + bytes_finished: Option<(IntCounter, u64)>, +} impl RemoteTimelineClientCallMetricGuard { - /// Consume this guard object without decrementing the metric. - /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out. + /// Consume this guard object without performing the metric updates it would do on `drop()`. + /// The caller vouches to do the metric updates manually. 
pub fn will_decrement_manually(mut self) { - self.0 = None; // prevent drop() from decrementing + let RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric, + bytes_finished, + } = &mut self; + calls_unfinished_metric.take(); + bytes_finished.take(); } } impl Drop for RemoteTimelineClientCallMetricGuard { fn drop(&mut self) { - if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self { + let RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric, + bytes_finished, + } = self; + if let Some(guard) = calls_unfinished_metric.take() { guard.dec(); } + if let Some((bytes_finished_metric, value)) = bytes_finished { + bytes_finished_metric.inc_by(*value); + } } } +/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to +/// track the byte size of this call in applicable metric(s). +pub(crate) enum RemoteTimelineClientMetricsCallTrackSize { + /// Do not account for this call's byte size in any metrics. + /// The `reason` field is there to make the call sites self-documenting + /// about why they don't need the metric. + DontTrackSize { reason: &'static str }, + /// Track the byte size of the call in applicable metric(s). + Bytes(u64), +} + impl RemoteTimelineClientMetrics { - /// Increment the metrics that track ongoing calls to the remote timeline client instance. + /// Update the metrics that change when a call to the remote timeline client instance starts. /// - /// Drop the returned guard object once the operation is finished to decrement the values. + /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions. /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that /// is more suitable. /// Never do both. 
@@ -861,24 +979,51 @@ impl RemoteTimelineClientMetrics { &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, + size: RemoteTimelineClientMetricsCallTrackSize, ) -> RemoteTimelineClientCallMetricGuard { - let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); self.calls_started_hist(file_kind, op_kind) - .observe(unfinished_metric.get() as f64); - unfinished_metric.inc(); - RemoteTimelineClientCallMetricGuard(Some(unfinished_metric)) + .observe(calls_unfinished_metric.get() as f64); + calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric + + let bytes_finished = match size { + RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => { + // nothing to do + None + } + RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { + self.bytes_started_counter(file_kind, op_kind).inc_by(size); + let finished_counter = self.bytes_finished_counter(file_kind, op_kind); + Some((finished_counter, size)) + } + }; + RemoteTimelineClientCallMetricGuard { + calls_unfinished_metric: Some(calls_unfinished_metric), + bytes_finished, + } } - /// Manually decrement the metric instead of using the guard object. + /// Manually udpate the metrics that track completions, instead of using the guard object. /// Using the guard object is generally preferable. /// See [`call_begin`] for more context. 
- pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) { - let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); + pub(crate) fn call_end( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + size: RemoteTimelineClientMetricsCallTrackSize, + ) { + let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); debug_assert!( - unfinished_metric.get() > 0, + calls_unfinished_metric.get() > 0, "begin and end should cancel out" ); - unfinished_metric.dec(); + calls_unfinished_metric.dec(); + match size { + RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {} + RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { + self.bytes_finished_counter(file_kind, op_kind).inc_by(size); + } + } } } @@ -891,6 +1036,8 @@ impl Drop for RemoteTimelineClientMetrics { remote_operation_time, calls_unfinished_gauge, calls_started_hist, + bytes_started_counter, + bytes_finished_counter, } = self; for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); @@ -911,6 +1058,22 @@ impl Drop for RemoteTimelineClientMetrics { b, ]); } + for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } + for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() { + let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } { let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 28c4943dbd..c42824a8b5 100644 --- 
a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -219,7 +219,8 @@ use utils::lsn::Lsn; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, - REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, + REMOTE_ONDEMAND_DOWNLOADED_LAYERS, }; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::{ @@ -367,9 +368,13 @@ impl RemoteTimelineClient { /// Download index file pub async fn download_index_file(&self) -> Result { - let _unfinished_gauge_guard = self - .metrics - .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download); + let _unfinished_gauge_guard = self.metrics.call_begin( + &RemoteOpFileKind::Index, + &RemoteOpKind::Download, + crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { + reason: "no need for a downloads gauge", + }, + ); download::download_index_part( self.conf, @@ -398,9 +403,13 @@ impl RemoteTimelineClient { layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { let downloaded_size = { - let _unfinished_gauge_guard = self - .metrics - .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download); + let _unfinished_gauge_guard = self.metrics.call_begin( + &RemoteOpFileKind::Layer, + &RemoteOpKind::Download, + crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { + reason: "no need for a downloads gauge", + }, + ); download::download_layer_file( self.conf, &self.storage_impl, @@ -886,11 +895,32 @@ impl RemoteTimelineClient { fn calls_unfinished_metric_impl( &self, op: &UploadOp, - ) -> Option<(RemoteOpFileKind, RemoteOpKind)> { + ) -> Option<( + RemoteOpFileKind, + RemoteOpKind, + RemoteTimelineClientMetricsCallTrackSize, + )> { + use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize; let res = match op { - UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, 
RemoteOpKind::Upload), - UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload), - UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete), + UploadOp::UploadLayer(_, m) => ( + RemoteOpFileKind::Layer, + RemoteOpKind::Upload, + RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()), + ), + UploadOp::UploadMetadata(_, _) => ( + RemoteOpFileKind::Index, + RemoteOpKind::Upload, + DontTrackSize { + reason: "metadata uploads are tiny", + }, + ), + UploadOp::Delete(file_kind, _) => ( + *file_kind, + RemoteOpKind::Delete, + DontTrackSize { + reason: "should we track deletes? positive or negative sign?", + }, + ), UploadOp::Barrier(_) => { // we do not account these return None; @@ -900,20 +930,20 @@ impl RemoteTimelineClient { } fn calls_unfinished_metric_begin(&self, op: &UploadOp) { - let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { Some(x) => x, None => return, }; - let guard = self.metrics.call_begin(&file_kind, &op_kind); + let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes); guard.will_decrement_manually(); // in unfinished_ops_metric_end() } fn calls_unfinished_metric_end(&self, op: &UploadOp) { - let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) { + let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { Some(x) => x, None => return, }; - self.metrics.call_end(&file_kind, &op_kind); + self.metrics.call_end(&file_kind, &op_kind, track_bytes); } fn stop(&self) { @@ -981,11 +1011,19 @@ impl RemoteTimelineClient { mod tests { use super::*; use crate::{ - tenant::harness::{TenantHarness, TIMELINE_ID}, + context::RequestContext, + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + Tenant, + }, DEFAULT_PG_VERSION, }; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; - use std::{collections::HashSet, path::Path}; + use std::{ + 
collections::HashSet, + path::{Path, PathBuf}, + }; + use tokio::runtime::EnterGuard; use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { @@ -1034,39 +1072,80 @@ mod tests { assert_eq!(found, expected); } + struct TestSetup { + runtime: &'static tokio::runtime::Runtime, + entered_runtime: EnterGuard<'static>, + harness: TenantHarness<'static>, + tenant: Arc, + tenant_ctx: RequestContext, + remote_fs_dir: PathBuf, + client: Arc, + } + + impl TestSetup { + fn new(test_name: &str) -> anyhow::Result { + // Use a current-thread runtime in the test + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?, + )); + let entered_runtime = runtime.enter(); + + let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); + let harness = TenantHarness::create(test_name)?; + let (tenant, ctx) = runtime.block_on(harness.load()); + // create an empty timeline directory + let timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = timeline.initialize(&ctx).unwrap(); + + let remote_fs_dir = harness.conf.workdir.join("remote_fs"); + std::fs::create_dir_all(remote_fs_dir)?; + let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; + + let storage_config = RemoteStorageConfig { + max_concurrent_syncs: std::num::NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, + ) + .unwrap(), + max_sync_errors: std::num::NonZeroU32::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, + ) + .unwrap(), + storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + }; + + let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + + let client = Arc::new(RemoteTimelineClient { + conf: harness.conf, + runtime, + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + storage_impl: storage, + upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: 
Arc::new(RemoteTimelineClientMetrics::new( + &harness.tenant_id, + &TIMELINE_ID, + )), + }); + + Ok(Self { + runtime, + entered_runtime, + harness, + tenant, + tenant_ctx: ctx, + remote_fs_dir, + client, + }) + } + } + // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - // Use a current-thread runtime in the test - let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?, - )); - let _entered = runtime.enter(); - - let harness = TenantHarness::create("upload_scheduling")?; - let (tenant, ctx) = runtime.block_on(harness.load()); - let _timeline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; - let timeline_path = harness.timeline_path(&TIMELINE_ID); - - let remote_fs_dir = harness.conf.workdir.join("remote_fs"); - std::fs::create_dir_all(remote_fs_dir)?; - let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; - - let storage_config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - ) - .unwrap(), - max_sync_errors: std::num::NonZeroU32::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - ) - .unwrap(), - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), - }; - // Test outline: // // Schedule upload of a bunch of layers. Check that they are started immediately, not queued @@ -1081,21 +1160,19 @@ mod tests { // Schedule another deletion. Check that it's launched immediately. // Schedule index upload. 
Check that it's queued - println!("workdir: {}", harness.conf.workdir.display()); - - let storage_impl = GenericRemoteStorage::from_config(&storage_config)?; - let client = Arc::new(RemoteTimelineClient { - conf: harness.conf, + let TestSetup { runtime, - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - storage_impl, - upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new( - &harness.tenant_id, - &TIMELINE_ID, - )), - }); + entered_runtime: _entered_runtime, + harness, + tenant: _tenant, + tenant_ctx: _tenant_ctx, + remote_fs_dir, + client, + } = TestSetup::new("upload_scheduling").unwrap(); + + let timeline_path = harness.timeline_path(&TIMELINE_ID); + + println!("workdir: {}", harness.conf.workdir.display()); let remote_timeline_dir = remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?); @@ -1216,4 +1293,90 @@ mod tests { Ok(()) } + + #[test] + fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> { + // Setup + + let TestSetup { + runtime, + harness, + client, + .. 
+ } = TestSetup::new("metrics")?; + + let metadata = dummy_metadata(Lsn(0x10)); + client.init_upload_queue_for_empty_remote(&metadata)?; + + let timeline_path = harness.timeline_path(&TIMELINE_ID); + + let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let content_1 = dummy_contents("foo"); + std::fs::write( + timeline_path.join(layer_file_name_1.file_name()), + &content_1, + )?; + + #[derive(Debug, PartialEq)] + struct BytesStartedFinished { + started: Option, + finished: Option, + } + let get_bytes_started_stopped = || { + let started = client + .metrics + .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload) + .map(|v| v.try_into().unwrap()); + let stopped = client + .metrics + .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload) + .map(|v| v.try_into().unwrap()); + BytesStartedFinished { + started, + finished: stopped, + } + }; + + // Test + + let init = get_bytes_started_stopped(); + + client.schedule_layer_file_upload( + &layer_file_name_1, + &LayerFileMetadata::new(content_1.len() as u64), + )?; + + let pre = get_bytes_started_stopped(); + + runtime.block_on(client.wait_completion())?; + + let post = get_bytes_started_stopped(); + + // Validate + + assert_eq!( + init, + BytesStartedFinished { + started: None, + finished: None + } + ); + assert_eq!( + pre, + BytesStartedFinished { + started: Some(content_1.len()), + // assert that the _finished metric is created eagerly so that subtractions work on first sample + finished: Some(0), + } + ); + assert_eq!( + post, + BytesStartedFinished { + started: Some(content_1.len()), + finished: Some(content_1.len()) + } + ); + + Ok(()) + } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c88b985c8e..5fed6fcf84 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -45,6 +45,8 @@ 
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]], *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]], "pageserver_remote_physical_size", + "pageserver_remote_timeline_client_bytes_started_total", + "pageserver_remote_timeline_client_bytes_finished_total", ) PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( From cb9473928df94148b42297fe30b0b99682609249 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 25 Apr 2023 16:22:16 +0300 Subject: [PATCH 303/426] feat: add rough timings for basebackup (#4062) just record the time needed for waiting for the lsn and then the basebackup in a log message in millis. this is related to ongoing investigations to cold start performance. this could also be a counter. it cannot be added next to smgr histograms, because we don't want another histogram per timeline. the aim is to allow drilling deeper into which timelines were slow, and to understand why some need two basebackups. 
--- pageserver/src/page_service.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bd38a7a2f3..135f08e846 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -700,6 +700,8 @@ impl PageServerHandler { full_backup: bool, ctx: RequestContext, ) -> anyhow::Result<()> { + let started = std::time::Instant::now(); + // check that the timeline exists let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); @@ -712,6 +714,8 @@ impl PageServerHandler { .context("invalid basebackup lsn")?; } + let lsn_awaited_after = started.elapsed(); + // switch client to COPYOUT pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; pgb.flush().await?; @@ -732,7 +736,17 @@ impl PageServerHandler { pgb.write_message_noflush(&BeMessage::CopyDone)?; pgb.flush().await?; - info!("basebackup complete"); + + let basebackup_after = started + .elapsed() + .checked_sub(lsn_awaited_after) + .unwrap_or(Duration::ZERO); + + info!( + lsn_await_millis = lsn_awaited_after.as_millis(), + basebackup_millis = basebackup_after.as_millis(), + "basebackup complete" + ); Ok(()) } From dbbe032c395f7f4a8a13e4e4631adb801a09c1bd Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 25 Apr 2023 15:33:30 +0200 Subject: [PATCH 304/426] neon_local: fix `tenant create -c eviction_policy:...` (#4004) And add corresponding unit test. The fix is to use `.remove()` instead of `.get()` when processing the arguments hash map. The code uses emptiness of the hash map to determine whether all arguments have been processed. This was likely a copy-paste error. 
refs https://github.com/neondatabase/neon/issues/3942 --- control_plane/src/pageserver.rs | 4 ++-- test_runner/regress/test_tenant_conf.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index b700d426ba..75991045a4 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -359,8 +359,8 @@ impl PageServerNode { .transpose() .context("Failed to parse 'trace_read_requests' as bool")?, eviction_policy: settings - .get("eviction_policy") - .map(|x| serde_json::from_str(x)) + .remove("eviction_policy") + .map(serde_json::from_str) .transpose() .context("Failed to parse 'eviction_policy' json")?, min_resident_size_override: settings diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 1ed86d19a2..b83bd5fc99 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,3 +1,4 @@ +import json from contextlib import closing import psycopg2.extras @@ -22,6 +23,7 @@ wait_lsn_timeout='111 s'; checkpoint_distance = 10000 compaction_target_size = 1048576 evictions_low_residence_duration_metric_threshold = "2 days" +eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" } """ env = neon_env_builder.init_start() @@ -44,6 +46,7 @@ evictions_low_residence_duration_metric_threshold = "2 days" "checkpoint_distance": "20000", "gc_period": "30sec", "evictions_low_residence_duration_metric_threshold": "42s", + "eviction_policy": json.dumps({"kind": "NoEviction"}), } tenant, _ = env.neon_cli.create_tenant(conf=new_conf) @@ -84,6 +87,11 @@ evictions_low_residence_duration_metric_threshold = "2 days" assert effective_config["image_creation_threshold"] == 3 assert effective_config["pitr_interval"] == "7days" assert effective_config["evictions_low_residence_duration_metric_threshold"] == "2days" + assert 
effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23h", + } # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: @@ -121,6 +129,9 @@ evictions_low_residence_duration_metric_threshold = "2 days" assert ( new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s" ), "Should override default value" + assert new_effective_config["eviction_policy"] == { + "kind": "NoEviction" + }, "Specific 'eviction_policy' config should override the default value" assert new_effective_config["compaction_target_size"] == 1048576 assert new_effective_config["compaction_period"] == "20s" assert new_effective_config["compaction_threshold"] == 10 @@ -135,6 +146,9 @@ evictions_low_residence_duration_metric_threshold = "2 days" "compaction_period": "80sec", "image_creation_threshold": "2", "evictions_low_residence_duration_metric_threshold": "23h", + "eviction_policy": json.dumps( + {"kind": "LayerAccessThreshold", "period": "80s", "threshold": "42h"} + ), } env.neon_cli.config_tenant( tenant_id=tenant, @@ -180,6 +194,11 @@ evictions_low_residence_duration_metric_threshold = "2 days" assert ( updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h" ), "Should override default value" + assert updated_effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "period": "1m 20s", + "threshold": "1day 18h", + }, "Specific 'eviction_policy' config should override the default value" assert updated_effective_config["compaction_target_size"] == 1048576 assert updated_effective_config["compaction_threshold"] == 10 assert updated_effective_config["gc_horizon"] == 67108864 @@ -239,6 +258,11 @@ evictions_low_residence_duration_metric_threshold = "2 days" assert final_effective_config["gc_period"] == "1h" assert final_effective_config["image_creation_threshold"] == 3 assert 
final_effective_config["evictions_low_residence_duration_metric_threshold"] == "2days" + assert final_effective_config["eviction_policy"] == { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23h", + } # restart the pageserver and ensure that the config is still correct env.pageserver.stop() From 78bbbccadbc66bef6715a5a2ad1324ccacb94587 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 25 Apr 2023 16:46:52 +0200 Subject: [PATCH 305/426] Deploy proxies for preview enviroments (#4052) ## Describe your changes Deploy `main` proxies to the preview environments We don't deploy storage there yet, as it's tricky. ## Issue ticket number and link https://github.com/neondatabase/cloud/issues/4737 --- .../ansible/staging.eu-central-1.hosts.yaml | 47 +++++++++++++ ...u-central-1-alpha.neon-storage-broker.yaml | 52 ++++++++++++++ .../preview-template.neon-proxy-scram.yaml | 67 +++++++++++++++++++ .github/workflows/deploy-dev.yml | 52 +++++++++++++- 4 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 .github/ansible/staging.eu-central-1.hosts.yaml create mode 100644 .github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml create mode 100644 .github/helm-values/preview-template.neon-proxy-scram.yaml diff --git a/.github/ansible/staging.eu-central-1.hosts.yaml b/.github/ansible/staging.eu-central-1.hosts.yaml new file mode 100644 index 0000000000..db1d1adcff --- /dev/null +++ b/.github/ansible/staging.eu-central-1.hosts.yaml @@ -0,0 +1,47 @@ +storage: + vars: + bucket_name: neon-dev-storage-eu-central-1 + bucket_region: eu-central-1 + # We only register/update storage in one preview console and manually copy to other instances + console_mgmt_base_url: http://neon-internal-api.helium.aws.neon.build + broker_endpoint: http://storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build:50051 + pageserver_config_stub: + pg_distrib_dir: /usr/local + metric_collection_endpoint: 
http://neon-internal-api.helium.aws.neon.build/billing/api/v1/usage_events + metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: eu-central-1 + ansible_aws_ssm_bucket_name: neon-dev-storage-eu-central-1 + console_region_id: aws-eu-central-1 + sentry_environment: staging + + children: + pageservers: + hosts: + pageserver-0.eu-central-1.aws.neon.build: + ansible_host: i-011f93ec26cfba2d4 + + safekeepers: + hosts: + safekeeper-0.eu-central-1.aws.neon.build: + ansible_host: i-0ff026d27babf8ddd + safekeeper-1.eu-central-1.aws.neon.build: + ansible_host: i-03983a49ee54725d9 + safekeeper-2.eu-central-1.aws.neon.build: + ansible_host: i-0bd025ecdb61b0db3 diff --git a/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml b/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml new file mode 100644 index 0000000000..aaa1ec59b4 --- /dev/null +++ b/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml @@ -0,0 +1,52 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: staging + neon_service: storage-broker + +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at 
external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 + +ingress: + enabled: false + +metrics: + enabled: false + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . }}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + +settings: + sentryEnvironment: "staging" diff --git a/.github/helm-values/preview-template.neon-proxy-scram.yaml b/.github/helm-values/preview-template.neon-proxy-scram.yaml new file mode 100644 index 0000000000..f4bd418e28 --- /dev/null +++ b/.github/helm-values/preview-template.neon-proxy-scram.yaml @@ -0,0 +1,67 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. 
+ +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/management/api/v2" + domain: "*.cloud.${PREVIEW_NAME}.aws.neon.build" + sentryEnvironment: "staging" + wssPort: 8443 + metricCollectionEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/billing/api/v1/usage_events" + metricCollectionInterval: "1min" + +# -- Additional labels for neon-proxy pods +podLabels: + neon_service: proxy-scram + neon_env: test + neon_region: ${PREVIEW_NAME}.eu-central-1 + + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: cloud.${PREVIEW_NAME}.aws.neon.build + httpsPort: 443 + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index b080a29f7c..fba292f0f9 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -48,7 +48,8 @@ jobs: shell: bash strategy: matrix: - target_region: [ eu-west-1, us-east-2 ] + # TODO(sergey): Fix storage deploy in eu-central-1 + target_region: [ eu-west-1, us-east-2] environment: name: dev-${{ matrix.target_region }} steps: @@ -133,6 +134,53 @@ jobs: - name: Cleanup helm folder run: rm -rf ~/.cache + + deploy-preview-proxy-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: eu-central-1 + target_cluster: dev-eu-central-1-alpha + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy preview proxies + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + for PREVIEW_NAME in helium argon krypton xenon radon oganesson hydrogen 
nitrogen oxygen fluorine chlorine; do + export PREVIEW_NAME + envsubst <.github/helm-values/preview-template.neon-proxy-scram.yaml >preview-${PREVIEW_NAME}.neon-proxy-scram.yaml + helm upgrade neon-proxy-scram-${PREVIEW_NAME} neondatabase/neon-proxy --namespace neon-proxy-${PREVIEW_NAME} --create-namespace --install --atomic -f preview-${PREVIEW_NAME}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + done + + - name: Cleanup helm folder + run: rm -rf ~/.cache deploy-storage-broker-new: runs-on: [ self-hosted, gen3, small ] @@ -148,6 +196,8 @@ jobs: target_cluster: dev-us-east-2-beta - target_region: eu-west-1 target_cluster: dev-eu-west-1-zeta + - target_region: eu-central-1 + target_cluster: dev-central-1-alpha environment: name: dev-${{ matrix.target_region }} steps: From 7f80230fd21cacfb20cae09befc7725abb9c0efe Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 25 Apr 2023 18:07:04 +0300 Subject: [PATCH 306/426] fix: stop dead_code rustc lint (#4070) only happens without `--all-features` which is what `./run_clippy.sh` uses. 
--- pageserver/src/http/routes.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3318e5263c..b1251123b2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1201,6 +1201,7 @@ async fn handler_404(_: Request) -> Result, ApiError> { ) } +#[cfg(feature = "testing")] async fn post_tracing_event_handler(mut r: Request) -> Result, ApiError> { #[derive(Debug, serde::Deserialize)] #[serde(rename_all = "lowercase")] From bfd45dd6713a2e9038954cf0368b1a082937b045 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 25 Apr 2023 18:41:09 +0300 Subject: [PATCH 307/426] test_tenant_config: allow ERROR from eviction task (#4074) --- test_runner/regress/test_tenant_conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index b83bd5fc99..8677a554f7 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -27,6 +27,8 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = """ env = neon_env_builder.init_start() + # we configure eviction but no remote storage, there might be error lines + env.pageserver.allowed_errors.append(".* no remote storage configured, cannot evict layers .*") http_client = env.pageserver.http_client() # Check that we raise on misspelled configs From 05ac0e2493ce18992aab525c5c7419b954c1649a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 25 Apr 2023 17:54:10 +0100 Subject: [PATCH 308/426] Login to ECR and Docker Hub at once (#4067) - Update kaniko to 1.9.2 (from 1.7.0), problem with reproducible build is fixed - Login to ECR and Docker Hub at once, so we can push to several registries, it makes job `push-docker-hub` unneeded - `push-docker-hub` replaced with `promote-images` in `needs:` clause, Pushing images to production ECR moved to `promote-images` job --- 
.github/workflows/build_and_test.yml | 216 ++++++++++++++------------- 1 file changed, 115 insertions(+), 101 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3212b76731..bdcf2463bc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -541,7 +541,7 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init - needs: [ push-docker-hub, tag ] + needs: [ promote-images, tag ] steps: - name: Set PR's status to pending and request a remote CI test run: | @@ -584,8 +584,7 @@ jobs: neon-image: runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - # https://github.com/GoogleContainerTools/kaniko/issues/2005 - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: shell: sh -eu {0} @@ -597,11 +596,32 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: Kaniko build neon - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder @@ -652,7 +672,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: shell: sh -eu {0} @@ -661,18 +681,41 @@ jobs: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: Kaniko build compute tools - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --dockerfile Dockerfile.compute-tools + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder run: rm -rf ~/.ecr compute-node-image: runs-on: [ self-hosted, gen3, large ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug + container: gcr.io/kaniko-project/executor:v1.9.2-debug needs: [ tag ] strategy: fail-fast: false @@ -689,12 +732,36 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Configure ECR and Docker Hub login + run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + + cat <<-EOF > /kaniko/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: Kaniko build compute node with extensions - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + run: + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache + --context . + --build-arg GIT_VERSION=${{ github.sha }} + --build-arg PG_VERSION=${{ matrix.version }} + --dockerfile Dockerfile.compute-node + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder run: rm -rf ~/.ecr @@ -786,26 +853,45 @@ jobs: runs-on: [ self-hosted, gen3, small ] needs: [ tag, test-images, vm-compute-node-image ] container: golang:1.19-bullseye - if: github.event_name != 'workflow_dispatch' + # Don't add if-condition here. 
+ # The job should always be run because we have dependant other jobs that shouldn't be skipped steps: - name: Install Crane & ECR helper if: | (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + github.event_name != 'workflow_dispatch' run: | go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - - name: Configure ECR login + - name: Configure ECR and Docker Hub login + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' run: | + DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) + echo "::add-mask::${DOCKERHUB_AUTH}" + mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + cat <<-EOF > /github/home/.docker/config.json + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "${DOCKERHUB_AUTH}" + } + }, + "credHelpers": { + "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login", + "093970136003.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" + } + } + EOF - name: Add latest tag to images if: | (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + github.event_name != 'workflow_dispatch' run: | crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest @@ -814,50 +900,17 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - - 
name: Cleanup ECR folder - run: rm -rf ~/.ecr - - push-docker-hub: - runs-on: [ self-hosted, dev, x64 ] - needs: [ promote-images, tag ] - container: golang:1.19-bullseye - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Pull neon image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon - - - name: Pull compute tools image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools - - - name: Pull compute node v14 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 - - - name: Pull vm compute node v14 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 - - - name: Pull compute node v15 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15 - - - name: Pull vm compute node v15 image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - - - name: Pull rust image from ECR - run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust + crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag 
neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - name: Push images to production ECR if: | (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + github.event_name != 'workflow_dispatch' run: | crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest @@ -866,45 +919,6 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - - name: Configure Docker Hub login - run: | - # ECR Credential Helper & Docker Hub don't work together in config, hence reset - echo "" > /github/home/.docker/config.json - crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - - - name: Push neon image to Docker Hub - run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}} - - - name: Push compute tools image to Docker Hub - run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - - - name: Push compute node v14 image to Docker Hub - run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} - - - name: Push vm compute node v14 image to 
Docker Hub - run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - - - name: Push compute node v15 image to Docker Hub - run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} - - - name: Push vm compute node v15 image to Docker Hub - run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - - - name: Push rust image to Docker Hub - run: crane push rust neondatabase/rust:pinned - - - name: Add latest tag to images in Docker Hub - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - run: | - crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - - name: Cleanup ECR folder run: rm -rf ~/.ecr @@ -913,7 +927,7 @@ jobs: container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests ] if: | contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && github.event_name != 'workflow_dispatch' @@ -947,7 +961,7 @@ jobs: deploy: runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - needs: [ push-docker-hub, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Fix git ownership @@ -984,7 +998,7 @@ jobs: container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ push-docker-hub, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release From 8945fbdb31d9d28aa88194153b56eee6e4a39605 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 25 Apr 2023 20:45:36 +0300 Subject: [PATCH 309/426] Enable OpenTelemetry tracing in proxy in staging. 
(#4065) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depends on https://github.com/neondatabase/helm-charts/pull/32 Co-authored-by: Lassi Pölönen --- .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml | 1 + .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml | 1 + .../helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 1 + .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 1 + 4 files changed, 4 insertions(+) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index a8567665d3..a7d8587ec2 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -23,6 +23,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.eu-west-1.aws.neon.build" + otelExporterOtlpEndpoint: "https://otel-collector.zeta.eu-west-1.internal.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index feca05aff6..893e0fab10 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -9,6 +9,7 @@ settings: authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" domain: "pg.neon.build" + otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build" sentryEnvironment: "staging" metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git 
a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 46cfdd2e69..77f6cf080e 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.cloud.stage.neon.tech" + otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index fdd869c122..2510d624cd 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -25,6 +25,7 @@ settings: authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.us-east-2.aws.neon.build" extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"] + otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" From 2d6fd72177c89645b2b718880796a6e04ff4ebfa Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 25 Apr 2023 23:58:59 +0100 Subject: [PATCH 310/426] GitHub Workflows: Fix crane for several registries (#4076) Follow-up fix after https://github.com/neondatabase/neon/pull/4067 ``` + crane tag neondatabase/vm-compute-node-v14:3064 latest Error: fetching "neondatabase/vm-compute-node-v14:3064": GET https://index.docker.io/v2/neondatabase/vm-compute-node-v14/manifests/3064: MANIFEST_UNKNOWN: manifest unknown; 
unknown tag=3064 ``` I reverted back the previous approach for promoting images (login to one registry, save images to local fs, logout and login to another registry, and push images from local fs). It turns out what works for one Google project (kaniko), doesn't work for another (crane) [sigh] --- .github/workflows/build_and_test.yml | 60 ++++++++++++++-------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bdcf2463bc..15a6a611b1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -858,35 +858,19 @@ jobs: steps: - name: Install Crane & ECR helper - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' run: | go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - - name: Configure ECR and Docker Hub login - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + - name: Configure ECR login run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" - mkdir /github/home/.docker/ - cat <<-EOF > /github/home/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login", - "093970136003.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Copy vm-compute-node images to Docker Hub + run: | + crane pull 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 + crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - name: Add latest tag to images if: | @@ -900,13 +884,6 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - - name: Push images to production ECR if: | (github.ref_name == 'main' || github.ref_name == 'release') && @@ -919,6 +896,29 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest + - name: Configure Docker Hub login + run: | + # ECR Credential Helper & Docker Hub don't work together in config, hence reset + echo "" > /github/home/.docker/config.json + crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io + + - name: Push vm-compute-node to Docker Hub + run: | + crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} + crane 
push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} + + - name: Push latest tags to Docker Hub + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + - name: Cleanup ECR folder run: rm -rf ~/.ecr From 9d0cf08d5f26ba63691335f7169409454e3e608f Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 26 Apr 2023 09:29:44 +0200 Subject: [PATCH 311/426] Fix new storage-broker deploy for eu-central-1 (#4079) --- .github/workflows/deploy-dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index fba292f0f9..5d1c6e0e16 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -197,7 +197,7 @@ jobs: - target_region: eu-west-1 target_cluster: dev-eu-west-1-zeta - target_region: eu-central-1 - target_cluster: dev-central-1-alpha + target_cluster: dev-eu-central-1-alpha environment: name: dev-${{ matrix.target_region }} steps: From f19b70b379f426bc48fe692f368dab94f4a6af25 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 26 Apr 2023 09:36:26 +0200 Subject: [PATCH 312/426] Configure extra domain for us-east-1 (#4078) --- .../helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml 
b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml index f113d1f861..1c7e646810 100644 --- a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml @@ -23,8 +23,8 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-1.aws.neon.tech" - # These domains haven't been delegated yet. - # extraDomains: ["*.us-east-1.retooldb.com", "*.us-east-1.postgres.vercel-storage.com"] + # *.us-east-1.retooldb.com hasn't been delegated yet. + extraDomains: ["*.us-east-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" From 850f6b1cb9baae004a879027d91858237546c56f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 26 Apr 2023 11:49:29 +0300 Subject: [PATCH 313/426] refactor: drop pageserver_ondisk_layers (#4071) I didn't get through #3775 fast enough so we wanted to remove this metric. Fixes #3705. 
--- pageserver/src/metrics.rs | 11 +++-------- pageserver/src/tenant/layer_map.rs | 4 ---- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index cf60a1a404..d6978a8cf6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,9 +1,9 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, - register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, - Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, - UIntGauge, UIntGaugeVec, + register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, Counter, CounterVec, + Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, + UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::models::TenantState; @@ -350,11 +350,6 @@ pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { - register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") - .expect("failed to define a metric") -}); - // remote storage metrics /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. 
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 02159ee291..0ee0c6f77d 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -48,7 +48,6 @@ mod layer_coverage; use crate::context::RequestContext; use crate::keyspace::KeyPartitioning; -use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; @@ -288,7 +287,6 @@ where self.l0_delta_layers.push(layer); } - NUM_ONDISK_LAYERS.inc(); Ok(()) } @@ -314,8 +312,6 @@ where "failed to locate removed historic layer from l0_delta_layers" ); } - - NUM_ONDISK_LAYERS.dec(); } pub(self) fn replace_historic_noflush( From 4625da316447a6bf8e345fefbfd30f860fb51074 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 26 Apr 2023 12:07:45 +0300 Subject: [PATCH 314/426] build: remove busted sk-1.us-east-2 from staging hosts (#4082) this should give us complete deployments while a new one is being brought up. --- .github/ansible/staging.us-east-2.hosts.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index e63ed6e639..dacc5567c3 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -48,8 +48,6 @@ storage: hosts: safekeeper-0.us-east-2.aws.neon.build: ansible_host: i-027662bd552bf5db0 - safekeeper-1.us-east-2.aws.neon.build: - ansible_host: i-0171efc3604a7b907 safekeeper-2.us-east-2.aws.neon.build: ansible_host: i-0de0b03a51676a6ce safekeeper-99.us-east-2.aws.neon.build: From 381c8fca4f1d700ad5118800e6b2b3f9e33a07b5 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 26 Apr 2023 12:39:17 +0300 Subject: [PATCH 315/426] feat: log how long tenant activation takes (#4080) Adds just a counter counting up from the creation to the tenant, logged after activation. 
Might help guide us with the investigation of #4025. --- pageserver/src/tenant.rs | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 11415b47c4..b5966b4618 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -118,6 +118,10 @@ pub struct Tenant { // Global pageserver config parameters pub conf: &'static PageServerConf, + /// The value creation timestamp, used to measure activation delay, see: + /// + loading_started_at: Instant, + state: watch::Sender, // Overridden tenant-specific config parameters. @@ -1476,7 +1480,7 @@ impl Tenant { TenantState::Loading | TenantState::Attaching => { *current_state = TenantState::Active; - info!("Activating tenant {}", self.tenant_id); + debug!(tenant_id = %self.tenant_id, "Activating tenant"); let timelines_accessor = self.timelines.lock().unwrap(); let not_broken_timelines = timelines_accessor @@ -1487,12 +1491,17 @@ impl Tenant { // down when they notice that the tenant is inactive. tasks::start_background_loops(self.tenant_id); + let mut activated_timelines = 0; + let mut timelines_broken_during_activation = 0; + for timeline in not_broken_timelines { match timeline .activate(ctx) .context("timeline activation for activating tenant") { - Ok(()) => {} + Ok(()) => { + activated_timelines += 1; + } Err(e) => { error!( "Failed to activate timeline {}: {:#}", @@ -1503,9 +1512,26 @@ impl Tenant { "failed to activate timeline {}: {}", timeline.timeline_id, e )); + + timelines_broken_during_activation += 1; } } } + + let elapsed = self.loading_started_at.elapsed(); + let total_timelines = timelines_accessor.len(); + + // log a lot of stuff, because some tenants sometimes suffer from user-visible + // times to activate. 
see https://github.com/neondatabase/neon/issues/4025 + info!( + since_creation_millis = elapsed.as_millis(), + tenant_id = %self.tenant_id, + activated_timelines, + timelines_broken_during_activation, + total_timelines, + post_state = <&'static str>::from(&*current_state), + "activation attempt finished" + ); } } }); @@ -1812,6 +1838,9 @@ impl Tenant { Tenant { tenant_id, conf, + // using now here is good enough approximation to catch tenants with really long + // activation times. + loading_started_at: Instant::now(), tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), gc_cs: tokio::sync::Mutex::new(()), From 31a3910fd9b60043651380d58771f97558f10771 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 14 Apr 2023 12:59:08 +0400 Subject: [PATCH 316/426] Remove wait_for_sk_commit_lsn_to_reach_remote_storage. It had a couple of inherent races: 1) Even if compute is killed before the call, some more data might still arrive to safekeepers after commit_lsn on them is polled, advancing it. Then checkpoint on pageserver might not include this tail, and so upload of expected LSN won't happen until one more checkpoint. 2) commit_lsn is updated asynchronously -- compute can commit a transaction before communicating commit_lsn to even a single safekeeper (sync-safekeepers can be used to force the advancement). This makes semantics of wait_for_sk_commit_lsn_to_reach_remote_storage quite complicated. Replace it with last_flush_lsn_upload which 1) Learns last flush LSN on compute; 2) Waits for it to arrive to pageserver; 3) Checkpoints it; 4) Waits for the upload. In some tests this keeps compute alive longer than before, but this doesn't seem to be important.
There is a chance this fixes https://github.com/neondatabase/neon/issues/3209 --- test_runner/fixtures/neon_fixtures.py | 40 ++++++------------- test_runner/fixtures/pageserver/utils.py | 5 +-- test_runner/regress/test_layer_eviction.py | 9 ++--- test_runner/regress/test_ondemand_download.py | 11 ++--- .../test_tenants_with_remote_storage.py | 7 +--- 5 files changed, 23 insertions(+), 49 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c6610ba062..f209dca560 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2928,32 +2928,18 @@ def fork_at_current_lsn( return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) -def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( - tenant_id: TenantId, - timeline_id: TimelineId, - safekeepers: List[Safekeeper], - pageserver: NeonPageserver, -): - sk_commit_lsns = [ - sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers - ] - lsn = max(sk_commit_lsns) - ps_http = pageserver.http_client() - wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn) - return lsn - - -def wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id: TenantId, - timeline_id: TimelineId, - safekeepers: List[Safekeeper], - pageserver: NeonPageserver, -): - lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( - tenant_id, timeline_id, safekeepers, pageserver - ) - ps_http = pageserver.http_client() +def last_flush_lsn_upload( + env: NeonEnv, endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId +) -> Lsn: + """ + Wait for pageserver to catch to the latest flush LSN of given endpoint, + checkpoint pageserver, and wait for it to be uploaded (remote_consistent_lsn + reaching flush LSN). 
+ """ + last_flush_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + ps_http = env.pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn) # force a checkpoint to trigger upload ps_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(ps_http, tenant_id, timeline_id, lsn) - return lsn + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + return last_flush_lsn diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index c060fc8dea..7f8bb40bda 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -54,10 +54,9 @@ def wait_for_upload( if current_lsn >= lsn: log.info("wait finished") return + lr_lsn = last_record_lsn(pageserver_http, tenant, timeline) log.info( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 - ) + f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}" ) time.sleep(1) raise Exception( diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 1ae32fb398..a96532c0d8 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -6,7 +6,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, wait_for_last_flush_lsn, - wait_for_sk_commit_lsn_to_reach_remote_storage, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId @@ -199,7 +198,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): # with image_creation_threshold=1 which we will use on the last compaction cur.execute("vacuum") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + last_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) if i == 1 and j == 2 and k == 1: 
# last iteration; stop before checkpoint to avoid leaving an inmemory layer @@ -222,10 +221,8 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): tenant_update_config({"image_creation_threshold": "1"}) ps_http.timeline_compact(tenant_id, timeline_id) - # wait for all uploads to finish - wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id, timeline_id, env.safekeepers, env.pageserver - ) + # wait for all uploads to finish (checkpoint has been done above) + wait_for_upload(ps_http, tenant_id, timeline_id, last_lsn) # shutdown safekeepers to avoid on-demand downloads from walreceiver for sk in env.safekeepers: diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index cb08b014fd..5c02708457 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -12,8 +12,8 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, available_remote_storages, + last_flush_lsn_upload, wait_for_last_flush_lsn, - wait_for_sk_commit_lsn_to_reach_remote_storage, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( @@ -207,9 +207,7 @@ def test_ondemand_download_timetravel( env.endpoints.stop_all() # wait until pageserver has successfully uploaded all the data to remote storage - wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id, timeline_id, env.safekeepers, env.pageserver - ) + wait_for_upload(client, tenant_id, timeline_id, current_lsn) def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) @@ -347,12 +345,9 @@ def test_download_remote_layers_api( """ ) + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) env.endpoints.stop_all() - wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id, timeline_id, env.safekeepers, env.pageserver - ) - def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, 
timeline_id) return d["current_physical_size"] diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index d7c0814570..dca2cd3d28 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -21,7 +21,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, available_remote_storages, - wait_for_sk_commit_lsn_to_reach_remote_storage, + last_flush_lsn_upload, ) from fixtures.pageserver.utils import ( assert_tenant_state, @@ -174,12 +174,9 @@ def test_tenants_attached_after_download( ) ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) env.endpoints.stop_all() - wait_for_sk_commit_lsn_to_reach_remote_storage( - tenant_id, timeline_id, env.safekeepers, env.pageserver - ) - env.pageserver.stop() timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) From 11df2ee5d70d23bac233051d5e974d830222a967 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 26 Apr 2023 13:40:36 +0200 Subject: [PATCH 317/426] Add safekeeper-3.us-east-2.aws.neon.build (#4085) --- .github/ansible/staging.us-east-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index dacc5567c3..fb218c443d 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -50,5 +50,7 @@ storage: ansible_host: i-027662bd552bf5db0 safekeeper-2.us-east-2.aws.neon.build: ansible_host: i-0de0b03a51676a6ce + safekeeper-3.us-east-2.aws.neon.build: + ansible_host: i-05f8ba2cda243bd18 safekeeper-99.us-east-2.aws.neon.build: ansible_host: i-0d61b6a2ea32028d5 From 6861259be7ee63f6a4bb2a9fdb5546147bf20389 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 26 Apr 2023 
15:18:26 +0200 Subject: [PATCH 318/426] add global metric for unexpected on-demand downloads (#4069) Until we have toned down the prod logs to zero WARN and ERROR, we want a dedicated metric for which we can have a dedicated alert. fixes https://github.com/neondatabase/neon/issues/3924 --- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/lib.rs | 2 ++ pageserver/src/metrics.rs | 16 ++++++++++++++++ pageserver/src/tenant/timeline.rs | 3 ++- test_runner/fixtures/metrics.py | 1 + 5 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ed23a18ee0..8e4897c09c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -226,6 +226,7 @@ fn start_pageserver( ); set_build_info_metric(GIT_VERSION); set_launch_timestamp_metric(launch_ts); + pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 278658eba3..04863886cb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -44,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); +pub use crate::metrics::preinitialize_metrics; + pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d6978a8cf6..deb20f21f8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -205,6 +205,15 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| .expect("failed to define a metric") }); +pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_unexpected_ondemand_downloads_count", + "Number of unexpected on-demand downloads. 
\ + We log more context for each increment, so, forgo any labels in this metric.", + ) + .expect("failed to define a metric") +}); + /// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] pub struct EvictionsWithLowResidenceDuration { @@ -1132,3 +1141,10 @@ impl>, O, E> Future for MeasuredRemoteOp { poll_result } } + +pub fn preinitialize_metrics() { + // We want to alert on this metric increasing. + // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0. + assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0); + UNEXPECTED_ONDEMAND_DOWNLOADS.reset(); +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b8b1f963e5..6c34f5a5b5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -48,7 +48,7 @@ use crate::tenant::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; @@ -2355,6 +2355,7 @@ impl Timeline { id, ctx.task_kind() ); + UNEXPECTED_ONDEMAND_DOWNLOADS.inc(); timeline.download_remote_layer(remote_layer).await?; continue 'layer_map_search; } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 5fed6fcf84..0e958ddd06 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -53,6 +53,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] 
= ( "pageserver_storage_operations_seconds_global_count", "pageserver_storage_operations_seconds_global_sum", "pageserver_storage_operations_seconds_global_bucket", + "pageserver_unexpected_ondemand_downloads_count_total", "libmetrics_launch_timestamp", "libmetrics_build_info", "libmetrics_tracing_event_count_total", From 92214578af3311c8d2ea6885f59562c9b53df628 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 26 Apr 2023 17:47:54 +0300 Subject: [PATCH 319/426] Fix proxy_io_bytes_per_client metric: use branch_id identifier properly. (#4084) It fixes the miscalculation of the metric for projects that use multiple branches for the same endpoint. We were underbilling users with such projects. So we need to communicate the change in Release Notes. --- proxy/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 445c2e930c..6ae1e3a447 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -95,7 +95,7 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { current_metrics.push(( Ids { endpoint_id: endpoint_id.to_string(), - branch_id: "".to_string(), + branch_id: branch_id.to_string(), }, (value, Utc::now()), )); From 0112a602e1b748b959bf578e7eaaecef392c09a3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 25 Apr 2023 12:22:58 +0400 Subject: [PATCH 320/426] Add timeout on proxy -> compute connection establishment. Otherwise we sit up to default tcp_syn_retries (about 2+ min) before getting os error 110 if compute has been migrated to another pod.
--- proxy/src/compute.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b5efc72803..0465703ae6 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,8 +1,8 @@ use crate::{cancellation::CancelClosure, error::UserFacingError}; -use futures::TryFutureExt; +use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{io, net::SocketAddr}; +use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; @@ -130,9 +130,23 @@ impl ConnCfg { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { use tokio_postgres::config::Host; + // wrap TcpStream::connect with timeout + let connect_with_timeout = |host, port| { + let connection_timeout = Duration::from_millis(10000); + tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map( + move |res| match res { + Ok(tcpstream_connect_res) => tcpstream_connect_res, + Err(_) => Err(io::Error::new( + io::ErrorKind::TimedOut, + format!("exceeded connection timeout {connection_timeout:?}"), + )), + }, + ) + }; + let connect_once = |host, port| { info!("trying to connect to compute node at {host}:{port}"); - TcpStream::connect((host, port)).and_then(|socket| async { + connect_with_timeout(host, port).and_then(|socket| async { let socket_addr = socket.peer_addr()?; // This prevents load balancer from severing the connection. socket2::SockRef::from(&socket).set_keepalive(true)?; @@ -165,7 +179,6 @@ impl ConnCfg { Host::Unix(_) => continue, // unix sockets are not welcome here }; - // TODO: maybe we should add a timeout. 
match connect_once(host, *port).await { Ok(socket) => return Ok(socket), Err(err) => { From 9ea7b5dd38cd1fc89311eba3fcb6e8987d51e787 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Apr 2023 11:54:48 +0200 Subject: [PATCH 321/426] clean up logging around on-demand downloads (#4030) - Remove repeated tenant & timeline from span - Demote logging of the path to debug level - Log completion at info level, in the same function where we log errors - distinguish between layer file download success & on-demand download succeeding as a whole in the log message wording - Assert that the span contains a tenant id and a timeline id fixes https://github.com/neondatabase/neon/issues/3945 Before: ``` INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{tenant_id=$TENANT_ID timeline_id=$TIMELINE_ID layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: download complete: /storage/pageserver/data/tenants/$TENANT_ID/timelines/$TIMELINE_ID/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91 INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{tenant_id=$TENANT_ID timeline_id=$TIMELINE_ID layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: Rebuilt layer map. Did 9 insertions to process a batch of 1 updates. 
``` After: ``` INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: layer file download finished INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: Rebuilt layer map. Did 9 insertions to process a batch of 1 updates. INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: on-demand download successful ``` --- Cargo.lock | 11 + Cargo.toml | 1 + libs/remote_storage/tests/pagination_tests.rs | 6 +- libs/utils/Cargo.toml | 3 +- libs/utils/src/lib.rs | 2 + libs/utils/src/logging.rs | 50 ++- libs/utils/src/tracing_span_assert.rs | 287 ++++++++++++++++++ pageserver/src/bin/pageserver.rs | 16 +- pageserver/src/tenant.rs | 8 +- .../tenant/remote_timeline_client/download.rs | 5 +- pageserver/src/tenant/timeline.rs | 39 ++- .../walreceiver/connection_manager.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 5 +- storage_broker/src/bin/storage_broker.rs | 5 +- 14 files changed, 413 insertions(+), 27 deletions(-) create mode 100644 libs/utils/src/tracing_span_assert.rs diff --git a/Cargo.lock b/Cargo.lock index ce24bbcee8..08b24d263c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4629,6 +4629,16 @@ dependencies = [ "valuable", ] +[[package]] +name = "tracing-error" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e" +dependencies = [ + "tracing", + "tracing-subscriber", +] + [[package]] name = "tracing-futures" version = "0.2.5" @@ -4879,6 +4889,7 @@ dependencies = [ 
"thiserror", "tokio", "tracing", + "tracing-error", "tracing-subscriber", "url", "uuid", diff --git a/Cargo.toml b/Cargo.toml index 0b545e6190..f4872433cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,6 +110,7 @@ toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" +tracing-error = "0.2.0" tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } url = "2.2" diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs index 048e99d841..86a6888f98 100644 --- a/libs/remote_storage/tests/pagination_tests.rs +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -99,7 +99,11 @@ struct S3WithTestBlobs { #[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledS3 { async fn setup() -> Self { - utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed"); + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + ) + .expect("logging init failed"); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { info!( "`{}` env variable is not set, skipping the test", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index dc6326e73e..2b04dfdef6 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -27,7 +27,8 @@ signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true tracing.workspace = true -tracing-subscriber = { workspace = true, features = ["json"] } +tracing-error.workspace = true +tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true serde_with.workspace = true strum.workspace = true diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index d4176911ac..9b52aa75b7 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -54,6 +54,8 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod 
tracing_span_assert; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index ed856b6804..2b8c852d86 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -56,7 +56,20 @@ where } } -pub fn init(log_format: LogFormat) -> anyhow::Result<()> { +/// Whether to add the `tracing_error` crate's `ErrorLayer` +/// to the global tracing subscriber. +/// +pub enum TracingErrorLayerEnablement { + /// Do not add the `ErrorLayer`. + Disabled, + /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset. + EnableWithRustLogFilter, +} + +pub fn init( + log_format: LogFormat, + tracing_error_layer_enablement: TracingErrorLayerEnablement, +) -> anyhow::Result<()> { // We fall back to printing all spans at info-level or above if // the RUST_LOG environment variable is not set. let rust_log_env_filter = || { @@ -67,21 +80,26 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { // NB: the order of the with() calls does not matter. 
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering use tracing_subscriber::prelude::*; - tracing_subscriber::registry() - .with({ - let log_layer = tracing_subscriber::fmt::layer() - .with_target(false) - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_writer(std::io::stdout); - let log_layer = match log_format { - LogFormat::Json => log_layer.json().boxed(), - LogFormat::Plain => log_layer.boxed(), - LogFormat::Test => log_layer.with_test_writer().boxed(), - }; - log_layer.with_filter(rust_log_env_filter()) - }) - .with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())) - .init(); + let r = tracing_subscriber::registry(); + let r = r.with({ + let log_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_ansi(atty::is(atty::Stream::Stdout)) + .with_writer(std::io::stdout); + let log_layer = match log_format { + LogFormat::Json => log_layer.json().boxed(), + LogFormat::Plain => log_layer.boxed(), + LogFormat::Test => log_layer.with_test_writer().boxed(), + }; + log_layer.with_filter(rust_log_env_filter()) + }); + let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter())); + match tracing_error_layer_enablement { + TracingErrorLayerEnablement::EnableWithRustLogFilter => r + .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())) + .init(), + TracingErrorLayerEnablement::Disabled => r.init(), + } Ok(()) } diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs new file mode 100644 index 0000000000..b9f7986442 --- /dev/null +++ b/libs/utils/src/tracing_span_assert.rs @@ -0,0 +1,287 @@ +//! Assert that the current [`tracing::Span`] has a given set of fields. +//! +//! # Usage +//! +//! ``` +//! use tracing_subscriber::prelude::*; +//! let registry = tracing_subscriber::registry() +//! .with(tracing_error::ErrorLayer::default()); +//! +//! 
// Register the registry as the global subscriber. +//! // In this example, we'll only use it as a thread-local subscriber. +//! let _guard = tracing::subscriber::set_default(registry); +//! +//! // Then, in the main code: +//! +//! let span = tracing::info_span!("TestSpan", test_id = 1); +//! let _guard = span.enter(); +//! +//! // ... down the call stack +//! +//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; +//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); +//! match check_fields_present([&extractor]) { +//! Ok(()) => {}, +//! Err(missing) => { +//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::>()); +//! } +//! } +//! ``` +//! +//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering +//! + +use std::{ + collections::HashSet, + fmt::{self}, + hash::{Hash, Hasher}, +}; + +pub enum ExtractionResult { + Present, + Absent, +} + +pub trait Extractor: Send + Sync + std::fmt::Debug { + fn name(&self) -> &str; + fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; +} + +#[derive(Debug)] +pub struct MultiNameExtractor { + name: &'static str, + field_names: [&'static str; L], +} + +impl MultiNameExtractor { + pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { + MultiNameExtractor { name, field_names } + } +} +impl Extractor for MultiNameExtractor { + fn name(&self) -> &str { + self.name + } + fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { + if fields.iter().any(|f| self.field_names.contains(&f.name())) { + ExtractionResult::Present + } else { + ExtractionResult::Absent + } + } +} + +struct MemoryIdentity<'a>(&'a dyn Extractor); + +impl<'a> MemoryIdentity<'a> { + fn as_ptr(&self) -> *const () { + self.0 as *const _ as *const () + } +} +impl<'a> PartialEq for MemoryIdentity<'a> { + fn eq(&self, other: &Self) -> bool { + 
self.as_ptr() == other.as_ptr() + } +} +impl<'a> Eq for MemoryIdentity<'a> {} +impl<'a> Hash for MemoryIdentity<'a> { + fn hash(&self, state: &mut H) { + self.as_ptr().hash(state); + } +} +impl<'a> fmt::Debug for MemoryIdentity<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + } +} + +/// The extractor names passed as keys to [`new`]. +pub fn check_fields_present( + must_be_present: [&dyn Extractor; L], +) -> Result<(), Vec<&dyn Extractor>> { + let mut missing: HashSet = + HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r))); + let trace = tracing_error::SpanTrace::capture(); + trace.with_spans(|md, _formatted_fields| { + missing.retain(|extractor| match extractor.0.extract(md.fields()) { + ExtractionResult::Present => false, + ExtractionResult::Absent => true, + }); + !missing.is_empty() // continue walking up until we've found all missing + }); + if missing.is_empty() { + Ok(()) + } else { + Err(missing.into_iter().map(|mi| mi.0).collect()) + } +} + +#[cfg(test)] +mod tests { + + use tracing_subscriber::prelude::*; + + use super::*; + + struct Setup { + _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, + tenant_extractor: MultiNameExtractor<2>, + timeline_extractor: MultiNameExtractor<2>, + } + + fn setup_current_thread() -> Setup { + let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); + let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + + let registry = tracing_subscriber::registry() + .with(tracing_subscriber::fmt::layer()) + .with(tracing_error::ErrorLayer::default()); + + let guard = tracing::subscriber::set_default(registry); + + Setup { + _current_thread_subscriber_guard: guard, + tenant_extractor, + timeline_extractor, + } + } + + fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) { + let missing: HashSet = + 
HashSet::from_iter(missing.into_iter().map(MemoryIdentity)); + let expected: HashSet = + HashSet::from_iter(expected.into_iter().map(MemoryIdentity)); + assert_eq!(missing, expected); + } + + #[test] + fn positive_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); + let _guard = span.enter(); + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + } + + #[test] + fn negative_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", timeline_id = "timeline-1"); + let _guard = span.enter(); + let missing = + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn positive_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", tenant_id = "tenant-1"); + let _guard = span.enter(); + + let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); + let _guard = span.enter(); + + check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + } + + #[test] + fn negative_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", timeline_id = "timeline-1"); + let _guard = span.enter(); + + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn positive_subset_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); + let _guard = span.enter(); + check_fields_present([&setup.tenant_extractor]).unwrap(); + } + + #[test] + fn 
positive_subset_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", tenant_id = "tenant-1"); + let _guard = span.enter(); + + let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); + let _guard = span.enter(); + + check_fields_present([&setup.tenant_extractor]).unwrap(); + } + + #[test] + fn negative_subset_one_level() { + let setup = setup_current_thread(); + let span = tracing::info_span!("root", timeline_id = "timeline-1"); + let _guard = span.enter(); + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn negative_subset_multiple_levels() { + let setup = setup_current_thread(); + + let span = tracing::info_span!("root"); + let _guard = span.enter(); + + let span = tracing::info_span!("child", timeline_id = "timeline-1"); + let _guard = span.enter(); + + let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + assert_missing(missing, vec![&setup.tenant_extractor]); + } + + #[test] + fn tracing_error_subscriber_not_set_up() { + // no setup + + let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let extractor = MultiNameExtractor::new("E", ["e"]); + let missing = check_fields_present([&extractor]).unwrap_err(); + assert_missing(missing, vec![&extractor]); + } + + #[test] + #[should_panic] + fn panics_if_tracing_error_subscriber_has_wrong_filter() { + let r = tracing_subscriber::registry().with({ + tracing_error::ErrorLayer::default().with_filter( + tracing_subscriber::filter::dynamic_filter_fn(|md, _| { + if md.is_span() && *md.level() == tracing::Level::INFO { + return false; + } + true + }), + ) + }); + + let _guard = tracing::subscriber::set_default(r); + + let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let extractor = 
MultiNameExtractor::new("E", ["e"]); + let missing = check_fields_present([&extractor]).unwrap_err(); + assert_missing(missing, vec![&extractor]); + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 8e4897c09c..d843b01ed7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -25,6 +25,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal, @@ -86,8 +87,19 @@ fn main() -> anyhow::Result<()> { } }; - // Initialize logging, which must be initialized before the custom panic hook is installed. - logging::init(conf.log_format)?; + // Initialize logging. + // + // It must be initialized before the custom panic hook is installed below. + // + // Regarding tracing_error enablement: at this time, we only use the + // tracing_error crate to debug_assert that log spans contain tenant and timeline ids. + // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module + let tracing_error_layer_enablement = if cfg!(debug_assertions) { + TracingErrorLayerEnablement::EnableWithRustLogFilter + } else { + TracingErrorLayerEnablement::Disabled + }; + logging::init(conf.log_format, tracing_error_layer_enablement)?; // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. // disarming this hook on pageserver, because we never tear down tracing. 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b5966b4618..d69d5e4b45 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2886,7 +2886,13 @@ pub mod harness { }; LOG_HANDLE.get_or_init(|| { - logging::init(logging::LogFormat::Test).expect("Failed to init test logging") + logging::init( + logging::LogFormat::Test, + // enable it in case the tests exercise code paths that use + // debug_assert_current_span_has_tenant_and_timeline_id + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + ) + .expect("Failed to init test logging") }); let repo_dir = PageServerConf::test_repo_dir(test_name); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index bda095d850..a0d8c0193a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -16,6 +16,7 @@ use tracing::{info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; @@ -43,6 +44,8 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); let local_path = timeline_path.join(layer_file_name.file_name()); @@ -154,7 +157,7 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("Could not fsync layer file {}", local_path.display(),)) .map_err(DownloadError::Other)?; - tracing::info!("download complete: {}", local_path.display()); + tracing::debug!("download complete: {}", 
local_path.display()); Ok(bytes_amount) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6c34f5a5b5..87f03f30b6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,6 +19,7 @@ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantTimelineId; +use utils::tracing_span_assert; use std::cmp::{max, min, Ordering}; use std::collections::BinaryHeap; @@ -936,6 +937,7 @@ impl Timeline { } } + #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) }; let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) }; @@ -3819,11 +3821,13 @@ impl Timeline { /// If the caller has a deadline or needs a timeout, they can simply stop polling: /// we're **cancellation-safe** because the download happens in a separate task_mgr task. /// So, the current download attempt will run to completion even if we stop polling. - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + #[instrument(skip_all, fields(layer=%remote_layer.short_id()))] pub async fn download_remote_layer( &self, remote_layer: Arc, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_and_timeline_id(); + use std::sync::atomic::Ordering::Relaxed; let permit = match Arc::clone(&remote_layer.ongoing_download) @@ -3867,6 +3871,8 @@ impl Timeline { .await; if let Ok(size) = &result { + info!("layer file download finished"); + // XXX the temp file is still around in Err() case // and consumes space until we clean up upon pageserver restart. 
self_clone.metrics.resident_physical_size_gauge.add(*size); @@ -3938,6 +3944,8 @@ impl Timeline { updates.flush(); drop(layers); + info!("on-demand download successful"); + // Now that we've inserted the download into the layer map, // close the semaphore. This will make other waiters for // this download return Ok(()). @@ -3945,7 +3953,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. - error!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); + error!("layer file download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download @@ -4256,3 +4264,30 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } + +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< + tracing_span_assert::MultiNameExtractor<2>, + > = once_cell::sync::Lazy::new(|| { + tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]) + }); + + pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy< + tracing_span_assert::MultiNameExtractor<2>, + > = once_cell::sync::Lazy::new(|| { + tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]) + }); + + #[cfg(debug_assertions)] + match tracing_span_assert::check_fields_present([ + &*TENANT_ID_EXTRACTOR, + &*TIMELINE_ID_EXTRACTOR, + ]) { + Ok(()) => (), + Err(missing) => panic!( + "missing extractors: {:?}", + missing.into_iter().map(|e| e.name()).collect::>() + ), + } +} diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index efcbfbce3d..731c5c4644 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ 
b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -348,7 +348,7 @@ impl ConnectionManagerState { .context("walreceiver connection handling failure") } .instrument( - info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id), + info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, node_id = %new_sk.safekeeper_id), ) }); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index ace921a26d..3699a2a74c 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -134,7 +134,10 @@ fn main() -> anyhow::Result<()> { // 1. init logging // 2. tracing panic hook // 3. sentry - logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::init( + LogFormat::from_config(&args.log_format)?, + logging::TracingErrorLayerEnablement::Disabled, + )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); info!("version: {GIT_VERSION}"); diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index de7b634ba0..597d9860d8 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -430,7 +430,10 @@ async fn main() -> Result<(), Box> { // 1. init logging // 2. tracing panic hook // 3. sentry - logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::init( + LogFormat::from_config(&args.log_format)?, + logging::TracingErrorLayerEnablement::Disabled, + )?; logging::replace_panic_hook_with_tracing_panic_hook().forget(); // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); From 5b911e1f9f6f5e49fabfb3fde12084b1e69bd4a2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Apr 2023 14:01:27 +0200 Subject: [PATCH 322/426] build: run clippy for powerset of features (#4077) This will catch compiler & clippy warnings in all feature combinations. 
We should probably use cargo hack for build and test as well, but, that's quite expensive and would add to overall CI wait times. obsoletes https://github.com/neondatabase/neon/pull/4073 refs https://github.com/neondatabase/neon/pull/4070 --- .github/workflows/build_and_test.yml | 17 +++++++++++++++-- .neon_clippy_args | 4 ++++ run_clippy.sh | 15 ++++++++++----- 3 files changed, 29 insertions(+), 7 deletions(-) create mode 100644 .neon_clippy_args diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 15a6a611b1..e5ba7aa3eb 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -111,8 +111,21 @@ jobs: - name: Get postgres headers run: make postgres-headers -j$(nproc) - - name: Run cargo clippy - run: ./run_clippy.sh + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. + # TODO: use cargo hack for build and test as well, but, that's quite expensive. 
+ # NB: keep clippy args in sync with ./run_clippy.sh + - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS + - name: Run cargo clippy (release) + run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting diff --git a/.neon_clippy_args b/.neon_clippy_args new file mode 100644 index 0000000000..25e09c61a6 --- /dev/null +++ b/.neon_clippy_args @@ -0,0 +1,4 @@ +# * `-A unknown_lints` – do not warn about unknown lint suppressions +# that people with newer toolchains might use +# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) +export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings" diff --git a/run_clippy.sh b/run_clippy.sh index 9adfddedc2..ae2a17ec0c 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -1,4 +1,5 @@ -#!/bin/bash +#!/usr/bin/env bash +set -euo pipefail # If you save this in your path under the name "cargo-zclippy" (or whatever # name you like), then you can run it as "cargo zclippy" from the shell prompt. @@ -8,7 +9,11 @@ # warnings and errors right in the editor. # In vscode, this setting is Rust-analyzer>Check On Save:Command -# * `-A unknown_lints` – do not warn about unknown lint suppressions -# that people with newer toolchains might use -# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings +# NB: the CI runs the full feature powerset, so, it catches slightly more errors +# at the expense of longer runtime. 
This script is used by developers, so, don't +# do that here. + +thisscript="${BASH_SOURCE[0]}" +thisscript_dir="$(dirname "$thisscript")" +CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" +exec cargo clippy --all-features $CLIPPY_COMMON_ARGS From e6ec2400fc0a8c2975b96f23c5b391064362e0da Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 27 Apr 2023 15:26:44 +0200 Subject: [PATCH 323/426] Enable hot standby PostgreSQL replicas. Notes: - This still needs UI support from the Console - I've not tuned any GUCs for PostgreSQL to make this work better - Safekeeper has gotten a tweak in which WAL is sent and how: It now sends zero-ed WAL data from the start of the timeline's first segment up to the first byte of the timeline to be compatible with normal PostgreSQL WAL streaming. - This includes the commits of #3714 Fixes one part of https://github.com/neondatabase/neon/issues/769 Co-authored-by: Anastasia Lubennikova --- compute_tools/src/compute.rs | 62 +++++- compute_tools/src/pg_helpers.rs | 7 + compute_tools/src/spec.rs | 16 ++ control_plane/src/bin/neon_local.rs | 79 ++++++- control_plane/src/endpoint.rs | 182 ++++++++++------ control_plane/src/postgresql_conf.rs | 2 +- .../var/db/postgres/specs/spec.json | 5 - libs/postgres_ffi/src/lib.rs | 7 +- libs/postgres_ffi/src/pg_constants.rs | 1 + libs/postgres_ffi/src/xlog_utils.rs | 57 ++++- libs/utils/src/lsn.rs | 19 ++ pageserver/src/basebackup.rs | 10 +- pgxn/neon/file_cache.c | 69 ++++++- pgxn/neon/libpagestore.c | 6 + pgxn/neon/neon.c | 1 + pgxn/neon/neon.h | 8 + pgxn/neon/pagestore_client.h | 1 + pgxn/neon/pagestore_smgr.c | 194 +++++++++++++++++- pgxn/neon/walproposer.c | 20 +- safekeeper/src/handler.rs | 9 +- safekeeper/src/wal_storage.rs | 86 +++++++- test_runner/fixtures/neon_fixtures.py | 43 ++++ test_runner/regress/test_compute_ctl.py | 5 - test_runner/regress/test_hot_standby.py | 79 +++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 26 files changed, 851 insertions(+), 
121 deletions(-) create mode 100644 test_runner/regress/test_hot_standby.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 507dac9c0d..b6bc234beb 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -249,18 +249,63 @@ impl ComputeNode { /// safekeepers sync, basebackup, etc. #[instrument(skip(self, compute_state))] pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { + #[derive(Clone)] + enum Replication { + Primary, + Static { lsn: Lsn }, + HotStandby, + } + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); + let hot_replica = if let Some(option) = spec.cluster.settings.find_ref("hot_standby") { + if let Some(value) = &option.value { + anyhow::ensure!(option.vartype == "bool"); + matches!(value.as_str(), "on" | "yes" | "true") + } else { + false + } + } else { + false + }; + + let replication = if hot_replica { + Replication::HotStandby + } else if let Some(lsn) = spec.cluster.settings.find("recovery_target_lsn") { + Replication::Static { + lsn: Lsn::from_str(&lsn)?, + } + } else { + Replication::Primary + }; + // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?; - info!("starting safekeepers syncing"); - let lsn = self - .sync_safekeepers(pspec.storage_auth_token.clone()) - .with_context(|| "failed to sync safekeepers")?; - info!("safekeepers synced at LSN {}", lsn); + // Syncing safekeepers is only safe with primary nodes: if a primary + // is already connected it will be kicked out, so a secondary (standby) + // cannot sync safekeepers. 
+ let lsn = match &replication { + Replication::Primary => { + info!("starting safekeepers syncing"); + let lsn = self + .sync_safekeepers(pspec.storage_auth_token.clone()) + .with_context(|| "failed to sync safekeepers")?; + info!("safekeepers synced at LSN {}", lsn); + lsn + } + Replication::Static { lsn } => { + info!("Starting read-only node at static LSN {}", lsn); + *lsn + } + Replication::HotStandby => { + info!("Initializing standby from latest Pageserver LSN"); + Lsn(0) + } + }; info!( "getting basebackup@{} from pageserver {}", @@ -276,6 +321,13 @@ impl ComputeNode { // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path)?; + match &replication { + Replication::Primary | Replication::Static { .. } => {} + Replication::HotStandby => { + add_standby_signal(pgdata_path)?; + } + } + Ok(()) } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index bb787d0506..40dbea6907 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -94,6 +94,7 @@ impl PgOptionsSerialize for GenericOptions { pub trait GenericOptionsSearch { fn find(&self, name: &str) -> Option; + fn find_ref(&self, name: &str) -> Option<&GenericOption>; } impl GenericOptionsSearch for GenericOptions { @@ -103,6 +104,12 @@ impl GenericOptionsSearch for GenericOptions { let op = ops.iter().find(|s| s.name == name)?; op.value.clone() } + + /// Lookup option by name, returning ref + fn find_ref(&self, name: &str) -> Option<&GenericOption> { + let ops = self.as_ref()?; + ops.iter().find(|s| s.name == name) + } } pub trait RoleExt { diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 28e0ef41b7..bf3c407202 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::path::Path; use std::str::FromStr; @@ -145,6 +146,21 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { Ok(()) } +/// Create a standby.signal file +pub fn 
add_standby_signal(pgdata_path: &Path) -> Result<()> { + // XXX: consider making it a part of spec.json + info!("adding standby.signal"); + let signalfile = pgdata_path.join("standby.signal"); + + if !signalfile.exists() { + info!("created standby.signal"); + File::create(signalfile)?; + } else { + info!("reused pre-existing standby.signal"); + } + Ok(()) +} + /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. #[instrument(skip_all)] diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 665cad8783..09278e1726 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use control_plane::endpoint::ComputeControlPlane; +use control_plane::endpoint::Replication; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; @@ -474,7 +475,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; println!("Creating endpoint for imported timeline ..."); - cplane.new_endpoint(tenant_id, name, timeline_id, None, None, pg_version)?; + cplane.new_endpoint( + tenant_id, + name, + timeline_id, + None, + pg_version, + Replication::Primary, + )?; println!("Done"); } Some(("branch", branch_match)) => { @@ -560,20 +568,20 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .iter() .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id) { - let lsn_str = match endpoint.lsn { - None => { - // -> primary endpoint + let lsn_str = match endpoint.replication { + Replication::Static(lsn) => { + // -> read-only endpoint + // Use the node's LSN. 
+ lsn.to_string() + } + _ => { + // -> primary endpoint or hot replica // Use the LSN at the end of the timeline. timeline_infos .get(&endpoint.timeline_id) .map(|bi| bi.last_record_lsn.to_string()) .unwrap_or_else(|| "?".to_string()) } - Some(lsn) => { - // -> read-only endpoint - // Use the endpoint's LSN. - lsn.to_string() - } }; let branch_name = timeline_name_mappings @@ -619,7 +627,26 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .context("Failed to parse postgres version from the argument string")?; - cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, lsn, port, pg_version)?; + let hot_standby = sub_args + .get_one::("hot-standby") + .copied() + .unwrap_or(false); + + let replication = match (lsn, hot_standby) { + (Some(lsn), false) => Replication::Static(lsn), + (None, true) => Replication::Replica, + (None, false) => Replication::Primary, + (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), + }; + + cplane.new_endpoint( + tenant_id, + &endpoint_id, + timeline_id, + port, + pg_version, + replication, + )?; } "start" => { let port: Option = sub_args.get_one::("port").copied(); @@ -637,7 +664,21 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( None }; + let hot_standby = sub_args + .get_one::("hot-standby") + .copied() + .unwrap_or(false); + if let Some(endpoint) = endpoint { + match (&endpoint.replication, hot_standby) { + (Replication::Static(_), true) => { + bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") + } + (Replication::Primary, true) => { + bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") + } + _ => {} + } println!("Starting existing endpoint {endpoint_id}..."); endpoint.start(&auth_token)?; } else { @@ -659,6 +700,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .get_one::("pg-version") .copied() 
.context("Failed to `pg-version` from the argument string")?; + + let replication = match (lsn, hot_standby) { + (Some(lsn), false) => Replication::Static(lsn), + (None, true) => Replication::Replica, + (None, false) => Replication::Primary, + (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), + }; + // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X @@ -670,9 +719,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( tenant_id, endpoint_id, timeline_id, - lsn, port, pg_version, + replication, )?; ep.start(&auth_token)?; } @@ -928,6 +977,12 @@ fn cli() -> Command { .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.") .required(false); + let hot_standby_arg = Arg::new("hot-standby") + .value_parser(value_parser!(bool)) + .long("hot-standby") + .help("If set, the node will be a hot replica on the specified timeline") + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1052,6 +1107,7 @@ fn cli() -> Command { .long("config-only") .required(false)) .arg(pg_version_arg.clone()) + .arg(hot_standby_arg.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1062,6 +1118,7 @@ fn cli() -> Command { .arg(lsn_arg) .arg(port_arg) .arg(pg_version_arg) + .arg(hot_standby_arg) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 9e85138e68..7d3485518f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -68,18 +68,19 @@ impl ComputeControlPlane { tenant_id: TenantId, name: &str, timeline_id: TimelineId, - lsn: Option, port: Option, pg_version: u32, + replication: Replication, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); + let ep = Arc::new(Endpoint { name: 
name.to_owned(), address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), timeline_id, - lsn, + replication, tenant_id, pg_version, }); @@ -95,6 +96,18 @@ impl ComputeControlPlane { /////////////////////////////////////////////////////////////////////////////// +#[derive(Debug, Clone, Eq, PartialEq)] +pub enum Replication { + // Regular read-write node + Primary, + // if recovery_target_lsn is provided, and we want to pin the node to a specific LSN + Static(Lsn), + // Hot standby; read-only replica. + // Future versions may want to distinguish between replicas with hot standby + // feedback and other kinds of replication configurations. + Replica, +} + #[derive(Debug)] pub struct Endpoint { /// used as the directory name @@ -102,7 +115,7 @@ pub struct Endpoint { pub tenant_id: TenantId, pub timeline_id: TimelineId, // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary. - pub lsn: Option, + pub replication: Replication, // port and address of the Postgres server pub address: SocketAddr, @@ -153,9 +166,17 @@ impl Endpoint { fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); let pg_version = u32::from_str(&pg_version_str)?; - // parse recovery_target_lsn, if any - let recovery_target_lsn: Option = - conf.parse_field_optional("recovery_target_lsn", &context)?; + // parse recovery_target_lsn and primary_conninfo into Recovery Target, if any + let replication = if let Some(lsn_str) = conf.get("recovery_target_lsn") { + Replication::Static(Lsn::from_str(lsn_str)?) 
+ } else if let Some(slot_name) = conf.get("primary_slot_name") { + let slot_name = slot_name.to_string(); + let prefix = format!("repl_{}_", timeline_id); + assert!(slot_name.starts_with(&prefix)); + Replication::Replica + } else { + Replication::Primary + }; // ok now Ok(Endpoint { @@ -164,7 +185,7 @@ impl Endpoint { env: env.clone(), pageserver: Arc::clone(pageserver), timeline_id, - lsn: recovery_target_lsn, + replication, tenant_id, pg_version, }) @@ -299,50 +320,83 @@ impl Endpoint { conf.append("neon.pageserver_connstring", &pageserver_connstr); conf.append("neon.tenant_id", &self.tenant_id.to_string()); conf.append("neon.timeline_id", &self.timeline_id.to_string()); - if let Some(lsn) = self.lsn { - conf.append("recovery_target_lsn", &lsn.to_string()); - } conf.append_line(""); - // Configure backpressure - // - Replication write lag depends on how fast the walreceiver can process incoming WAL. - // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, - // so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB. - // Actually latency should be much smaller (better if < 1sec). But we assume that recently - // updates pages are not requested from pageserver. - // - Replication flush lag depends on speed of persisting data by checkpointer (creation of - // delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to - // remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long - // recovery time (in case of pageserver crash) and disk space overflow at safekeepers. - // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread. - // To be able to restore database in case of pageserver node crash, safekeeper should not - // remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers - // (if they are not able to upload WAL to S3). 
- conf.append("max_replication_write_lag", "15MB"); - conf.append("max_replication_flush_lag", "10GB"); + // Replication-related configurations, such as WAL sending + match &self.replication { + Replication::Primary => { + // Configure backpressure + // - Replication write lag depends on how fast the walreceiver can process incoming WAL. + // This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec, + // so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB. + // Actually latency should be much smaller (better if < 1sec). But we assume that recently + // updates pages are not requested from pageserver. + // - Replication flush lag depends on speed of persisting data by checkpointer (creation of + // delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to + // remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long + // recovery time (in case of pageserver crash) and disk space overflow at safekeepers. + // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread. + // To be able to restore database in case of pageserver node crash, safekeeper should not + // remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers + // (if they are not able to upload WAL to S3). 
+ conf.append("max_replication_write_lag", "15MB"); + conf.append("max_replication_flush_lag", "10GB"); - if !self.env.safekeepers.is_empty() { - // Configure Postgres to connect to the safekeepers - conf.append("synchronous_standby_names", "walproposer"); + if !self.env.safekeepers.is_empty() { + // Configure Postgres to connect to the safekeepers + conf.append("synchronous_standby_names", "walproposer"); - let safekeepers = self - .env - .safekeepers - .iter() - .map(|sk| format!("localhost:{}", sk.pg_port)) - .collect::>() - .join(","); - conf.append("neon.safekeepers", &safekeepers); - } else { - // We only use setup without safekeepers for tests, - // and don't care about data durability on pageserver, - // so set more relaxed synchronous_commit. - conf.append("synchronous_commit", "remote_write"); + let safekeepers = self + .env + .safekeepers + .iter() + .map(|sk| format!("localhost:{}", sk.pg_port)) + .collect::>() + .join(","); + conf.append("neon.safekeepers", &safekeepers); + } else { + // We only use setup without safekeepers for tests, + // and don't care about data durability on pageserver, + // so set more relaxed synchronous_commit. + conf.append("synchronous_commit", "remote_write"); - // Configure the node to stream WAL directly to the pageserver - // This isn't really a supported configuration, but can be useful for - // testing. - conf.append("synchronous_standby_names", "pageserver"); + // Configure the node to stream WAL directly to the pageserver + // This isn't really a supported configuration, but can be useful for + // testing. + conf.append("synchronous_standby_names", "pageserver"); + } + } + Replication::Static(lsn) => { + conf.append("recovery_target_lsn", &lsn.to_string()); + } + Replication::Replica => { + assert!(!self.env.safekeepers.is_empty()); + + // TODO: use future host field from safekeeper spec + // Pass the list of safekeepers to the replica so that it can connect to any of them, + // whichever is available. 
+ let sk_ports = self + .env + .safekeepers + .iter() + .map(|x| x.pg_port.to_string()) + .collect::>() + .join(","); + let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(","); + + let connstr = format!( + "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true", + sk_hosts, + sk_ports, + &self.timeline_id.to_string(), + &self.tenant_id.to_string(), + ); + + let slot_name = format!("repl_{}_", self.timeline_id); + conf.append("primary_conninfo", connstr.as_str()); + conf.append("primary_slot_name", slot_name.as_str()); + conf.append("hot_standby", "on"); + } } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; @@ -355,21 +409,27 @@ impl Endpoint { } fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = if let Some(lsn) = self.lsn { - Some(lsn) - } else if !self.env.safekeepers.is_empty() { - // LSN 0 means that it is bootstrap and we need to download just - // latest data from the pageserver. That is a bit clumsy but whole bootstrap - // procedure evolves quite actively right now, so let's think about it again - // when things would be more stable (TODO). - let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; - if lsn == Lsn(0) { - None - } else { - Some(lsn) + let backup_lsn = match &self.replication { + Replication::Primary => { + if !self.env.safekeepers.is_empty() { + // LSN 0 means that it is bootstrap and we need to download just + // latest data from the pageserver. That is a bit clumsy but whole bootstrap + // procedure evolves quite actively right now, so let's think about it again + // when things would be more stable (TODO). 
+ let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; + if lsn == Lsn(0) { + None + } else { + Some(lsn) + } + } else { + None + } + } + Replication::Static(lsn) => Some(*lsn), + Replication::Replica => { + None // Take the latest snapshot available to start with } - } else { - None }; self.do_basebackup(backup_lsn)?; @@ -466,7 +526,7 @@ impl Endpoint { // 3. Load basebackup self.load_basebackup(auth_token)?; - if self.lsn.is_some() { + if self.replication != Replication::Primary { File::create(self.pgdata().join("standby.signal"))?; } diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index 34dc769e78..638575eb82 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -13,7 +13,7 @@ use std::io::BufRead; use std::str::FromStr; /// In-memory representation of a postgresql.conf file -#[derive(Default)] +#[derive(Default, Debug)] pub struct PostgresConf { lines: Vec, hash: HashMap, diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index 10ae0b0ecf..565e5e368e 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -28,11 +28,6 @@ "value": "replica", "vartype": "enum" }, - { - "name": "hot_standby", - "value": "on", - "vartype": "bool" - }, { "name": "wal_log_hints", "value": "on", diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 492ec9748a..b8eb469cb0 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -95,10 +95,13 @@ pub fn generate_wal_segment( segno: u64, system_id: u64, pg_version: u32, + lsn: Lsn, ) -> Result { + assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE)); + match pg_version { - 14 => v14::xlog_utils::generate_wal_segment(segno, system_id), - 15 => v15::xlog_utils::generate_wal_segment(segno, system_id), + 14 => 
v14::xlog_utils::generate_wal_segment(segno, system_id, lsn), + 15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn), _ => Err(SerializeError::BadInput), } } diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 09678353af..6bc89ed37e 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -195,6 +195,7 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_LONG_HEADER: u16 = 0x0002; /* From fsm_internals.h */ diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 272c4d6dcc..8ed00a9e13 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -270,6 +270,11 @@ impl XLogPageHeaderData { use utils::bin_ser::LeSer; XLogPageHeaderData::des_from(&mut buf.reader()) } + + pub fn encode(&self) -> Result { + use utils::bin_ser::LeSer; + self.ser().map(|b| b.into()) + } } impl XLogLongPageHeaderData { @@ -328,22 +333,32 @@ impl CheckPoint { } } -// -// Generate new, empty WAL segment. -// We need this segment to start compute node. -// -pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { +/// Generate new, empty WAL segment, with correct block headers at the first +/// page of the segment and the page that contains the given LSN. +/// We need this segment to start compute node. 
+pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result { let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE); let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); + + let page_off = lsn.block_offset(); + let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE); + + let first_page_only = seg_off < XLOG_BLCKSZ; + let (shdr_rem_len, infoflags) = if first_page_only { + (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD) + } else { + (0, 0) + }; + let hdr = XLogLongPageHeaderData { std: { XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: pg_constants::XLP_LONG_HEADER, + xlp_info: pg_constants::XLP_LONG_HEADER | infoflags, xlp_tli: PG_TLI, xlp_pageaddr: pageaddr, - xlp_rem_len: 0, + xlp_rem_len: shdr_rem_len as u32, ..Default::default() // Put 0 in padding fields. } }, @@ -357,9 +372,37 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result= pg_constants::SIZE_OF_PAGE_HEADER as u64 { + pg_constants::XLP_FIRST_IS_CONTRECORD + } else { + 0 + }, + xlp_tli: PG_TLI, + xlp_pageaddr: lsn.page_lsn().0, + xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { + page_off as u32 + } else { + 0u32 + }, + ..Default::default() // Put 0 in padding fields. + }; + let hdr_bytes = header.encode()?; + + debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len()); + debug_assert_ne!(block_offset, 0); + + seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]); + } + Ok(seg_buf.freeze()) } + #[repr(C)] #[derive(Serialize)] struct XlLogicalMessage { diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index acf5ea28d7..0493d43088 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -62,29 +62,48 @@ impl Lsn { } /// Compute the offset into a segment + #[inline] pub fn segment_offset(self, seg_sz: usize) -> usize { (self.0 % seg_sz as u64) as usize } /// Compute LSN of the segment start. 
+ #[inline] pub fn segment_lsn(self, seg_sz: usize) -> Lsn { Lsn(self.0 - (self.0 % seg_sz as u64)) } /// Compute the segment number + #[inline] pub fn segment_number(self, seg_sz: usize) -> u64 { self.0 / seg_sz as u64 } /// Compute the offset into a block + #[inline] pub fn block_offset(self) -> u64 { const BLCKSZ: u64 = XLOG_BLCKSZ as u64; self.0 % BLCKSZ } + /// Compute the Lsn of the start of the block (page) that contains this + /// Lsn + #[inline] + pub fn page_lsn(self) -> Lsn { + Lsn(self.0 - self.block_offset()) + } + + /// Compute the offset within the segment of the start of the block + /// (page) that contains this Lsn + #[inline] + pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 { + (self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0 + } + /// Compute the bytes remaining in this block /// /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`. + #[inline] pub fn remaining_in_block(self) -> u64 { const BLCKSZ: u64 = XLOG_BLCKSZ as u64; BLCKSZ - (self.0 % BLCKSZ) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 41fa0a67bb..c666fc785c 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -463,9 +463,13 @@ where let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; - let wal_seg = - postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) - .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; + let wal_seg = postgres_ffi::generate_wal_segment( + segno, + system_identifier, + self.timeline.pg_version, + self.lsn, + ) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..]).await?; Ok(()) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 8dff259f02..cc46fb5a25 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -370,6 +370,74 @@ 
lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) return found; } +/* + * Evict a page (if present) from the local file cache + */ +void +lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +{ + BufferTag tag; + FileCacheEntry* entry; + ssize_t rc; + bool found; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return; + + INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1))); + + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found); + + if (!found) + { + /* nothing to do */ + LWLockRelease(lfc_lock); + return; + } + + /* remove the page from the cache */ + entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); + + /* + * If the chunk has no live entries, we can position the chunk to be + * recycled first. + */ + if (entry->bitmap[chunk_offs >> 5] == 0) + { + bool has_remaining_pages; + + for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) { + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } + } + + /* + * Put the entry at the position that is first to be reclaimed when + * we have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->lru_node); + dlist_push_head(&lfc_ctl->lru, &entry->lru_node); + } + } + + /* + * Done: apart from empty chunks, we don't move chunks in the LRU when + * they're empty because eviction isn't usage. + */ + + LWLockRelease(lfc_lock); +} + /* * Try to read page from local cache. * Returns true if page is found in local cache. @@ -528,7 +596,6 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); } - /* * Record structure holding the to be exposed cache data. 
*/ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index c44e8fcda5..21330c018f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -17,6 +17,8 @@ #include "pagestore_client.h" #include "fmgr.h" #include "access/xlog.h" +#include "access/xlogutils.h" +#include "storage/buf_internals.h" #include "libpq-fe.h" #include "libpq/pqformat.h" @@ -57,6 +59,8 @@ int n_unflushed_requests = 0; int flush_every_n_requests = 8; int readahead_buffer_size = 128; +bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; + static void pageserver_flush(void); static bool @@ -467,6 +471,8 @@ pg_init_libpagestore(void) smgr_hook = smgr_neon; smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; + old_redo_read_buffer_filter = redo_read_buffer_filter; + redo_read_buffer_filter = neon_redo_read_buffer_filter; } lfc_init(); } diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 5c98902554..217c1974a0 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -24,6 +24,7 @@ #include "neon.h" #include "walproposer.h" +#include "pagestore_client.h" PG_MODULE_MAGIC; void _PG_init(void); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 3eac8f4570..60d321a945 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -11,6 +11,7 @@ #ifndef NEON_H #define NEON_H +#include "access/xlogreader.h" /* GUCs */ extern char *neon_auth_token; @@ -20,4 +21,11 @@ extern char *neon_tenant; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. 
+ */ +extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); +extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + #endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index a1f05ac685..22f5cdb73a 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -207,6 +207,7 @@ extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 5b30641856..528d4eb051 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -189,6 +189,7 @@ typedef struct PrfHashEntry { #define SH_DEFINE #define SH_DECLARE #include "lib/simplehash.h" +#include "neon.h" /* * PrefetchState maintains the state of (prefetch) getPage@LSN requests. @@ -1209,6 +1210,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch if (ShutdownRequestPending) return; + /* Don't log any pages if we're not allowed to do so. */ + if (!XLogInsertAllowed()) + return; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM @@ -1375,8 +1379,18 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN if (RecoveryInProgress()) { + /* + * We don't know if WAL has been generated but not yet replayed, so + * we're conservative in our estimates about latest pages. + */ *latest = false; - lsn = GetXLogReplayRecPtr(NULL); + + /* + * Get the last written LSN of this page. 
+ */ + lsn = GetLastWrittenLSN(rnode, forknum, blkno); + lsn = nm_adjust_lsn(lsn); + elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); } @@ -1559,6 +1573,15 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) /* * Newly created relation is empty, remember that in the relsize cache. * + * Note that in REDO, this is called to make sure the relation fork exists, + * but it does not truncate the relation. So, we can only update the + * relsize if it didn't exist before. + * + * Also, in redo, we must make sure to update the cached size of the + * relation, as that is the primary source of truth for REDO's + * file length considerations, and as file extension isn't (perfectly) + * logged, we need to take care of that before we hit file size checks. + * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created * relation. Currently, we don't call SetLastWrittenLSN() when a new @@ -1566,7 +1589,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record hass been received by the page server. */ - set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + if (isRedo) + { + update_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + get_cached_relsize(reln->smgr_rnode.node, forkNum, + &reln->smgr_cached_nblocks[forkNum]); + } + else + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1831,6 +1861,26 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, .blockNum = blkno, }; + /* + * The redo process does not lock pages that it needs to replay but are + * not in the shared buffers, so a concurrent process may request the + * page after redo has decided it won't redo that page and updated the + * LwLSN for that page. 
+ * If we're in hot standby we need to take care that we don't return + * until after REDO has finished replaying up to that LwLSN, as the page + * should have been locked up to that point. + * + * See also the description on neon_redo_read_buffer_filter below. + * + * NOTE: It is possible that the WAL redo process will still do IO due to + * concurrent failed read IOs. Those IOs should never have a request_lsn + * that is as large as the WAL record we're currently replaying, if it + * weren't for the behaviour of the LwLsn cache that uses the highest + * value of the LwLsn cache when the entry is not found. + */ + if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) + XLogWaitForReplayOf(request_lsn); + /* * Try to find prefetched page in the list of received pages. */ @@ -2584,3 +2634,143 @@ smgr_init_neon(void) smgr_init_standard(); neon_init(); } + + +/* + * Return whether we can skip the redo for this block. + * + * The conditions for skipping the IO are: + * + * - The block is not in the shared buffers, and + * - The block is not in the local file cache + * + * ... because any subsequent read of the page requires us to read + * the new version of the page from the PageServer. We do not + * check the local file cache; we instead evict the page from LFC: it + * is cheaper than going through the FS calls to read the page, and + * limits the number of lock operations used in the REDO process. + * + * We have one exception to the rules for skipping IO: We always apply + * changes to shared catalogs' pages. Although this is mostly out of caution, + * catalog updates usually result in backends rebuilding their catalog snapshot, + * which means it's quite likely the modified page is going to be used soon. + * + * It is important to note that skipping WAL redo for a page also means + * the page isn't locked by the redo process, as there is no Buffer + * being returned, nor is there a buffer descriptor to lock. 
+ * This means that any IO that wants to read this block needs to wait + * for the WAL REDO process to finish processing the WAL record before + * it allows the system to start reading the block, as releasing the + * block early could lead to phantom reads. + * + * For example, REDO for a WAL record that modifies 3 blocks could skip + * the first block, wait for a lock on the second, and then modify the + * third block. Without skipping, all blocks would be locked and phantom + * reads would not occur, but with skipping, a concurrent process could + * read block 1 with post-REDO contents and read block 3 with pre-REDO + * contents, where with REDO locking it would wait on block 1 and see + * block 3 with post-REDO contents only. + */ +bool +neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) +{ + XLogRecPtr end_recptr = record->EndRecPtr; + XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + BufferTag tag; + uint32 hash; + LWLock *partitionLock; + Buffer buffer; + bool no_redo_needed; + BlockNumber relsize; + + if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) + return true; + +#if PG_VERSION_NUM < 150000 + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + elog(PANIC, "failed to locate backup block with ID %d", block_id); +#else + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno); +#endif + + /* + * Out of an abundance of caution, we always run redo on shared catalogs, + * regardless of whether the block is stored in shared buffers. + * See also this function's top comment. + */ + if (!OidIsValid(rnode.dbNode)) + return false; + + INIT_BUFFERTAG(tag, rnode, forknum, blkno); + hash = BufTableHashCode(&tag); + partitionLock = BufMappingPartitionLock(hash); + + /* + * Lock the partition of shared_buffers so that it can't be updated + * concurrently. 
+ */ + LWLockAcquire(partitionLock, LW_SHARED); + + /* Try to find the relevant buffer */ + buffer = BufTableLookup(&tag, hash); + + no_redo_needed = buffer < 0; + + /* we don't have the buffer in memory, update lwLsn past this record */ + if (no_redo_needed) + { + SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); + lfc_evict(rnode, forknum, blkno); + } + else + { + SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno); + } + + LWLockRelease(partitionLock); + + /* Extend the relation if we know its size */ + if (get_cached_relsize(rnode, forknum, &relsize)) + { + if (relsize < blkno + 1) + update_cached_relsize(rnode, forknum, blkno + 1); + } + else + { + /* + * Size was not cached. We populate the cache now, with the size of the + * relation measured after this WAL record is applied. + * + * This length is later reused when we open the smgr to read the block, + * which is fine and expected. + */ + + NeonResponse *response; + NeonNblocksResponse *nbresponse; + NeonNblocksRequest request = { + .req = (NeonRequest) { + .lsn = end_recptr, + .latest = false, + .tag = T_NeonNblocksRequest, + }, + .rnode = rnode, + .forknum = forknum, + }; + + response = page_server_request(&request); + + Assert(response->tag == T_NeonNblocksResponse); + nbresponse = (NeonNblocksResponse *) response; + + Assert(nbresponse->n_blocks > blkno); + + set_cached_relsize(rnode, forknum, nbresponse->n_blocks); + + elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks); + } + + return no_redo_needed; +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 45037a8c01..a99be40955 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1964,18 +1964,26 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) { if (safekeeper[i].appendResponse.hs.ts != 0) { - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + HotStandbyFeedback *skhs = &safekeeper[i].appendResponse.hs; + if (FullTransactionIdIsNormal(skhs->xmin) + && 
FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) { - hs->xmin = safekeeper[i].appendResponse.hs.xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; + hs->xmin = skhs->xmin; + hs->ts = skhs->ts; } - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + if (FullTransactionIdIsNormal(skhs->catalog_xmin) + && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin)) { - hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; + hs->catalog_xmin = skhs->catalog_xmin; + hs->ts = skhs->ts; } } } + + if (hs->xmin.value == ~0) + hs->xmin = InvalidFullTransactionId; + if (hs->catalog_xmin.value == ~0) + hs->catalog_xmin = InvalidFullTransactionId; } /* diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index a589fe1869..2c3d1cea0e 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -3,6 +3,7 @@ use anyhow::Context; use std::str; +use std::str::FromStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span, Instrument}; @@ -49,12 +50,14 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { - let re = - Regex::new(r"START_REPLICATION(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)").unwrap(); + let re = Regex::new( + r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? 
([[:xdigit:]]+/[[:xdigit:]]+)", + ) + .unwrap(); let mut caps = re.captures_iter(cmd); let start_lsn = caps .next() - .map(|cap| cap[1].parse::()) + .map(|cap| Lsn::from_str(&cap[1])) .context("parse start LSN from START_REPLICATION command")??; Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 9b385630c2..54e27714ea 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -18,6 +18,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF use postgres_ffi::{XLogSegNo, PG_TLI}; use std::cmp::{max, min}; +use bytes::Bytes; use std::fs::{self, remove_file, File, OpenOptions}; use std::io::Write; use std::path::{Path, PathBuf}; @@ -36,6 +37,7 @@ use postgres_ffi::XLOG_BLCKSZ; use postgres_ffi::waldecoder::WalStreamDecoder; +use pq_proto::SystemId; use tokio::io::{AsyncReadExt, AsyncSeekExt}; pub trait Storage { @@ -478,6 +480,13 @@ pub struct WalReader { // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, + // We will respond with zero-ed bytes before this Lsn as long as + // pos is in the same segment as timeline_start_lsn. + timeline_start_lsn: Lsn, + // integer version number of PostgreSQL, e.g. 
14; 15; 16 + pg_version: u32, + system_id: SystemId, + timeline_start_segment: Option, } impl WalReader { @@ -488,19 +497,27 @@ impl WalReader { start_pos: Lsn, enable_remote_read: bool, ) -> Result { - if start_pos < state.timeline_start_lsn { + if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { + bail!("state uninitialized, no data to read"); + } + + // TODO: Upgrade to bail!() once we know this couldn't possibly happen + if state.timeline_start_lsn == Lsn(0) { + warn!("timeline_start_lsn uninitialized before initializing wal reader"); + } + + if start_pos + < state + .timeline_start_lsn + .segment_lsn(state.server.wal_seg_size as usize) + { bail!( - "Requested streaming from {}, which is before the start of the timeline {}", + "Requested streaming from {}, which is before the start of the timeline {}, and also doesn't start at the first segment of that timeline", start_pos, state.timeline_start_lsn ); } - // TODO: add state.timeline_start_lsn == Lsn(0) check - if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { - bail!("state uninitialized, no data to read"); - } - Ok(Self { workdir, timeline_dir, @@ -509,10 +526,65 @@ impl WalReader { wal_segment: None, enable_remote_read, local_start_lsn: state.local_start_lsn, + timeline_start_lsn: state.timeline_start_lsn, + pg_version: state.server.pg_version / 10000, + system_id: state.server.system_id, + timeline_start_segment: None, }) } pub async fn read(&mut self, buf: &mut [u8]) -> Result { + // If this timeline is new, we may not have a full segment yet, so + // we pad the first bytes of the timeline's first WAL segment with 0s + if self.pos < self.timeline_start_lsn { + debug_assert_eq!( + self.pos.segment_number(self.wal_seg_size), + self.timeline_start_lsn.segment_number(self.wal_seg_size) + ); + + // All bytes after timeline_start_lsn are in WAL, but those before + // are not, so we manually construct an empty segment for the bytes + // not available in this timeline. 
+ if self.timeline_start_segment.is_none() { + let it = postgres_ffi::generate_wal_segment( + self.timeline_start_lsn.segment_number(self.wal_seg_size), + self.system_id, + self.pg_version, + self.timeline_start_lsn, + )?; + self.timeline_start_segment = Some(it); + } + + assert!(self.timeline_start_segment.is_some()); + let segment = self.timeline_start_segment.take().unwrap(); + + let seg_bytes = &segment[..]; + + // How much of the current segment have we already consumed? + let pos_seg_offset = self.pos.segment_offset(self.wal_seg_size); + + // How many bytes may we consume in total? + let tl_start_seg_offset = self.timeline_start_lsn.segment_offset(self.wal_seg_size); + + debug_assert!(seg_bytes.len() > pos_seg_offset); + debug_assert!(seg_bytes.len() > tl_start_seg_offset); + + // Copy as many bytes as possible into the buffer + let len = (tl_start_seg_offset - pos_seg_offset).min(buf.len()); + buf[0..len].copy_from_slice(&seg_bytes[pos_seg_offset..pos_seg_offset + len]); + + self.pos += len as u64; + + // If we're done with the segment, we can release it's memory. + // However, if we're not yet done, store it so that we don't have to + // construct the segment the next time this function is called. 
+ if self.pos < self.timeline_start_lsn { + self.timeline_start_segment = Some(segment); + } + + return Ok(len); + } + let mut wal_segment = match self.wal_segment.take() { Some(reader) => reader, None => self.open_segment().await?, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f209dca560..a46c19d7fd 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1451,6 +1451,7 @@ class NeonCli(AbstractNeonCli): branch_name: str, endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, + hot_standby: bool = False, lsn: Optional[Lsn] = None, port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": @@ -1470,6 +1471,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--port", str(port)]) if endpoint_id is not None: args.append(endpoint_id) + if hot_standby: + args.extend(["--hot-standby", "true"]) res = self.raw_cli(args) res.check_returncode() @@ -2206,6 +2209,7 @@ class Endpoint(PgProtocol): super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env self.running = False + self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id @@ -2217,6 +2221,7 @@ class Endpoint(PgProtocol): self, branch_name: str, endpoint_id: Optional[str] = None, + hot_standby: bool = False, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> "Endpoint": @@ -2231,12 +2236,14 @@ class Endpoint(PgProtocol): if endpoint_id is None: endpoint_id = self.env.generate_endpoint_id() self.endpoint_id = endpoint_id + self.branch_name = branch_name self.env.neon_cli.endpoint_create( branch_name, endpoint_id=self.endpoint_id, tenant_id=self.tenant_id, lsn=lsn, + hot_standby=hot_standby, port=self.port, ) path = Path("endpoints") / self.endpoint_id / "pgdata" @@ -2361,6 +2368,7 @@ 
class Endpoint(PgProtocol): self, branch_name: str, endpoint_id: Optional[str] = None, + hot_standby: bool = False, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, ) -> "Endpoint": @@ -2375,6 +2383,7 @@ class Endpoint(PgProtocol): branch_name=branch_name, endpoint_id=endpoint_id, config_lines=config_lines, + hot_standby=hot_standby, lsn=lsn, ).start() @@ -2408,6 +2417,7 @@ class EndpointFactory: endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, + hot_standby: bool = False, config_lines: Optional[List[str]] = None, ) -> Endpoint: ep = Endpoint( @@ -2421,6 +2431,7 @@ class EndpointFactory: return ep.create_start( branch_name=branch_name, endpoint_id=endpoint_id, + hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, ) @@ -2431,6 +2442,7 @@ class EndpointFactory: endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, + hot_standby: bool = False, config_lines: Optional[List[str]] = None, ) -> Endpoint: ep = Endpoint( @@ -2449,6 +2461,7 @@ class EndpointFactory: branch_name=branch_name, endpoint_id=endpoint_id, lsn=lsn, + hot_standby=hot_standby, config_lines=config_lines, ) @@ -2458,6 +2471,36 @@ class EndpointFactory: return self + def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]): + branch_name = origin.branch_name + assert origin in self.endpoints + assert branch_name is not None + + return self.create( + branch_name=branch_name, + endpoint_id=endpoint_id, + tenant_id=origin.tenant_id, + lsn=None, + hot_standby=True, + config_lines=config_lines, + ) + + def new_replica_start( + self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None + ): + branch_name = origin.branch_name + assert origin in self.endpoints + assert branch_name is not None + + return self.create_start( + branch_name=branch_name, + endpoint_id=endpoint_id, + tenant_id=origin.tenant_id, + lsn=None, + 
hot_standby=True, + config_lines=config_lines, + ) + @dataclass class SafekeeperPort: diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index aa99a01c83..d72ffe078d 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -59,11 +59,6 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): "value": "replica", "vartype": "enum" }, - { - "name": "hot_standby", - "value": "on", - "vartype": "bool" - }, { "name": "neon.safekeepers", "value": """ diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py new file mode 100644 index 0000000000..12e034cea2 --- /dev/null +++ b/test_runner/regress/test_hot_standby.py @@ -0,0 +1,79 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv + + +@pytest.mark.timeout(1800) +def test_hot_standby(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: + primary_lsn = None + cought_up = False + queries = [ + "SHOW neon.timeline_id", + "SHOW neon.tenant_id", + "SELECT relname FROM pg_class WHERE relnamespace = current_schema()::regnamespace::oid", + "SELECT COUNT(*), SUM(i) FROM test", + ] + responses = dict() + + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i") + + # Explicit commit to make sure other connections (and replicas) can + # see the changes of this commit. + p_con.commit() + + with p_con.cursor() as p_cur: + p_cur.execute("SELECT pg_current_wal_insert_lsn()::text") + res = p_cur.fetchone() + assert res is not None + (lsn,) = res + primary_lsn = lsn + + # Explicit commit to make sure other connections (and replicas) can + # see the changes of this commit. 
+ # Note that this may generate more WAL if the transaction has changed + # things, but we don't care about that. + p_con.commit() + + for query in queries: + with p_con.cursor() as p_cur: + p_cur.execute(query) + res = p_cur.fetchone() + assert res is not None + response = res + responses[query] = response + + with secondary.connect() as s_con: + with s_con.cursor() as s_cur: + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + while not cought_up: + with s_con.cursor() as secondary_cursor: + secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()") + res = secondary_cursor.fetchone() + assert res is not None + (secondary_lsn,) = res + # There may be more changes on the primary after we got our LSN + # due to e.g. autovacuum, but that shouldn't impact the content + # of the tables, so we check whether we've replayed up to at + # least after the commit of the `test` table. + cought_up = secondary_lsn >= primary_lsn + + # Explicit commit to flush any transient transaction-level state. 
+ s_con.commit() + + for query in queries: + with s_con.cursor() as secondary_cursor: + secondary_cursor.execute(query) + response = secondary_cursor.fetchone() + assert response is not None + assert response == responses[query] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 3e70693c91..a2daebc6b4 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 3e70693c9178878404d14a61c96b15b74eb02688 +Subproject commit a2daebc6b445dcbcca9c18e1711f47c1db7ffb04 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 4ad87b0f36..aee72b7be9 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 4ad87b0f364a2313600c1d9774ca33df00e606f4 +Subproject commit aee72b7be903e52d9bdc6449aa4c17fb852d8708 From 3be81dd36bcda1288ad25ea2ff5d3acd8b26b24f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 27 Apr 2023 16:07:25 +0200 Subject: [PATCH 324/426] fix `clippy --release` failure introduced in #4030 (#4095) PR `build: run clippy for powerset of features (#4077)` brought us a `clippy --release` pass. It was merged after #4030, which fails under `clippy --release` with ``` error: static `TENANT_ID_EXTRACTOR` is never used --> pageserver/src/tenant/timeline.rs:4270:16 | 4270 | pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< | ^^^^^^^^^^^^^^^^^^^ | = note: `-D dead-code` implied by `-D warnings` error: static `TIMELINE_ID_EXTRACTOR` is never used --> pageserver/src/tenant/timeline.rs:4276:16 | 4276 | pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy< | ^^^^^^^^^^^^^^^^^^^^^ ``` A merge queue would have prevented this. 
--- pageserver/src/tenant/timeline.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 87f03f30b6..5c671ffd63 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,7 +19,6 @@ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantTimelineId; -use utils::tracing_span_assert; use std::cmp::{max, min, Ordering}; use std::collections::BinaryHeap; @@ -4265,8 +4264,15 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } +#[cfg(not(debug_assertions))] +#[inline] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} + +#[cfg(debug_assertions)] #[inline] pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + use utils::tracing_span_assert; + pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< tracing_span_assert::MultiNameExtractor<2>, > = once_cell::sync::Lazy::new(|| { @@ -4279,7 +4285,6 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]) }); - #[cfg(debug_assertions)] match tracing_span_assert::check_fields_present([ &*TENANT_ID_EXTRACTOR, &*TIMELINE_ID_EXTRACTOR, From f5b4697c90cb37cc1386b77b03cc2a013fde1af3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 5 Apr 2023 14:55:55 +0400 Subject: [PATCH 325/426] Log session_id when proxy per client task errors out. 
--- proxy/src/proxy.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 9945e3697f..1169d76160 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -95,9 +95,9 @@ pub async fn task_main( handle_client(config, &cancel_map, session_id, socket).await } - .unwrap_or_else(|e| { + .unwrap_or_else(move |e| { // Acknowledge that the task has finished with an error. - error!("per-client task finished with an error: {e:#}"); + error!(?session_id, "per-client task finished with an error: {e:#}"); }), ); } From d1e86d65dc64635bf0a3cef1aaa26766e683cd4c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 27 Apr 2023 17:27:21 +0300 Subject: [PATCH 326/426] Run rustfmt to fix whitespace. Commit e6ec2400fc introduced some trivial whitespace issues. --- libs/postgres_ffi/src/xlog_utils.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 8ed00a9e13..4d7bb61883 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -346,7 +346,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result Result Result Date: Thu, 27 Apr 2023 18:51:57 +0300 Subject: [PATCH 327/426] refactor: Cleanup page service (#4097) Refactoring part of #4093. Numerous `Send + Sync` bounds were a distraction and were not needed at all. The proper `Bytes` usage and one `"error_message".to_string()` are just drive-by fixes. Not using the `PostgresBackendTCP` allows us to start setting read timeouts (and more). `PostgresBackendTCP` is still used from proxy, so it cannot be removed. 
--- pageserver/src/import_datadir.rs | 8 ++-- pageserver/src/page_service.rs | 64 +++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 39e434a023..936de35eb9 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -114,7 +114,7 @@ async fn import_rel( path: &Path, spcoid: Oid, dboid: Oid, - reader: &mut (impl AsyncRead + Send + Sync + Unpin), + reader: &mut (impl AsyncRead + Unpin), len: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -200,7 +200,7 @@ async fn import_slru( modification: &mut DatadirModification<'_>, slru: SlruKind, path: &Path, - reader: &mut (impl AsyncRead + Send + Sync + Unpin), + reader: &mut (impl AsyncRead + Unpin), len: usize, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -612,8 +612,8 @@ async fn import_file( Ok(None) } -async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result { +async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result { let mut buf: Vec = vec![]; reader.read_to_end(&mut buf).await?; - Ok(Bytes::copy_from_slice(&buf[..])) + Ok(Bytes::from(buf)) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 135f08e846..3610704f2c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -20,7 +20,6 @@ use pageserver_api::models::{ PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, }; -use postgres_backend::PostgresBackendTCP; use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; @@ -32,6 +31,7 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; use tracing::*; use 
utils::id::ConnectionId; @@ -57,7 +57,10 @@ use crate::trace::Tracer; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream> + '_ { +fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ +where + IO: AsyncRead + AsyncWrite + Unpin, +{ async_stream::try_stream! { loop { let msg = tokio::select! { @@ -65,8 +68,8 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream { // We were requested to shut down. - let msg = "pageserver is shutting down".to_string(); - let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)); + let msg = "pageserver is shutting down"; + let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); Err(QueryError::Other(anyhow::anyhow!(msg))) } @@ -125,7 +128,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream anyhow::Result<()> { +async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { use tokio::io::AsyncReadExt; let mut buf = [0u8; 512]; @@ -245,12 +248,14 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let peer_addr = socket.peer_addr().context("get peer address")?; + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); - let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend .run(&mut conn_handler, task_mgr::shutdown_watcher) @@ -332,13 +337,16 @@ impl PageServerHandler { } #[instrument(skip(self, pgb, ctx))] - async fn handle_pagerequests( + async fn handle_pagerequests( &self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, ctx: RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result<()> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); @@ -436,16 +444,19 @@ impl PageServerHandler { #[allow(clippy::too_many_arguments)] #[instrument(skip(self, pgb, ctx))] - async fn handle_import_basebackup( + async fn handle_import_basebackup( &self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, ctx: RequestContext, - ) -> Result<(), QueryError> { + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); @@ -486,15 +497,18 @@ impl PageServerHandler { } #[instrument(skip(self, pgb, ctx))] - async fn handle_import_wal( + async fn handle_import_wal( &self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, ctx: RequestContext, - ) -> Result<(), QueryError> { + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let 
timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; @@ -690,16 +704,19 @@ impl PageServerHandler { #[allow(clippy::too_many_arguments)] #[instrument(skip(self, pgb, ctx))] - async fn handle_basebackup_request( + async fn handle_basebackup_request( &mut self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, ctx: RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result<()> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { let started = std::time::Instant::now(); // check that the timeline exists @@ -770,10 +787,13 @@ impl PageServerHandler { } #[async_trait::async_trait] -impl postgres_backend::Handler for PageServerHandler { +impl postgres_backend::Handler for PageServerHandler +where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, +{ fn check_auth_jwt( &mut self, - _pgb: &mut PostgresBackendTCP, + _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> Result<(), QueryError> { // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT @@ -801,7 +821,7 @@ impl postgres_backend::Handler for PageServerHandler { fn startup( &mut self, - _pgb: &mut PostgresBackendTCP, + _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { Ok(()) @@ -809,7 +829,7 @@ impl postgres_backend::Handler for PageServerHandler { async fn process_query( &mut self, - pgb: &mut PostgresBackendTCP, + pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { let ctx = self.connection_ctx.attached_child(); From c4e1cafb6304f4cc7e2c65b77b6e3b4ef4afb17b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 27 Apr 2023 17:08:00 +0100 Subject: [PATCH 328/426] scripts/flaky_tests.py: handle connection error (#4096) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Increase `connect_timeout` to 30s, which should be enough 
for most of the cases - If the script cannot connect to the DB (or any other `psycopg2.OperationalError` occurs) — do not fail the script, log the error and proceed. Problems with fetching flaky tests shouldn't block the PR --- scripts/flaky_tests.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 829cc814e8..262950b61d 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -42,12 +42,16 @@ def main(args: argparse.Namespace): res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]] res = defaultdict(lambda: defaultdict(dict)) - logging.info("connecting to the database...") - with psycopg2.connect(connstr, connect_timeout=10) as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - logging.info("fetching flaky tests...") - cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) - rows = cur.fetchall() + try: + logging.info("connecting to the database...") + with psycopg2.connect(connstr, connect_timeout=30) as conn: + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + logging.info("fetching flaky tests...") + cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) + rows = cur.fetchall() + except psycopg2.OperationalError as exc: + logging.error("cannot fetch flaky tests from the DB due to an error", exc) + rows = [] for row in rows: logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}") From fe0b6162992b32f874fe9c21d48b0580013a556f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 27 Apr 2023 20:55:35 +0300 Subject: [PATCH 329/426] feat(page_service): read timeouts (#4093) Introduce read timeouts to our `page_service` connections. Without read timeouts, we essentially leak connections. This is a port of #3995. Split the refactorings to the other PR: #4097. Fixes #4028. 
--- Cargo.lock | 1 + pageserver/Cargo.toml | 1 + pageserver/src/page_service.rs | 9 +++++++++ 3 files changed, 11 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 08b24d263c..2f5878dc6e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2681,6 +2681,7 @@ dependencies = [ "tenant_size_model", "thiserror", "tokio", + "tokio-io-timeout", "tokio-postgres", "tokio-tar", "tokio-util", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0bc7eba95e..ea81544cbe 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -52,6 +52,7 @@ sync_wrapper.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3610704f2c..8b0795db3c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -250,6 +250,15 @@ async fn page_service_conn_main( let peer_addr = socket.peer_addr().context("get peer address")?; + // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements: + // - long enough for most valid compute connections + // - less than infinite to stop us from "leaking" connections to long-gone computes + // + // no write timeout is used, because the kernel is assumed to error writes after some time. + let mut socket = tokio_io_timeout::TimeoutReader::new(socket); + socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10))); + let socket = std::pin::pin!(socket); + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. 
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler From b2a3981eaded4a1c277068563b0f69c5d6f6f986 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 12 Apr 2023 13:26:07 +0400 Subject: [PATCH 330/426] Move tracking of walsenders out of Timeline. Refactors walsenders out of timeline.rs into a separate WalSenders with its own lock to make it less convoluted, but otherwise having the same structure. Tracking of in-memory remote_consistent_lsn is also moved there as it is mainly received from pageserver. State of walsender (feedback) is also restructured to be cleaner; now it is either PageserverFeedback or StandbyFeedback(StandbyReply, HotStandbyFeedback), but not both. --- libs/pq_proto/src/lib.rs | 5 +- safekeeper/src/broker.rs | 2 +- safekeeper/src/debug_dump.rs | 4 +- safekeeper/src/http/routes.rs | 4 +- safekeeper/src/metrics.rs | 38 +- safekeeper/src/safekeeper.rs | 22 +- safekeeper/src/send_wal.rs | 472 ++++++++++++++++++++--- safekeeper/src/timeline.rs | 218 +++-------- test_runner/fixtures/neon_fixtures.py | 2 + test_runner/regress/test_wal_acceptor.py | 11 +- 10 files changed, 514 insertions(+), 264 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index ed0239072a..1e7afa9bc0 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -947,9 +947,10 @@ impl<'a> BeMessage<'a> { pub struct PageserverFeedback { /// Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, - /// LSN last received and ingested by the pageserver. + /// LSN last received and ingested by the pageserver. Controls backpressure. pub last_received_lsn: u64, /// LSN up to which data is persisted by the pageserver to its local disc. + /// Controls backpressure. pub disk_consistent_lsn: u64, /// LSN up to which data is persisted by the pageserver on s3; safekeepers /// consider WAL before it can be removed.
@@ -968,7 +969,7 @@ impl PageserverFeedback { last_received_lsn: 0, remote_consistent_lsn: 0, disk_consistent_lsn: 0, - replytime: SystemTime::now(), + replytime: *PG_EPOCH, } } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 92f35bf51f..6a98d8fd84 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -91,7 +91,7 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { // connection to the broker. // note: there are blocking operations below, but it's considered fine for now - tli.record_safekeeper_info(&msg).await? + tli.record_safekeeper_info(msg).await? } } bail!("end of stream"); diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 674cf9f6eb..954fbfc438 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -22,7 +22,7 @@ use crate::safekeeper::SafekeeperMemState; use crate::safekeeper::TermHistory; use crate::SafeKeeperConf; -use crate::timeline::ReplicaState; +use crate::send_wal::WalSenderState; use crate::GlobalTimelines; /// Various filters that influence the resulting JSON output. @@ -87,7 +87,7 @@ pub struct Timeline { pub struct Memory { pub is_cancelled: bool, pub peers_info_len: usize, - pub replicas: Vec>, + pub walsenders: Vec, pub wal_backup_active: bool, pub active: bool, pub num_computes: u32, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index cdec45c148..ef691c5fe6 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -144,7 +144,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result Result<()>) -> Result { /// Metrics for a single timeline. 
pub struct FullTimelineInfo { pub ttid: TenantTimelineId, - pub replicas: Vec, + pub ps_feedback: PageserverFeedback, pub wal_backup_active: bool, pub timeline_is_active: bool, pub num_computes: u32, @@ -242,6 +242,7 @@ pub struct FullTimelineInfo { pub persisted_state: SafeKeeperState, pub flush_lsn: Lsn, + pub remote_consistent_lsn: Lsn, pub wal_storage: WalStorageMetrics, } @@ -514,19 +515,6 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; - let mut most_advanced: Option = None; - for replica in tli.replicas.iter() { - if let Some(replica_feedback) = replica.pageserver_feedback { - if let Some(current) = most_advanced { - if current.last_received_lsn < replica_feedback.last_received_lsn { - most_advanced = Some(replica_feedback); - } - } else { - most_advanced = Some(replica_feedback); - } - } - } - self.commit_lsn .with_label_values(labels) .set(tli.mem_state.commit_lsn.into()); @@ -544,7 +532,7 @@ impl Collector for TimelineCollector { .set(tli.mem_state.peer_horizon_lsn.into()); self.remote_consistent_lsn .with_label_values(labels) - .set(tli.mem_state.remote_consistent_lsn.into()); + .set(tli.remote_consistent_lsn.into()); self.timeline_active .with_label_values(labels) .set(tli.timeline_is_active as u64); @@ -567,15 +555,17 @@ impl Collector for TimelineCollector { .with_label_values(labels) .set(tli.wal_storage.flush_wal_seconds); - if let Some(feedback) = most_advanced { - self.ps_last_received_lsn + self.ps_last_received_lsn + .with_label_values(labels) + .set(tli.ps_feedback.last_received_lsn); + if let Ok(unix_time) = tli + .ps_feedback + .replytime + .duration_since(SystemTime::UNIX_EPOCH) + { + self.feedback_last_time_seconds .with_label_values(labels) - .set(feedback.last_received_lsn); - if let Ok(unix_time) = feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) { - self.feedback_last_time_seconds - .with_label_values(labels) - 
.set(unix_time.as_secs()); - } + .set(unix_time.as_secs()); } if tli.last_removed_segno != 0 { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 10b4842cbd..6864a9713d 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -212,7 +212,6 @@ pub struct SafekeeperMemState { pub commit_lsn: Lsn, pub backup_lsn: Lsn, pub peer_horizon_lsn: Lsn, - pub remote_consistent_lsn: Lsn, #[serde(with = "hex")] pub proposer_uuid: PgUuid, } @@ -540,7 +539,6 @@ where commit_lsn: state.commit_lsn, backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, - remote_consistent_lsn: state.remote_consistent_lsn, proposer_uuid: state.proposer_uuid, }, state, @@ -781,10 +779,6 @@ where // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); - // Initializing remote_consistent_lsn sets that we have nothing to - // stream to pageserver(s) immediately after creation. - self.inmem.remote_consistent_lsn = - max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); self.persist_control_file(state)?; @@ -837,7 +831,6 @@ where state.commit_lsn = self.inmem.commit_lsn; state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; - state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; state.proposer_uuid = self.inmem.proposer_uuid; self.state.persist(&state) } @@ -940,14 +933,12 @@ where self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; self.inmem.backup_lsn = new_backup_lsn; - let new_remote_consistent_lsn = max( - Lsn(sk_info.remote_consistent_lsn), - self.inmem.remote_consistent_lsn, - ); + // value in sk_info should be maximized over our local in memory value. 
+ let new_remote_consistent_lsn = Lsn(sk_info.remote_consistent_lsn); + assert!(self.state.remote_consistent_lsn <= new_remote_consistent_lsn); sync_control_file |= self.state.remote_consistent_lsn + (self.state.server.wal_seg_size as u64) < new_remote_consistent_lsn; - self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn); sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) @@ -955,7 +946,12 @@ where self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; if sync_control_file { - self.persist_control_file(self.state.clone())?; + let mut state = self.state.clone(); + // Note: we do not persist remote_consistent_lsn in other paths of + // persisting cf -- that is not much needed currently. We could do + // that by storing Arc to walsenders in Safekeeper. + state.remote_consistent_lsn = new_remote_consistent_lsn; + self.persist_control_file(state)?; } Ok(()) } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a6ca89efa4..abd213deff 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -1,12 +1,14 @@ //! This module implements the streaming side of replication protocol, starting -//! with the "START_REPLICATION" message. +//! with the "START_REPLICATION" message, and registry of walsenders. 
use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::{ReplicaState, Timeline}; +use crate::timeline::Timeline; +use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; use anyhow::Context as AnyhowContext; use bytes::Bytes; +use parking_lot::Mutex; use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; @@ -14,8 +16,12 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, PageserverFeedback, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; +use utils::http::json::display_serialize; +use utils::id::TenantTimelineId; +use utils::lsn::AtomicLsn; -use std::cmp::min; +use std::cmp::{max, min}; +use std::net::SocketAddr; use std::str; use std::sync::Arc; use std::time::Duration; @@ -40,6 +46,8 @@ pub struct HotStandbyFeedback { pub catalog_xmin: FullTransactionId, } +const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; + impl HotStandbyFeedback { pub fn empty() -> HotStandbyFeedback { HotStandbyFeedback { @@ -51,24 +59,293 @@ impl HotStandbyFeedback { } /// Standby status update -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct StandbyReply { - pub write_lsn: Lsn, // last lsn received by pageserver - pub flush_lsn: Lsn, // pageserver's disk consistent lSN - pub apply_lsn: Lsn, // pageserver's remote consistent lSN - pub reply_ts: TimestampTz, + pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. + pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. + pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. + pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. 
pub reply_requested: bool, } -/// Scope guard to unregister replication connection from timeline -struct ReplicationConnGuard { - replica: usize, // replica internal ID assigned by timeline - timeline: Arc, +impl StandbyReply { + fn empty() -> Self { + StandbyReply { + write_lsn: Lsn::INVALID, + flush_lsn: Lsn::INVALID, + apply_lsn: Lsn::INVALID, + reply_ts: 0, + reply_requested: false, + } + } } -impl Drop for ReplicationConnGuard { +#[derive(Debug, Clone, Copy, Serialize)] +pub struct StandbyFeedback { + reply: StandbyReply, + hs_feedback: HotStandbyFeedback, +} + +/// WalSenders registry. Timeline holds it (wrapped in Arc). +pub struct WalSenders { + /// Lsn maximized over all walsenders *and* peer data, so might be higher + /// than what we receive from replicas. + remote_consistent_lsn: AtomicLsn, + mutex: Mutex, +} + +impl WalSenders { + pub fn new(remote_consistent_lsn: Lsn) -> Arc { + Arc::new(WalSenders { + remote_consistent_lsn: AtomicLsn::from(remote_consistent_lsn), + mutex: Mutex::new(WalSendersShared::new()), + }) + } + + /// Register new walsender. Returned guard provides access to the slot and + /// automatically deregisters in Drop. + fn register( + self: &Arc, + ttid: TenantTimelineId, + addr: SocketAddr, + conn_id: ConnectionId, + appname: Option, + ) -> WalSenderGuard { + let slots = &mut self.mutex.lock().slots; + let walsender_state = WalSenderState { + ttid, + addr, + conn_id, + appname, + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }; + // find empty slot or create new one + let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) { + slots[pos] = Some(walsender_state); + pos + } else { + let pos = slots.len(); + slots.push(Some(walsender_state)); + pos + }; + WalSenderGuard { + id: pos, + walsenders: self.clone(), + } + } + + /// Get state of all walsenders. 
+ pub fn get_all(self: &Arc) -> Vec { + self.mutex.lock().slots.iter().flatten().cloned().collect() + } + + /// Get aggregated pageserver feedback. + pub fn get_ps_feedback(self: &Arc) -> PageserverFeedback { + self.mutex.lock().agg_ps_feedback + } + + /// Get aggregated pageserver and hot standby feedback (we send them to compute). + pub fn get_feedbacks(self: &Arc) -> (PageserverFeedback, HotStandbyFeedback) { + let shared = self.mutex.lock(); + (shared.agg_ps_feedback, shared.agg_hs_feedback) + } + + /// Record new pageserver feedback, update aggregated values. + fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { + let mut shared = self.mutex.lock(); + shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); + shared.update_ps_feedback(); + self.update_remote_consistent_lsn(Lsn(shared.agg_ps_feedback.remote_consistent_lsn)); + } + + /// Record standby reply. + fn record_standby_reply(self: &Arc, id: WalSenderId, reply: &StandbyReply) { + let mut shared = self.mutex.lock(); + let slot = shared.get_slot_mut(id); + match &mut slot.feedback { + ReplicationFeedback::Standby(sf) => sf.reply = *reply, + ReplicationFeedback::Pageserver(_) => { + slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + reply: *reply, + hs_feedback: HotStandbyFeedback::empty(), + }) + } + } + } + + /// Record hot standby feedback, update aggregated value. + fn record_hs_feedback(self: &Arc, id: WalSenderId, feedback: &HotStandbyFeedback) { + let mut shared = self.mutex.lock(); + let slot = shared.get_slot_mut(id); + match &mut slot.feedback { + ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback, + ReplicationFeedback::Pageserver(_) => { + slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: *feedback, + }) + } + } + shared.update_hs_feedback(); + } + + /// Get remote_consistent_lsn reported by the pageserver. Returns None if + /// client is not pageserver. 
+ fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { + let shared = self.mutex.lock(); + let slot = shared.get_slot(id); + match slot.feedback { + ReplicationFeedback::Pageserver(feedback) => Some(Lsn(feedback.remote_consistent_lsn)), + _ => None, + } + } + + /// Get remote_consistent_lsn maximized across all walsenders and peers. + pub fn get_remote_consistent_lsn(self: &Arc) -> Lsn { + self.remote_consistent_lsn.load() + } + + /// Update maximized remote_consistent_lsn, return new (potentially) value. + pub fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) -> Lsn { + self.remote_consistent_lsn + .fetch_max(candidate) + .max(candidate) + } + + /// Unregister walsender. + fn unregister(self: &Arc, id: WalSenderId) { + let mut shared = self.mutex.lock(); + shared.slots[id] = None; + shared.update_hs_feedback(); + } +} + +struct WalSendersShared { + // aggregated over all walsenders value + agg_hs_feedback: HotStandbyFeedback, + // aggregated over all walsenders value + agg_ps_feedback: PageserverFeedback, + slots: Vec>, +} + +impl WalSendersShared { + fn new() -> Self { + WalSendersShared { + agg_hs_feedback: HotStandbyFeedback::empty(), + agg_ps_feedback: PageserverFeedback::empty(), + slots: Vec::new(), + } + } + + /// Get content of provided id slot, it must exist. + fn get_slot(&self, id: WalSenderId) -> &WalSenderState { + self.slots[id].as_ref().expect("walsender doesn't exist") + } + + /// Get mut content of provided id slot, it must exist. + fn get_slot_mut(&mut self, id: WalSenderId) -> &mut WalSenderState { + self.slots[id].as_mut().expect("walsender doesn't exist") + } + + /// Update aggregated hot standy feedback. We just take min of valid xmins + /// and ts. 
+ fn update_hs_feedback(&mut self) { + let mut agg = HotStandbyFeedback::empty(); + for ws_state in self.slots.iter().flatten() { + if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { + let hs_feedback = standby_feedback.hs_feedback; + // doing Option math like op1.iter().chain(op2.iter()).min() + // would be nicer, but we serialize/deserialize this struct + // directly, so leave as is for now + if hs_feedback.xmin != INVALID_FULL_TRANSACTION_ID { + if agg.xmin != INVALID_FULL_TRANSACTION_ID { + agg.xmin = min(agg.xmin, hs_feedback.xmin); + } else { + agg.xmin = hs_feedback.xmin; + } + agg.ts = min(agg.ts, hs_feedback.ts); + } + if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID { + if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID { + agg.catalog_xmin = min(agg.catalog_xmin, hs_feedback.catalog_xmin); + } else { + agg.catalog_xmin = hs_feedback.catalog_xmin; + } + agg.ts = min(agg.ts, hs_feedback.ts); + } + } + } + self.agg_hs_feedback = agg; + } + + /// Update aggregated pageserver feedback. LSNs (last_received, + /// disk_consistent, remote_consistent) and reply timestamp are just + /// maximized; timeline_size if taken from feedback with highest + /// last_received lsn. This is generally reasonable, but we might want to + /// implement other policies once multiple pageservers start to be actively + /// used. 
+ fn update_ps_feedback(&mut self) { + let init = PageserverFeedback::empty(); + let acc = + self.slots + .iter() + .flatten() + .fold(init, |mut acc, ws_state| match ws_state.feedback { + ReplicationFeedback::Pageserver(feedback) => { + if feedback.last_received_lsn > acc.last_received_lsn { + acc.current_timeline_size = feedback.current_timeline_size; + } + acc.last_received_lsn = + max(feedback.last_received_lsn, acc.last_received_lsn); + acc.disk_consistent_lsn = + max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn); + acc.remote_consistent_lsn = + max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn); + acc.replytime = max(feedback.replytime, acc.replytime); + acc + } + ReplicationFeedback::Standby(_) => acc, + }); + self.agg_ps_feedback = acc; + } +} + +// Serialized is used only for pretty printing in json. +#[derive(Debug, Clone, Serialize)] +pub struct WalSenderState { + #[serde(serialize_with = "display_serialize")] + ttid: TenantTimelineId, + addr: SocketAddr, + conn_id: ConnectionId, + // postgres application_name + appname: Option, + feedback: ReplicationFeedback, +} + +// Receiver is either pageserver or regular standby, which have different +// feedbacks. +#[derive(Debug, Clone, Copy, Serialize)] +enum ReplicationFeedback { + Pageserver(PageserverFeedback), + Standby(StandbyFeedback), +} + +// id of the occupied slot in WalSenders to access it (and save in the +// WalSenderGuard). We could give Arc directly to the slot, but there is not +// much sense in that as values aggregation which is performed on each feedback +// receival iterates over all walsenders. +pub type WalSenderId = usize; + +/// Scope guard to access slot in WalSenders registry and unregister from it in +/// Drop. 
+pub struct WalSenderGuard { + id: WalSenderId, + walsenders: Arc, +} + +impl Drop for WalSenderGuard { fn drop(&mut self) { - self.timeline.remove_replica(self.replica); + self.walsenders.unregister(self.id); } } @@ -97,16 +374,13 @@ impl SafekeeperPostgresHandler { let tli = GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?; - let state = ReplicaState::new(); - // This replica_id is used below to check if it's time to stop replication. - let replica_id = tli.add_replica(state); - - // Use a guard object to remove our entry from the timeline, when the background - // thread and us have both finished using it. - let _guard = Arc::new(ReplicationConnGuard { - replica: replica_id, - timeline: tli.clone(), - }); + // Use a guard object to remove our entry from the timeline when we are done. + let ws_guard = Arc::new(tli.get_walsenders().register( + self.ttid, + *pgb.get_peer_addr(), + self.conn_id, + self.appname.clone(), + )); // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will @@ -154,16 +428,11 @@ impl SafekeeperPostgresHandler { end_pos, stop_pos, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), - replica_id, + ws_guard: ws_guard.clone(), wal_reader, send_buf: [0; MAX_SEND_SIZE], }; - let mut reply_reader = ReplyReader { - reader, - tli, - replica_id, - feedback: ReplicaState::new(), - }; + let mut reply_reader = ReplyReader { reader, ws_guard }; let res = tokio::select! { // todo: add read|write .context to these errors @@ -190,7 +459,7 @@ struct WalSender<'a, IO> { // in recovery. 
stop_pos: Option, commit_lsn_watch_rx: Receiver, - replica_id: usize, + ws_guard: Arc, wal_reader: WalReader, // buffer for readling WAL into to send it send_buf: [u8; MAX_SEND_SIZE], @@ -264,14 +533,20 @@ impl WalSender<'_, IO> { return Ok(()); } // Timed out waiting for WAL, check for termination and send KA - if self.tli.should_walsender_stop(self.replica_id) { - // Terminate if there is nothing more to send. - // TODO close the stream properly - return Err(CopyStreamHandlerEnd::ServerInitiated(format!( - "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", - self.appname, self.start_pos, - ))); + if let Some(remote_consistent_lsn) = self + .ws_guard + .walsenders + .get_ws_remote_consistent_lsn(self.ws_guard.id) + { + if self.tli.should_walsender_stop(remote_consistent_lsn) { + // Terminate if there is nothing more to send. + return Err(CopyStreamHandlerEnd::ServerInitiated(format!( + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); + } } + self.pgb .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { sent_ptr: self.end_pos.0, @@ -286,9 +561,7 @@ impl WalSender<'_, IO> { /// A half driving receiving replies. struct ReplyReader { reader: PostgresBackendReader, - tli: Arc, - replica_id: usize, - feedback: ReplicaState, + ws_guard: Arc, } impl ReplyReader { @@ -303,29 +576,32 @@ impl ReplyReader { match msg.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { // Note: deserializing is on m[1..] because we skip the tag byte. 
- self.feedback.hs_feedback = HotStandbyFeedback::des(&msg[1..]) + let hs_feedback = HotStandbyFeedback::des(&msg[1..]) .context("failed to deserialize HotStandbyFeedback")?; - self.tli - .update_replica_state(self.replica_id, self.feedback); + self.ws_guard + .walsenders + .record_hs_feedback(self.ws_guard.id, &hs_feedback); } Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => { - let _reply = + let reply = StandbyReply::des(&msg[1..]).context("failed to deserialize StandbyReply")?; - // This must be a regular postgres replica, - // because pageserver doesn't send this type of messages to safekeeper. - // Currently we just ignore this, tracking progress for them is not supported. + self.ws_guard + .walsenders + .record_standby_reply(self.ws_guard.id, &reply); } Some(NEON_STATUS_UPDATE_TAG_BYTE) => { // pageserver sends this. // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. let buf = Bytes::copy_from_slice(&msg[9..]); - let reply = PageserverFeedback::parse(buf); + let ps_feedback = PageserverFeedback::parse(buf); - trace!("PageserverFeedback is {:?}", reply); - self.feedback.pageserver_feedback = Some(reply); - - self.tli - .update_replica_state(self.replica_id, self.feedback); + trace!("PageserverFeedback is {:?}", ps_feedback); + self.ws_guard + .walsenders + .record_ps_feedback(self.ws_guard.id, &ps_feedback); + // in principle new remote_consistent_lsn could allow to + // deactivate the timeline, but we check that regularly through + // broker updated, not need to do it here } _ => warn!("unexpected message {:?}", msg), } @@ -368,3 +644,89 @@ async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> anyhow::Result