From f787eb1d3b42cc5e598fdbc86d709867cc0bed2b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 12 Jun 2024 21:09:35 +0200 Subject: [PATCH] test_vm_bit_clear_on_heap_lock_blackbox: dump layer map while flaky code is running refs https://github.com/neondatabase/neon/issues/6967 --- control_plane/Cargo.toml | 4 +++ control_plane/src/pageserver.rs | 12 +++++++ libs/pageserver_api/Cargo.toml | 4 +++ libs/pageserver_api/src/models.rs | 2 ++ pageserver/Cargo.toml | 2 +- pageserver/src/tenant.rs | 2 ++ pageserver/src/tenant/config.rs | 16 +++++++++ pageserver/src/tenant/timeline.rs | 56 +++++++++++++++++++++++++++++ test_runner/regress/test_vm_bits.py | 7 ++++ 9 files changed, 104 insertions(+), 1 deletion(-) diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index e62f3b8a47..650ab06ab9 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +testing = [ "pageserver_api/testing" ] + [dependencies] anyhow.workspace = true async-trait.workspace = true diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5a84763697..4d9357de19 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -383,6 +383,12 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'switch_aux_file_policy'")?, + #[cfg(feature = "testing")] + test_vm_bit_debug_logging: settings + .remove("test_vm_bit_debug_logging") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'test_vm_bit_debug_logging' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -506,6 +512,12 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'switch_aux_file_policy'")?, + #[cfg(feature = "testing")] + test_vm_bit_debug_logging: settings + .remove("test_vm_bit_debug_logging") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'test_vm_bit_debug_logging' as bool")?, } }; diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 3bba89c76d..84beeafcab 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +testing = [] + [dependencies] serde.workspace = true serde_with.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9311dab33c..66e78a81b9 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -322,6 +322,8 @@ pub struct TenantConfig { pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, pub switch_aux_file_policy: Option, + #[cfg(feature = "testing")] + pub test_vm_bit_debug_logging: Option, } /// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4335f38f1e..5d7c2c9686 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints"] +testing = ["fail/failpoints", "pageserver_api/testing"] [dependencies] anyhow.workspace = true diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 89aec12a84..cf5ab8b33a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3831,6 +3831,8 @@ pub(crate) mod harness { tenant_conf.image_layer_creation_check_threshold, ), switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), + #[cfg(feature = "testing")] + test_vm_bit_debug_logging: Some(tenant_conf.test_vm_bit_debug_logging), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 342d705954..0d1ad544de 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -377,6 +377,9 @@ pub struct TenantConf { /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux /// file is written. pub switch_aux_file_policy: AuxFilePolicy, + + #[cfg(feature = "testing")] + pub test_vm_bit_debug_logging: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -476,6 +479,11 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub switch_aux_file_policy: Option, + + #[cfg(feature = "testing")] + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub test_vm_bit_debug_logging: Option, } impl TenantConfOpt { @@ -538,6 +546,10 @@ impl TenantConfOpt { switch_aux_file_policy: self .switch_aux_file_policy .unwrap_or(global_conf.switch_aux_file_policy), + #[cfg(feature = "testing")] + test_vm_bit_debug_logging: self + .test_vm_bit_debug_logging + .unwrap_or(global_conf.test_vm_bit_debug_logging), } } } @@ -582,6 +594,8 @@ impl Default for TenantConf { timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), + #[cfg(feature = "testing")] + test_vm_bit_debug_logging: false, } } } @@ -657,6 +671,8 @@ impl From for models::TenantConfig { timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, switch_aux_file_policy: value.switch_aux_file_policy, + #[cfg(feature = "testing")] + test_vm_bit_debug_logging: value.test_vm_bit_debug_logging, } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 84a12d7522..164fee9275 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -924,16 +924,59 @@ impl Timeline { let mut reconstruct_state = ValuesReconstructState::new(); // Only add the cached image to the reconstruct state when it exists. + let cached_page_img_lsn = cached_page_img.as_ref().map(|(lsn, _)| *lsn); if cached_page_img.is_some() { let mut key_state = VectoredValueReconstructState::default(); key_state.img = cached_page_img; reconstruct_state.keys.insert(key, Ok(key_state)); } + let debug_log = { + #[cfg(feature = "testing")] + { + self.get_test_vm_bit_debug_logging() + } + #[cfg(not(feature = "testing"))] + { + false + } + }; + + if debug_log { + tracing::info!(%key, %lsn, ?cached_page_img_lsn, "debug-logging page reconstruction"); + } + + if debug_log { + tracing::info!( + location = "before vectored get", + "debug-logging page reconstruction" + ); + self.layers + .read() + .await + .layer_map() + .dump(false, ctx) + .await + .unwrap(); + } + let vectored_res = self .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await; + if debug_log { + tracing::info!( + location = "before validation", + "debug-logging page reconstruction" + ); + self.layers + .read() + .await + .layer_map() + .dump(false, ctx) + .await + .unwrap(); + } if self.conf.validate_vectored_get { self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) .await; @@ -2196,6 +2239,15 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } + #[cfg(feature = "testing")] + fn get_test_vm_bit_debug_logging(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .test_vm_bit_debug_logging + .unwrap_or(self.conf.default_tenant_conf.test_vm_bit_debug_logging) + } + fn get_image_layer_creation_check_threshold(&self) -> u8 { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2211,6 +2263,10 @@ impl Timeline { pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. + #[cfg(feature = "testing")] + { + info!(?new_conf.test_vm_bit_debug_logging, "updating tenant conf"); + } // The threshold is embedded in the metric. So, we need to update it. { diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 225b952e73..f94adaab5c 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -287,6 +287,13 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): # already truncated away. # # ERROR: could not access status of transaction 1027 + + # Debugging https://github.com/neondatabase/neon/issues/6967 + # the select() below fails occassionally at get_impl="vectored" validation + env.pageserver.http_client().patch_tenant_config_client_side( + tenant_id, + {"test_vm_bit_debug_logging": True}, + ) cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update") tup = cur.fetchall() log.info(f"tuple = {tup}")