From aa9112efce42869472fbee7bfa0048f12d3ff81a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 8 Nov 2024 10:16:04 +0000 Subject: [PATCH] pageserver: add `no_sync` for use in regression tests (1/2) (#9677) ## Problem In test environments, the `syncfs` that the pageserver does on startup can take a long time, as other tests running concurrently might have many gigabytes of dirty pages. ## Summary of changes - Add a `no_sync` option to the pageserver's config. - Skip syncfs on startup if this is set - A subsequent PR (https://github.com/neondatabase/neon/pull/9678) will enable this by default in tests. We need to wait until after the next release to avoid breaking compat tests, which would fail if we set no_sync & use an old pageserver binary. Q: Why is this a different mechanism than safekeeper, which as a --no-sync CLI? A: Because the way we manage pageservers in neon_local depends on the pageserver.toml containing the full configuration, whereas safekeepers have a config file which is neon-local-specific and can drive a CLI flag. Q: Why is the option no_sync rather than sync? A: For boolean configs with a dangerous value, it's preferable to make "false" the safe option, so that any downstream future config tooling that might have a "booleans are false by default" behavior (e.g. golang structs) is safe by default. Q: Why only skip the syncfs, and not all fsyncs? A: Skipping all fsyncs would require more code changes, and the most acute problem isn't fsyncs themselves (these just slow down a running test), it's the syncfs (which makes a pageserver startup slow as a result of _other_ tests) --- control_plane/src/bin/neon_local.rs | 3 +++ control_plane/src/local_env.rs | 10 ++++++++++ control_plane/src/pageserver.rs | 1 + libs/pageserver_api/src/config.rs | 3 +++ pageserver/src/bin/pageserver.rs | 18 +++++++++++------- pageserver/src/config.rs | 5 +++++ 6 files changed, 33 insertions(+), 7 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 48438adf43..c4063bbd1a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -944,6 +944,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, other: Default::default(), + // Typical developer machines use disks with slow fsync, and we don't care + // about data integrity: disable disk syncs. + no_sync: true, } }) .collect(), diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9dc2a0c36b..032c88a829 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -225,6 +225,7 @@ pub struct PageServerConf { pub listen_http_addr: String, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + pub no_sync: bool, } impl Default for PageServerConf { @@ -235,6 +236,7 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, + no_sync: false, } } } @@ -249,6 +251,8 @@ pub struct NeonLocalInitPageserverConf { pub listen_http_addr: String, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub no_sync: bool, #[serde(flatten)] pub other: HashMap, } @@ -261,6 +265,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, other: _, } = conf; Self { @@ -269,6 +274,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { listen_http_addr: listen_http_addr.clone(), pg_auth_type: *pg_auth_type, http_auth_type: *http_auth_type, + no_sync: *no_sync, } } } @@ -569,6 +575,8 @@ impl LocalEnv { listen_http_addr: String, pg_auth_type: AuthType, http_auth_type: AuthType, + #[serde(default)] + no_sync: bool, } let config_toml_path = dentry.path().join("pageserver.toml"); let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( @@ -591,6 +599,7 @@ impl LocalEnv { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, } = config_toml; let IdentityTomlSubset { id: identity_toml_id, @@ -607,6 +616,7 @@ impl LocalEnv { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, }; pageservers.push(conf); } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index eab76e14c3..ae5e22ddc6 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -273,6 +273,7 @@ impl PageServerNode { ) })?; let args = vec!["-D", datadir_path_str]; + background_process::start_process( "pageserver", &datadir, diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 00cc426c3c..6de34fdd35 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -106,6 +106,8 @@ pub struct ConfigToml { pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub no_sync: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -389,6 +391,7 @@ impl Default for ConfigToml { l0_flush: None, virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), + no_sync: None, } } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 782122139e..fe2a31167d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -154,13 +154,17 @@ fn main() -> anyhow::Result<()> { }, }; - let started = Instant::now(); - syncfs(dirfd)?; - let elapsed = started.elapsed(); - info!( - elapsed_ms = elapsed.as_millis(), - "made tenant directory contents durable" - ); + if conf.no_sync { + info!("Skipping syncfs on startup"); + } else { + let started = Instant::now(); + syncfs(dirfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "made tenant directory contents durable" + ); + } } // Initialize up failpoints support diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 06d4326459..d62066ac22 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -178,6 +178,9 @@ pub struct PageServerConf { /// Direct IO settings pub virtual_file_io_mode: virtual_file::IoMode, + + /// Optionally disable disk syncs (unsafe!) + pub no_sync: bool, } /// Token for authentication to safekeepers @@ -332,6 +335,7 @@ impl PageServerConf { concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, tenant_config, + no_sync, } = config_toml; let mut conf = PageServerConf { @@ -409,6 +413,7 @@ impl PageServerConf { .map(crate::l0_flush::L0FlushConfig::from) .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), + no_sync: no_sync.unwrap_or(false), }; // ------------------------------------------------------------