From 68241f5a3e2c8b23f2db5a1100066fd19f3890e4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Jul 2024 17:44:57 +0200 Subject: [PATCH] raise wait_lsn timeout from 60s to 300s (#8529) Problem ------- wait_lsn timeouts result in user-facing errors like ``` $ /tmp/neon/pg_install/v16/bin/pgbench -s3424 -i -I dtGvp user=neondb_owner dbname=neondb host=ep-tiny-wave-w23owa37.eastus2.azure.neon.build sslmode=require options='-cstatement_timeout=0 ' dropping old tables... NOTICE: table "pgbench_accounts" does not exist, skipping NOTICE: table "pgbench_branches" does not exist, skipping NOTICE: table "pgbench_history" does not exist, skipping NOTICE: table "pgbench_tellers" does not exist, skipping creating tables... generating data (server-side)... vacuuming... pgbench: error: query failed: ERROR: [NEON_SMGR] [shard 0] could not read block 214338 in rel 1663/16389/16839.0 from page server at lsn C/E1C12828 DETAIL: page server returned error: LSN timeout: Timed out while waiting for WAL record at LSN C/E1418528 to arrive, last_record_lsn 6/999D9CA8 disk consistent LSN=6/999D9CA8, WalReceiver status: (update 2024-07-25 08:30:07): connecting to node 25, safekeeper candidates (id|update_time|commit_lsn): [(21|08:30:16|C/E1C129E0), (23|08:30:16|C/E1C129E0), (25|08:30:17|C/E1C129E0)] CONTEXT: while scanning block 214338 of relation "public.pgbench_accounts" pgbench: detail: Query was: vacuum analyze pgbench_accounts ``` Solution -------- It's better to be slow than to fail the queries. If the app has a deadline, it can use `statement_timeout`. In the long term, we want to eliminate the wait_lsn timeout. In the short term (this PR), we bump the wait_lsn timeout to a larger value to reduce the frequency at which these wait_lsn timeouts occur. We will observe SLOs and specifically `pageserver_wait_lsn_seconds_bucket` before we eliminate the timeout completely. 
--- pageserver/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 614bbf3392..100c6c1ac5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -52,7 +52,7 @@ pub mod defaults { use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin";