pageserver: increase DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG (#6970)

## Problem

At high ingest rates, pageservers spuriously disconnect from safekeepers
because stats updates don't come in frequently enough to keep the
broker/safekeeper LSN delta under the wal lag limit.

## Summary of changes

- Increase DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG from 10MiB to 1GiB. This
should be enough for realistic per-timeline throughputs.
This commit is contained in:
John Spray
2024-03-01 16:49:37 +00:00
committed by GitHub
parent d999c46692
commit e34059cd18
2 changed files with 5 additions and 2 deletions

View File

@@ -52,7 +52,10 @@ pub mod defaults {
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

View File

@@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
"period": "20s",
"threshold": "23h",
}
assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024
assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024
# restart the pageserver and ensure that the config is still correct
env.pageserver.stop()