Add 'wait_lsn_timeout' and 'wal_redo_timeout' pageserver config options, replacing the previously hardcoded timeout values

This commit is contained in:
anastasia
2022-02-23 16:56:45 +03:00
committed by Anastasia Lubennikova
parent 58ee5d005f
commit 87edbd38c7
3 changed files with 47 additions and 11 deletions

View File

@@ -36,6 +36,9 @@ pub mod defaults {
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
pub const DEFAULT_GC_PERIOD: &str = "100 s";
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
@@ -59,6 +62,9 @@ pub mod defaults {
#gc_period = '{DEFAULT_GC_PERIOD}'
#gc_horizon = {DEFAULT_GC_HORIZON}
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
# initial superuser role name to use when creating a new tenant
@@ -85,6 +91,12 @@ pub struct PageServerConf {
pub gc_horizon: u64,
pub gc_period: Duration,
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
pub wait_lsn_timeout: Duration,
// How long to wait for WAL redo to complete.
pub wal_redo_timeout: Duration,
pub superuser: String,
pub page_cache_size: usize,
@@ -232,6 +244,8 @@ impl PageServerConf {
checkpoint_period: humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)?,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)?,
wait_lsn_timeout: humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)?,
wal_redo_timeout: humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)?,
page_cache_size: DEFAULT_PAGE_CACHE_SIZE,
max_file_descriptors: DEFAULT_MAX_FILE_DESCRIPTORS,
@@ -252,6 +266,8 @@ impl PageServerConf {
"checkpoint_period" => conf.checkpoint_period = parse_toml_duration(key, item)?,
"gc_horizon" => conf.gc_horizon = parse_toml_u64(key, item)?,
"gc_period" => conf.gc_period = parse_toml_duration(key, item)?,
"wait_lsn_timeout" => conf.wait_lsn_timeout = parse_toml_duration(key, item)?,
"wal_redo_timeout" => conf.wal_redo_timeout = parse_toml_duration(key, item)?,
"initial_superuser_name" => conf.superuser = parse_toml_string(key, item)?,
"page_cache_size" => conf.page_cache_size = parse_toml_u64(key, item)? as usize,
"max_file_descriptors" => {
@@ -386,6 +402,8 @@ impl PageServerConf {
checkpoint_period: Duration::from_secs(10),
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
wait_lsn_timeout: Duration::from_secs(60),
wal_redo_timeout: Duration::from_secs(60),
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
@@ -456,6 +474,9 @@ checkpoint_period = '111 s'
gc_period = '222 s'
gc_horizon = 222
wait_lsn_timeout = '111 s'
wal_redo_timeout = '111 s'
page_cache_size = 444
max_file_descriptors = 333
@@ -486,6 +507,8 @@ initial_superuser_name = 'zzzz'
checkpoint_period: humantime::parse_duration(defaults::DEFAULT_CHECKPOINT_PERIOD)?,
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
superuser: defaults::DEFAULT_SUPERUSER.to_string(),
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
@@ -527,6 +550,8 @@ initial_superuser_name = 'zzzz'
checkpoint_period: Duration::from_secs(111),
gc_horizon: 222,
gc_period: Duration::from_secs(222),
wait_lsn_timeout: Duration::from_secs(111),
wal_redo_timeout: Duration::from_secs(111),
superuser: "zzzz".to_string(),
page_cache_size: 444,
max_file_descriptors: 333,

View File

@@ -29,7 +29,7 @@ use std::ops::{Bound::Included, Deref};
use std::path::{Path, PathBuf};
use std::sync::atomic::{self, AtomicBool, AtomicUsize};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard};
use std::time::{Duration, Instant};
use std::time::Instant;
use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
use crate::config::PageServerConf;
@@ -83,9 +83,6 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
static TIMEOUT: Duration = Duration::from_secs(60);
// Metrics collected on operations on the storage repository.
lazy_static! {
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
@@ -816,7 +813,7 @@ impl Timeline for LayeredTimeline {
);
self.last_record_lsn
.wait_for_timeout(lsn, TIMEOUT)
.wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
.with_context(|| {
format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",

View File

@@ -102,8 +102,6 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
}
}
static TIMEOUT: Duration = Duration::from_secs(20);
// Metrics collected on WAL redo operations
//
// We collect the time spent in actual WAL redo ('redo'), and time waiting
@@ -221,7 +219,14 @@ impl WalRedoManager for PostgresRedoManager {
let result = if batch_zenith {
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i])
} else {
self.apply_batch_postgres(rel, blknum, lsn, img, &records[batch_start..i])
self.apply_batch_postgres(
rel,
blknum,
lsn,
img,
&records[batch_start..i],
self.conf.wal_redo_timeout,
)
};
img = Some(result?);
@@ -233,7 +238,14 @@ impl WalRedoManager for PostgresRedoManager {
if batch_zenith {
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..])
} else {
self.apply_batch_postgres(rel, blknum, lsn, img, &records[batch_start..])
self.apply_batch_postgres(
rel,
blknum,
lsn,
img,
&records[batch_start..],
self.conf.wal_redo_timeout,
)
}
}
}
@@ -261,6 +273,7 @@ impl PostgresRedoManager {
lsn: Lsn,
base_img: Option<Bytes>,
records: &[(Lsn, ZenithWalRecord)],
wal_redo_timeout: Duration,
) -> Result<Bytes, WalRedoError> {
let start_time = Instant::now();
@@ -281,7 +294,7 @@ impl PostgresRedoManager {
let result = if let RelishTag::Relation(rel) = rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records);
apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout);
apply_result.map_err(WalRedoError::IoError)
} else {
@@ -603,6 +616,7 @@ impl PostgresRedoProcess {
tag: BufferTag,
base_img: Option<Bytes>,
records: &[(Lsn, ZenithWalRecord)],
wal_redo_timeout: Duration,
) -> Result<Bytes, std::io::Error> {
// Serialize all the messages to send the WAL redo process first.
//
@@ -653,7 +667,7 @@ impl PostgresRedoProcess {
// If we have more data to write, wake up if 'stdin' becomes writeable or
// we have data to read. Otherwise only wake up if there's data to read.
let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
let n = nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
let n = nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32)?;
if n == 0 {
return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));