mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 05:52:55 +00:00
fix(pageserver): LSN lease edge cases around restarts/migrations (#9055)
Part of #7497, closes #8817. ## Problem See #8817. ## Summary of changes **compute_ctl** - Renew lsn lease as soon as `/configure` updates pageserver_connstr, use `state_changed` Condvar for synchronization. **pageserver** As mentioned in https://github.com/neondatabase/neon/issues/8817#issuecomment-2315768076, we still want some permanent error reported if a lease cannot be granted. By considering attachment mode and the added `lsn_lease_deadline` when processing lease requests, we can also bound the case of bad requests to a very short period after migration/restart. - Refactor https://github.com/neondatabase/neon/pull/9024 and move `lsn_lease_deadline` to `AttachedTenantConf` so timeline can easily access it. - Have separate HTTP `init_lsn_lease` and libpq `renew_lsn_lease` API. - Always do LSN verification for the initial HTTP lease request. - LSN verification for the renewal is **still done** when tenants are not in `AttachedSingle` and we have pass the `lsn_lease_deadline`, which give plenty of time for compute to renew the lease. **neon_local** - add and call `timeline_init_lsn_lease` mgmt_api at static endpoint start. The initial lsn lease http request is sent when we run `cargo neon endpoint start <static endpoint>`. ## Testing - Extend `test_readonly_node_gc` to do pageserver restarts and migration. ## Future Work - The control plane should make the initial lease request through HTTP when creating a static endpoint. This is currently only done in `neon_local`. Signed-off-by: Yuchen Liang <yuchen@neon.tech>
This commit is contained in:
@@ -10,6 +10,7 @@ use std::sync::atomic::AtomicU32;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -1398,6 +1399,36 @@ LIMIT 100",
|
||||
}
|
||||
Ok(remote_ext_metrics)
|
||||
}
|
||||
|
||||
/// Waits until current thread receives a state changed notification and
|
||||
/// the pageserver connection strings has changed.
|
||||
///
|
||||
/// The operation will time out after a specified duration.
|
||||
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
|
||||
let state = self.state.lock().unwrap();
|
||||
let old_pageserver_connstr = state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr
|
||||
.clone();
|
||||
let mut unchanged = true;
|
||||
let _ = self
|
||||
.state_changed
|
||||
.wait_timeout_while(state, duration, |s| {
|
||||
let pageserver_connstr = &s
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr;
|
||||
unchanged = pageserver_connstr == &old_pageserver_connstr;
|
||||
unchanged
|
||||
})
|
||||
.unwrap();
|
||||
if !unchanged {
|
||||
info!("Pageserver config changed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forward_termination_signal() {
|
||||
|
||||
@@ -57,10 +57,10 @@ fn lsn_lease_bg_task(
|
||||
.max(valid_duration / 2);
|
||||
|
||||
info!(
|
||||
"Succeeded, sleeping for {} seconds",
|
||||
"Request succeeded, sleeping for {} seconds",
|
||||
sleep_duration.as_secs()
|
||||
);
|
||||
thread::sleep(sleep_duration);
|
||||
compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,10 +89,7 @@ fn acquire_lsn_lease_with_retry(
|
||||
.map(|connstr| {
|
||||
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
||||
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
||||
info!("Got storage auth token from spec file");
|
||||
config.password(storage_auth_token.clone());
|
||||
} else {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
config
|
||||
})
|
||||
@@ -108,9 +105,11 @@ fn acquire_lsn_lease_with_retry(
|
||||
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");
|
||||
|
||||
thread::sleep(Duration::from_millis(retry_period_ms as u64));
|
||||
compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis(
|
||||
retry_period_ms as u64,
|
||||
));
|
||||
retry_period_ms *= 1.5;
|
||||
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user