mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-18 05:30:37 +00:00
Compare commits
45 Commits
release-45
...
problame/w
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
70f993331c | ||
|
|
c87c19a646 | ||
|
|
92280727df | ||
|
|
31fc069482 | ||
|
|
16090c876d | ||
|
|
02dc0db633 | ||
|
|
8e04de6ef9 | ||
|
|
0713f367d4 | ||
|
|
93d0f5e93d | ||
|
|
20957d6c4e | ||
|
|
f4de9adb1d | ||
|
|
98ee0d9012 | ||
|
|
6933f5d089 | ||
|
|
853f77eb11 | ||
|
|
ccfc9741f6 | ||
|
|
c6d09f8942 | ||
|
|
c8d36dab59 | ||
|
|
7662df6ca0 | ||
|
|
c119af8ddd | ||
|
|
a2e083ebe0 | ||
|
|
73a944205b | ||
|
|
34ebfbdd6f | ||
|
|
ef7c9c2ccc | ||
|
|
6c79e12630 | ||
|
|
753d97bd77 | ||
|
|
edc962f1d7 | ||
|
|
65b4e6e7d6 | ||
|
|
17b256679b | ||
|
|
673a865055 | ||
|
|
fb518aea0d | ||
|
|
42f41afcbd | ||
|
|
f71110383c | ||
|
|
ae3eaf9995 | ||
|
|
aa9f1d4b69 | ||
|
|
946c6a0006 | ||
|
|
ce13281d54 | ||
|
|
4e1d16f311 | ||
|
|
091a0cda9d | ||
|
|
ea9fad419e | ||
|
|
e92c9f42c0 | ||
|
|
aaaa39d9f5 | ||
|
|
e79a19339c | ||
|
|
dbd36e40dc | ||
|
|
90ef48aab8 | ||
|
|
9a43c04a19 |
6
Cargo.lock
generated
6
Cargo.lock
generated
@@ -1161,6 +1161,7 @@ dependencies = [
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper",
|
||||
"nix 0.26.2",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
"opentelemetry",
|
||||
@@ -1171,6 +1172,7 @@ dependencies = [
|
||||
"rust-ini",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -4403,12 +4405,14 @@ dependencies = [
|
||||
"async-stream",
|
||||
"aws-config",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-async",
|
||||
"bincode",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"hex",
|
||||
"histogram",
|
||||
@@ -4447,6 +4451,7 @@ dependencies = [
|
||||
"clap",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"fail",
|
||||
"fs2",
|
||||
"futures",
|
||||
"git-version",
|
||||
@@ -5878,6 +5883,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"const_format",
|
||||
"criterion",
|
||||
"fail",
|
||||
"futures",
|
||||
"heapless",
|
||||
"hex",
|
||||
|
||||
@@ -13,6 +13,7 @@ clap.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
num_cpus.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
@@ -20,6 +21,7 @@ postgres.workspace = true
|
||||
regex.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
tar.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
|
||||
@@ -40,18 +40,22 @@ use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::get_pg_version;
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
@@ -67,6 +71,13 @@ const BUILD_TAG_DEFAULT: &str = "latest";
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
thread::spawn(move || {
|
||||
for sig in signals.forever() {
|
||||
handle_exit_signal(sig);
|
||||
}
|
||||
});
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG")
|
||||
.unwrap_or(BUILD_TAG_DEFAULT)
|
||||
.to_string();
|
||||
@@ -346,6 +357,7 @@ fn main() -> Result<()> {
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
PG_PID.store(0, Ordering::SeqCst);
|
||||
info!("Postgres exited with code {}, shutting down", ecode);
|
||||
exit_code = ecode.code()
|
||||
}
|
||||
@@ -519,6 +531,24 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
|
||||
/// wait for termination which would be easy then.
|
||||
fn handle_exit_signal(sig: i32) {
|
||||
info!("received {sig} termination signal");
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
kill(pg_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
exit(1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_cli() {
|
||||
cli().debug_assert()
|
||||
|
||||
@@ -6,6 +6,8 @@ use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Instant;
|
||||
@@ -34,6 +36,9 @@ use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
use crate::{config, extension_server};
|
||||
|
||||
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
|
||||
pub static PG_PID: AtomicU32 = AtomicU32::new(0);
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
// Url type maintains proper escaping
|
||||
@@ -501,6 +506,7 @@ impl ComputeNode {
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
@@ -508,6 +514,7 @@ impl ComputeNode {
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
@@ -662,6 +669,7 @@ impl ComputeNode {
|
||||
})
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
PG_PID.store(pg.id(), Ordering::SeqCst);
|
||||
|
||||
wait_for_postgres(&mut pg, pgdata_path)?;
|
||||
|
||||
|
||||
@@ -46,6 +46,8 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
@@ -439,11 +441,14 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
|
||||
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
if send_sigterm {
|
||||
kill(pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -733,10 +738,15 @@ impl Endpoint {
|
||||
&None,
|
||||
)?;
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
// Also wait for the compute_ctl process to die. It might have some
|
||||
// cleanup work to do after postgres stops, like syncing safekeepers,
|
||||
// etc.
|
||||
//
|
||||
self.wait_for_compute_ctl_to_exit()?;
|
||||
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
|
||||
// want this cleanup: tests intentionally do stop when majority of
|
||||
// safekeepers is down, so sync-safekeepers would hang otherwise. This
|
||||
// could be a separate flag though.
|
||||
self.wait_for_compute_ctl_to_exit(destroy)?;
|
||||
if destroy {
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
|
||||
@@ -124,6 +124,9 @@ impl KeySpaceAccum {
|
||||
if range.start == accum.end {
|
||||
accum.end = range.end;
|
||||
} else {
|
||||
// TODO: to efficiently support small sharding stripe sizes, we should avoid starting
|
||||
// a new range here if the skipped region was all keys that don't belong on this shard.
|
||||
// (https://github.com/neondatabase/neon/issues/6247)
|
||||
assert!(range.start > accum.end);
|
||||
self.ranges.push(accum.clone());
|
||||
*accum = range;
|
||||
|
||||
@@ -557,19 +557,6 @@ pub enum DownloadRemoteLayersTaskState {
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
|
||||
@@ -422,6 +422,21 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
// A: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
false
|
||||
} else {
|
||||
!self.is_key_local(key)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> String {
|
||||
if self.count > ShardCount(0) {
|
||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||
|
||||
@@ -4,6 +4,12 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
@@ -16,6 +22,7 @@ chrono.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
|
||||
@@ -1,3 +1,14 @@
|
||||
//! Failpoint support code shared between pageserver and safekeepers.
|
||||
|
||||
use crate::http::{
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
};
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
///
|
||||
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
||||
@@ -25,7 +36,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async;
|
||||
// Helper function used by the macro. (A function has nicer scoping so we
|
||||
// don't need to decorate everything with "::")
|
||||
#[doc(hidden)]
|
||||
pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
|
||||
pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
|
||||
let millis = duration_str.parse::<u64>().unwrap();
|
||||
let d = std::time::Duration::from_millis(millis);
|
||||
|
||||
@@ -71,7 +82,7 @@ pub fn init() -> fail::FailScenario<'static> {
|
||||
scenario
|
||||
}
|
||||
|
||||
pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
|
||||
pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
|
||||
if actions == "exit" {
|
||||
fail::cfg_callback(name, exit_failpoint)
|
||||
} else {
|
||||
@@ -84,3 +95,45 @@ fn exit_failpoint() {
|
||||
tracing::info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
/// Configure failpoints through http.
|
||||
pub async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot manage failpoints because storage was compiled without failpoints support"
|
||||
)));
|
||||
}
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -83,6 +83,8 @@ pub mod timeout;
|
||||
|
||||
pub mod sync;
|
||||
|
||||
pub mod failpoint_support;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -425,7 +425,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
|
||||
println!("walprop_log[{}] {}", level, msg);
|
||||
println!("wp_log[{}] {}", level, msg);
|
||||
}
|
||||
|
||||
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
|
||||
|
||||
@@ -13,6 +13,7 @@ use bytes::{Buf, Bytes};
|
||||
use pageserver::{
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
@@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_id);
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
|
||||
@@ -115,15 +115,8 @@ impl PagestreamClient {
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
key: RelTagBlockNo,
|
||||
lsn: Lsn,
|
||||
req: PagestreamGetPageRequest,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let req = PagestreamGetPageRequest {
|
||||
latest: false,
|
||||
rel: key.rel_tag,
|
||||
blkno: key.block_no,
|
||||
lsn,
|
||||
};
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
|
||||
@@ -3,7 +3,7 @@ use futures::future::join_all;
|
||||
use pageserver::pgdatadir_mapping::key_to_rel_block;
|
||||
use pageserver::repository;
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_client::page_service::RelTagBlockNo;
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -39,6 +39,9 @@ pub(crate) struct Args {
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long)]
|
||||
per_target_rate_limit: Option<usize>,
|
||||
/// Probability for sending `latest=true` in the request (uniform distribution).
|
||||
#[clap(long, default_value = "1")]
|
||||
req_latest_probability: f64,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
@@ -200,18 +203,26 @@ async fn main_impl(
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
loop {
|
||||
let (range, key) = {
|
||||
let (timeline, req) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &all_ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
(r, RelTagBlockNo { rel_tag, block_no })
|
||||
(
|
||||
r.timeline,
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&range.timeline).unwrap();
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
sender.send((key, range.timeline_lsn)).await.ok().unwrap();
|
||||
sender.send(req).await.ok().unwrap();
|
||||
}
|
||||
}),
|
||||
Some(rps_limit) => Box::pin(async move {
|
||||
@@ -240,16 +251,21 @@ async fn main_impl(
|
||||
);
|
||||
loop {
|
||||
ticker.tick().await;
|
||||
let (range, key) = {
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
let (rel_tag, block_no) = key_to_rel_block(key)
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
(r, RelTagBlockNo { rel_tag, block_no })
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
sender.send((key, range.timeline_lsn)).await.ok().unwrap();
|
||||
sender.send(req).await.ok().unwrap();
|
||||
}
|
||||
})
|
||||
};
|
||||
@@ -303,7 +319,7 @@ async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
|
||||
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
@@ -317,10 +333,10 @@ async fn client(
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some((key, lsn)) = work.recv().await {
|
||||
while let Some(req) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
client
|
||||
.getpage(key, lsn)
|
||||
.getpage(req)
|
||||
.await
|
||||
.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
|
||||
@@ -23,6 +23,7 @@ use tracing::*;
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
@@ -174,7 +175,7 @@ where
|
||||
] {
|
||||
for segno in self
|
||||
.timeline
|
||||
.list_slru_segments(kind, self.lsn, self.ctx)
|
||||
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
@@ -192,7 +193,7 @@ where
|
||||
// Otherwise only include init forks of unlogged relations.
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
@@ -267,7 +268,7 @@ where
|
||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(src, self.lsn, false, self.ctx)
|
||||
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
|
||||
.await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
@@ -288,7 +289,7 @@ where
|
||||
for blknum in startblk..endblk {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
|
||||
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
|
||||
.await?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
@@ -310,7 +311,7 @@ where
|
||||
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_slru_segment_size(slru, segno, self.lsn, self.ctx)
|
||||
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
@@ -352,7 +353,7 @@ where
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
|
||||
ensure!(
|
||||
@@ -399,7 +400,7 @@ where
|
||||
if !has_relmap_file
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
.is_empty()
|
||||
{
|
||||
|
||||
@@ -31,6 +31,7 @@ use pageserver::{
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::failpoint_support;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
@@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
// Initialize up failpoints support
|
||||
let scenario = pageserver::failpoint_support::init();
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
|
||||
@@ -76,6 +76,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -88,6 +90,7 @@ pub mod defaults {
|
||||
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
|
||||
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
|
||||
|
||||
#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
|
||||
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
|
||||
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
@@ -108,6 +111,8 @@ pub mod defaults {
|
||||
|
||||
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
|
||||
|
||||
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -233,6 +238,9 @@ pub struct PageServerConf {
|
||||
/// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
|
||||
/// heatmap uploads vs. other remote storage operations.
|
||||
pub heatmap_upload_concurrency: usize,
|
||||
|
||||
/// Maximum number of WAL records to be ingested and committed at the same time
|
||||
pub ingest_batch_size: u64,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -314,6 +322,8 @@ struct PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: BuilderValue<bool>,
|
||||
|
||||
heatmap_upload_concurrency: BuilderValue<usize>,
|
||||
|
||||
ingest_batch_size: BuilderValue<u64>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -386,6 +396,8 @@ impl Default for PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: Set(false),
|
||||
|
||||
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
|
||||
|
||||
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -534,6 +546,10 @@ impl PageServerConfigBuilder {
|
||||
self.heatmap_upload_concurrency = BuilderValue::Set(value)
|
||||
}
|
||||
|
||||
pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
|
||||
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
@@ -632,10 +648,12 @@ impl PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: self
|
||||
.control_plane_emergency_mode
|
||||
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
|
||||
|
||||
heatmap_upload_concurrency: self
|
||||
.heatmap_upload_concurrency
|
||||
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
|
||||
ingest_batch_size: self
|
||||
.ingest_batch_size
|
||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -878,6 +896,7 @@ impl PageServerConf {
|
||||
"heatmap_upload_concurrency" => {
|
||||
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
|
||||
},
|
||||
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -949,6 +968,7 @@ impl PageServerConf {
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1177,7 +1197,8 @@ background_task_maximum_delay = '334 s'
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1238,7 +1259,8 @@ background_task_maximum_delay = '334 s'
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
ingest_batch_size: 100,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -25,6 +25,7 @@ use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
@@ -66,9 +67,6 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
// Imports only used for testing APIs
|
||||
use pageserver_api::models::ConfigureFailpointsRequest;
|
||||
|
||||
// For APIs that require an Active tenant, how long should we block waiting for that state?
|
||||
// This is not functionally necessary (clients will retry), but avoids generating a lot of
|
||||
// failed API calls while tenants are activating.
|
||||
@@ -1293,34 +1291,6 @@ async fn handle_tenant_break(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Cannot manage failpoints because pageserver was compiled without failpoints support"
|
||||
)));
|
||||
}
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
async fn timeline_gc_handler(
|
||||
mut request: Request<Body>,
|
||||
|
||||
@@ -21,6 +21,7 @@ use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::Timeline;
|
||||
@@ -312,13 +313,16 @@ async fn import_wal(
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
let mut modification = tline.begin_modification(endpoint);
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.await?;
|
||||
WAL_INGEST.records_committed.inc();
|
||||
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
nrecords += 1;
|
||||
@@ -448,13 +452,14 @@ pub async fn import_wal_from_tar(
|
||||
|
||||
waldecoder.feed_bytes(&bytes[offset..]);
|
||||
|
||||
let mut modification = tline.begin_modification(end_lsn);
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.await?;
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
debug!("imported record at {} (end {})", lsn, end_lsn);
|
||||
|
||||
@@ -25,8 +25,6 @@ pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
pub mod failpoint_support;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
|
||||
@@ -53,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::pgdatadir_mapping::rel_block_to_key;
|
||||
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -747,7 +747,7 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
let exists = timeline
|
||||
.get_rel_exists(req.rel, lsn, req.latest, ctx)
|
||||
.get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
@@ -766,7 +766,9 @@ impl PageServerHandler {
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
|
||||
let n_blocks = timeline
|
||||
.get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
@@ -785,7 +787,13 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
let total_blocks = timeline
|
||||
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
|
||||
.get_db_size(
|
||||
DEFAULTTABLESPACE_OID,
|
||||
req.dbnode,
|
||||
Version::Lsn(lsn),
|
||||
req.latest,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let db_size = total_blocks as i64 * BLCKSZ as i64;
|
||||
|
||||
@@ -794,7 +802,7 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
async fn handle_get_page_at_lsn_request(
|
||||
async fn do_handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
@@ -804,20 +812,25 @@ impl PageServerHandler {
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
/*
|
||||
// Add a 1s delay to some requests. The delay helps the requests to
|
||||
// hit the race condition from github issue #1047 more easily.
|
||||
use rand::Rng;
|
||||
if rand::thread_rng().gen::<u8>() < 5 {
|
||||
std::thread::sleep(std::time::Duration::from_millis(1000));
|
||||
}
|
||||
*/
|
||||
let page = timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
}))
|
||||
}
|
||||
|
||||
async fn handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let key = rel_block_to_key(req.rel, req.blkno);
|
||||
let page = if timeline.get_shard_identity().is_key_local(&key) {
|
||||
timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?
|
||||
if timeline.get_shard_identity().is_key_local(&key) {
|
||||
self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
|
||||
.await
|
||||
} else {
|
||||
// The Tenant shard we looked up at connection start does not hold this particular
|
||||
// key: look for other shards in this tenant. This scenario occurs if a pageserver
|
||||
@@ -844,6 +857,8 @@ impl PageServerHandler {
|
||||
// informed yet.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/6038
|
||||
tracing::warn!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
|
||||
timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
|
||||
return Err(anyhow::anyhow!("Request routed to wrong shard"));
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
@@ -852,14 +867,10 @@ impl PageServerHandler {
|
||||
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
|
||||
// the GateGuard was already held over the whole connection.
|
||||
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
|
||||
timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?
|
||||
};
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
}))
|
||||
self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::repository::*;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
@@ -147,6 +147,7 @@ impl Timeline {
|
||||
{
|
||||
DatadirModification {
|
||||
tline: self,
|
||||
pending_lsns: Vec::new(),
|
||||
pending_updates: HashMap::new(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
@@ -163,7 +164,7 @@ impl Timeline {
|
||||
&self,
|
||||
tag: RelTag,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
@@ -173,17 +174,20 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
|
||||
let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
|
||||
if blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag, blknum, lsn, nblocks
|
||||
tag,
|
||||
blknum,
|
||||
version.get_lsn(),
|
||||
nblocks
|
||||
);
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
self.get(key, lsn, ctx).await
|
||||
version.get(self, key, ctx).await
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
@@ -191,16 +195,16 @@ impl Timeline {
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<usize, PageReconstructError> {
|
||||
let mut total_blocks = 0;
|
||||
|
||||
let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
|
||||
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
|
||||
|
||||
for rel in rels {
|
||||
let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
|
||||
let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
|
||||
total_blocks += n_blocks as usize;
|
||||
}
|
||||
Ok(total_blocks)
|
||||
@@ -210,7 +214,7 @@ impl Timeline {
|
||||
pub async fn get_rel_size(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
@@ -220,12 +224,12 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
return Ok(nblocks);
|
||||
}
|
||||
|
||||
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
|
||||
&& !self.get_rel_exists(tag, lsn, latest, ctx).await?
|
||||
&& !self.get_rel_exists(tag, version, latest, ctx).await?
|
||||
{
|
||||
// FIXME: Postgres sometimes calls smgrcreate() to create
|
||||
// FSM, and smgrnblocks() on it immediately afterwards,
|
||||
@@ -235,7 +239,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let key = rel_size_to_key(tag);
|
||||
let mut buf = self.get(key, lsn, ctx).await?;
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
if latest {
|
||||
@@ -246,7 +250,7 @@ impl Timeline {
|
||||
// latest=true, then it can not cause cache corruption, because with latest=true
|
||||
// pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
|
||||
// associated with most recent value of LSN.
|
||||
self.update_cached_rel_size(tag, lsn, nblocks);
|
||||
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
|
||||
}
|
||||
Ok(nblocks)
|
||||
}
|
||||
@@ -255,7 +259,7 @@ impl Timeline {
|
||||
pub async fn get_rel_exists(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
_latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
@@ -266,12 +270,12 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// first try to lookup relation in cache
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
return Ok(true);
|
||||
}
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -291,12 +295,12 @@ impl Timeline {
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -332,11 +336,11 @@ impl Timeline {
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = self.get(key, lsn, ctx).await?;
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
Ok(buf.get_u32_le())
|
||||
}
|
||||
|
||||
@@ -345,12 +349,12 @@ impl Timeline {
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -501,11 +505,11 @@ impl Timeline {
|
||||
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
|
||||
) -> Result<T, PageReconstructError> {
|
||||
for segno in self
|
||||
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
|
||||
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
|
||||
.await?
|
||||
{
|
||||
let nblocks = self
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
|
||||
.await?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page = self
|
||||
@@ -531,13 +535,13 @@ impl Timeline {
|
||||
pub async fn list_slru_segments(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<u32>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let key = slru_dir_to_key(kind);
|
||||
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.segments),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
@@ -548,12 +552,12 @@ impl Timeline {
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = relmap_file_key(spcnode, dbnode);
|
||||
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
@@ -652,7 +656,10 @@ impl Timeline {
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
|
||||
for rel in self
|
||||
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
|
||||
.await?
|
||||
{
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
@@ -692,7 +699,7 @@ impl Timeline {
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> = self
|
||||
.list_rels(spcnode, dbnode, lsn, ctx)
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
@@ -799,18 +806,39 @@ pub struct DatadirModification<'a> {
|
||||
/// in the state in 'tline' yet.
|
||||
pub tline: &'a Timeline,
|
||||
|
||||
/// Lsn assigned by begin_modification
|
||||
pub lsn: Lsn,
|
||||
/// Current LSN of the modification
|
||||
lsn: Lsn,
|
||||
|
||||
// The modifications are not applied directly to the underlying key-value store.
|
||||
// The put-functions add the modifications here, and they are flushed to the
|
||||
// underlying key-value store by the 'finish' function.
|
||||
pending_updates: HashMap<Key, Value>,
|
||||
pending_deletions: Vec<Range<Key>>,
|
||||
pending_lsns: Vec<Lsn>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
}
|
||||
|
||||
impl<'a> DatadirModification<'a> {
|
||||
/// Get the current lsn
|
||||
pub(crate) fn get_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
lsn >= self.lsn,
|
||||
"setting an older lsn {} than {} is not allowed",
|
||||
lsn,
|
||||
self.lsn
|
||||
);
|
||||
if lsn > self.lsn {
|
||||
self.pending_lsns.push(self.lsn);
|
||||
self.lsn = lsn;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Initialize a completely new repository.
|
||||
///
|
||||
/// This inserts the directory metadata entries that are assumed to
|
||||
@@ -984,11 +1012,9 @@ impl<'a> DatadirModification<'a> {
|
||||
dbnode: Oid,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let req_lsn = self.tline.get_last_record_lsn();
|
||||
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, req_lsn, true, ctx)
|
||||
.get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
|
||||
.await?;
|
||||
|
||||
// Remove entry from dbdir
|
||||
@@ -1077,8 +1103,11 @@ impl<'a> DatadirModification<'a> {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
let last_lsn = self.tline.get_last_record_lsn();
|
||||
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(self), true, ctx)
|
||||
.await?
|
||||
{
|
||||
let size_key = rel_size_to_key(rel);
|
||||
// Fetch the old size first
|
||||
let old_size = self.get(size_key, ctx).await?.get_u32_le();
|
||||
@@ -1323,17 +1352,23 @@ impl<'a> DatadirModification<'a> {
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::new();
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, self.lsn, &value, ctx).await?;
|
||||
} else {
|
||||
retained_pending_updates.insert(key, value);
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
for (lsn, value) in values {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
} else {
|
||||
retained_pending_updates
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((lsn, value));
|
||||
}
|
||||
}
|
||||
}
|
||||
self.pending_updates.extend(retained_pending_updates);
|
||||
|
||||
self.pending_updates = retained_pending_updates;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1350,18 +1385,28 @@ impl<'a> DatadirModification<'a> {
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let writer = self.tline.writer().await;
|
||||
let lsn = self.lsn;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
}
|
||||
for key_range in self.pending_deletions.drain(..) {
|
||||
writer.delete(key_range, lsn).await?;
|
||||
if !self.pending_updates.is_empty() {
|
||||
writer.put_batch(&self.pending_updates, ctx).await?;
|
||||
self.pending_updates.clear();
|
||||
}
|
||||
|
||||
writer.finish_write(lsn);
|
||||
if !self.pending_deletions.is_empty() {
|
||||
writer.delete_batch(&self.pending_deletions).await?;
|
||||
self.pending_deletions.clear();
|
||||
}
|
||||
|
||||
self.pending_lsns.push(self.lsn);
|
||||
for pending_lsn in self.pending_lsns.drain(..) {
|
||||
// Ideally, we should be able to call writer.finish_write() only once
|
||||
// with the highest LSN. However, the last_record_lsn variable in the
|
||||
// timeline keeps track of the latest LSN and the immediate previous LSN
|
||||
// so we need to record every LSN to not leave a gap between them.
|
||||
writer.finish_write(pending_lsn);
|
||||
}
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1370,44 +1415,86 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn is_empty(&self) -> bool {
|
||||
self.pending_updates.is_empty() && self.pending_deletions.is_empty()
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.pending_updates.len() + self.pending_deletions.len()
|
||||
}
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
||||
// Have we already updated the same key? Read the pending updated
|
||||
// Have we already updated the same key? Read the latest pending updated
|
||||
// version in that case.
|
||||
//
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(value) = self.pending_updates.get(&key) {
|
||||
if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
if let Some(values) = self.pending_updates.get(&key) {
|
||||
if let Some((_, value)) = values.last() {
|
||||
return if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
};
|
||||
}
|
||||
} else {
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
self.pending_updates.insert(key, val);
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
if let Some((last_lsn, last_value)) = values.last_mut() {
|
||||
if *last_lsn == self.lsn {
|
||||
*last_value = val;
|
||||
return;
|
||||
}
|
||||
}
|
||||
values.push((self.lsn, val));
|
||||
}
|
||||
|
||||
fn delete(&mut self, key_range: Range<Key>) {
|
||||
trace!("DELETE {}-{}", key_range.start, key_range.end);
|
||||
self.pending_deletions.push(key_range);
|
||||
self.pending_deletions.push((key_range, self.lsn));
|
||||
}
|
||||
}
|
||||
|
||||
/// This struct facilitates accessing either a committed key from the timeline at a
|
||||
/// specific LSN, or the latest uncommitted key from a pending modification.
|
||||
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
|
||||
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
|
||||
/// need to look up the keys in the modification first before looking them up in the
|
||||
/// timeline to not miss the latest updates.
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum Version<'a> {
|
||||
Lsn(Lsn),
|
||||
Modified(&'a DatadirModification<'a>),
|
||||
}
|
||||
|
||||
impl<'a> Version<'a> {
|
||||
async fn get(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
key: Key,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
match self {
|
||||
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
|
||||
Version::Modified(modification) => modification.get(key, ctx).await,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
Version::Lsn(lsn) => *lsn,
|
||||
Version::Modified(modification) => modification.lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
|
||||
// else, but that has not been needed in a long time.
|
||||
std::env::var("TOKIO_WORKER_THREADS")
|
||||
.map(|s| s.parse::<usize>().unwrap())
|
||||
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
|
||||
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
|
||||
@@ -33,6 +33,7 @@ use tracing::*;
|
||||
use utils::backoff;
|
||||
use utils::completion;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::failpoint_support;
|
||||
use utils::fs_ext;
|
||||
use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
@@ -594,10 +595,9 @@ impl Tenant {
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
// TODO(sharding): make WalRedoManager shard-aware
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id.tenant_id,
|
||||
tenant_shard_id,
|
||||
)));
|
||||
|
||||
let TenantSharedResources {
|
||||
@@ -890,7 +890,7 @@ impl Tenant {
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
|
||||
let preload = match preload {
|
||||
Some(p) => p,
|
||||
@@ -1002,7 +1002,7 @@ impl Tenant {
|
||||
// IndexPart is the source of truth.
|
||||
self.clean_up_timelines(&existent_timelines)?;
|
||||
|
||||
crate::failpoint_support::sleep_millis_async!("attach-before-activate");
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate");
|
||||
|
||||
info!("Done");
|
||||
|
||||
@@ -1144,10 +1144,9 @@ impl Tenant {
|
||||
tenant_shard_id: TenantShardId,
|
||||
reason: String,
|
||||
) -> Arc<Tenant> {
|
||||
// TODO(sharding): make WalRedoManager shard-aware
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id.tenant_id,
|
||||
tenant_shard_id,
|
||||
)));
|
||||
Arc::new(Tenant::new(
|
||||
TenantState::Broken {
|
||||
@@ -2839,9 +2838,7 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
crate::failpoint_support::sleep_millis_async!(
|
||||
"gc_iteration_internal_after_getting_gc_timelines"
|
||||
);
|
||||
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
|
||||
|
||||
// If there is nothing to GC, we don't want any messages in the INFO log.
|
||||
if !gc_timelines.is_empty() {
|
||||
|
||||
@@ -46,6 +46,8 @@ pub mod defaults {
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
|
||||
@@ -130,7 +130,7 @@ impl TenantsMap {
|
||||
|
||||
/// A page service client sends a TenantId, and to look up the correct Tenant we must
|
||||
/// resolve this to a fully qualified TenantShardId.
|
||||
fn resolve_shard(
|
||||
fn resolve_attached_shard(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
selector: ShardSelector,
|
||||
@@ -140,25 +140,27 @@ impl TenantsMap {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
|
||||
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
|
||||
// Ignore all slots that don't contain an attached tenant
|
||||
let tenant = match &slot.1 {
|
||||
TenantSlot::Attached(t) => t,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
match selector {
|
||||
ShardSelector::First => return Some(*slot.0),
|
||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||
return Some(*slot.0)
|
||||
}
|
||||
ShardSelector::Page(key) => {
|
||||
if let Some(tenant) = slot.1.get_attached() {
|
||||
// First slot we see for this tenant, calculate the expected shard number
|
||||
// for the key: we will use this for checking if this and subsequent
|
||||
// slots contain the key, rather than recalculating the hash each time.
|
||||
if want_shard.is_none() {
|
||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||
}
|
||||
// First slot we see for this tenant, calculate the expected shard number
|
||||
// for the key: we will use this for checking if this and subsequent
|
||||
// slots contain the key, rather than recalculating the hash each time.
|
||||
if want_shard.is_none() {
|
||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||
}
|
||||
|
||||
if Some(tenant.shard_identity.number) == want_shard {
|
||||
return Some(*slot.0);
|
||||
}
|
||||
} else {
|
||||
continue;
|
||||
if Some(tenant.shard_identity.number) == want_shard {
|
||||
return Some(*slot.0);
|
||||
}
|
||||
}
|
||||
_ => continue,
|
||||
@@ -1257,9 +1259,11 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
let locked = TENANTS.read().unwrap();
|
||||
|
||||
// Resolve TenantId to TenantShardId
|
||||
let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
|
||||
GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
|
||||
)?;
|
||||
let tenant_shard_id = locked
|
||||
.resolve_attached_shard(&tenant_id, shard_selector)
|
||||
.ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
|
||||
tenant_id,
|
||||
)))?;
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
|
||||
.map_err(GetTenantError::MapState)?;
|
||||
|
||||
@@ -818,8 +818,25 @@ impl RemoteTimelineClient {
|
||||
fn schedule_deletion_of_unlinked0(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
|
||||
mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
|
||||
) {
|
||||
// Filter out any layers which were not created by this tenant shard. These are
|
||||
// layers that originate from some ancestor shard after a split, and may still
|
||||
// be referenced by other shards. We are free to delete them locally and remove
|
||||
// them from our index (and would have already done so when we reach this point
|
||||
// in the code), but we may not delete them remotely.
|
||||
with_metadata.retain(|(name, meta)| {
|
||||
let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
|
||||
&& meta.shard.shard_count == self.tenant_shard_id.shard_count;
|
||||
if !retain {
|
||||
tracing::debug!(
|
||||
"Skipping deletion of ancestor-shard layer {name}, from shard {}",
|
||||
meta.shard
|
||||
);
|
||||
}
|
||||
retain
|
||||
});
|
||||
|
||||
for (name, meta) in &with_metadata {
|
||||
info!(
|
||||
"scheduling deletion of layer {}{} (shard {})",
|
||||
|
||||
@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use super::{DeltaLayerWriter, ResidentLayer};
|
||||
|
||||
@@ -246,16 +246,43 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub async fn put_value(
|
||||
pub(crate) async fn put_value(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
let inner: &mut _ = &mut *self.inner.write().await;
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
self.put_value_locked(&mut inner, key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn put_values(
|
||||
&self,
|
||||
values: &HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
for (key, vals) in values {
|
||||
for (lsn, val) in vals {
|
||||
self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn put_value_locked(
|
||||
&self,
|
||||
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
|
||||
let off = {
|
||||
// Avoid doing allocations for "small" values.
|
||||
@@ -264,7 +291,7 @@ impl InMemoryLayer {
|
||||
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
|
||||
buf.clear();
|
||||
val.ser_into(&mut buf)?;
|
||||
inner
|
||||
locked_inner
|
||||
.file
|
||||
.write_blob(
|
||||
&buf,
|
||||
@@ -275,7 +302,7 @@ impl InMemoryLayer {
|
||||
.await?
|
||||
};
|
||||
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let vec_map = locked_inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -285,13 +312,11 @@ impl InMemoryLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
|
||||
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
|
||||
// TODO: Currently, we just leak the storage for any deleted keys
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Make the layer non-writeable. Only call once.
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is exclusive
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
|
||||
@@ -496,6 +496,11 @@ impl Timeline {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
|
||||
}
|
||||
|
||||
// This check is debug-only because of the cost of hashing, and because it's a double-check: we
|
||||
// already checked the key against the shard_identity when looking up the Timeline from
|
||||
// page_service.
|
||||
debug_assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
|
||||
// XXX: structured stats collection for layer eviction here.
|
||||
trace!(
|
||||
"get page request for {}@{} from task kind {:?}",
|
||||
@@ -1459,6 +1464,7 @@ impl Timeline {
|
||||
max_lsn_wal_lag,
|
||||
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
|
||||
availability_zone: self.conf.availability_zone.clone(),
|
||||
ingest_batch_size: self.conf.ingest_batch_size,
|
||||
},
|
||||
broker_client,
|
||||
ctx,
|
||||
@@ -2223,13 +2229,13 @@ impl Timeline {
|
||||
return Err(layer_traversal_error(
|
||||
if cfg!(test) {
|
||||
format!(
|
||||
"could not find data for key {} at LSN {}, for request at LSN {}\n{}",
|
||||
key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
|
||||
key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"could not find data for key {} at LSN {}, for request at LSN {}",
|
||||
key, cont_lsn, request_lsn
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
|
||||
key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
|
||||
)
|
||||
},
|
||||
traversal_path,
|
||||
@@ -2471,9 +2477,27 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
|
||||
let layer = self.get_layer_for_write(lsn).await?;
|
||||
layer.put_tombstone(key_range, lsn).await?;
|
||||
async fn put_values(
|
||||
&self,
|
||||
values: &HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Pick the first LSN in the batch to get the layer to write to.
|
||||
for lsns in values.values() {
|
||||
if let Some((lsn, _)) = lsns.first() {
|
||||
let layer = self.get_layer_for_write(*lsn).await?;
|
||||
layer.put_values(values, ctx).await?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
|
||||
if let Some((_, lsn)) = tombstones.first() {
|
||||
let layer = self.get_layer_for_write(*lsn).await?;
|
||||
layer.put_tombstones(tombstones).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3035,6 +3059,15 @@ impl Timeline {
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
debug!(
|
||||
"Dropping key {} during compaction (it belongs on shard {:?})",
|
||||
key,
|
||||
self.shard_identity.get_shard_number(&key)
|
||||
);
|
||||
key = key.next();
|
||||
continue;
|
||||
}
|
||||
let img = match self.get(key, lsn, ctx).await {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
@@ -3061,6 +3094,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
image_layer_writer.put_image(key, &img).await?;
|
||||
key = key.next();
|
||||
}
|
||||
@@ -3631,7 +3665,15 @@ impl Timeline {
|
||||
)))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
|
||||
if !self.shard_identity.is_key_disposable(&key) {
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
|
||||
} else {
|
||||
debug!(
|
||||
"Dropping key {} during compaction (it belongs on shard {:?})",
|
||||
key,
|
||||
self.shard_identity.get_shard_number(&key)
|
||||
);
|
||||
}
|
||||
|
||||
if !new_layers.is_empty() {
|
||||
fail_point!("after-timeline-compacted-first-L1");
|
||||
@@ -4529,8 +4571,16 @@ impl<'a> TimelineWriter<'a> {
|
||||
self.tl.put_value(key, lsn, value, ctx).await
|
||||
}
|
||||
|
||||
pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
|
||||
self.tl.put_tombstone(key_range, lsn).await
|
||||
pub(crate) async fn put_batch(
|
||||
&self,
|
||||
batch: &HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.tl.put_values(batch, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
|
||||
self.tl.put_tombstones(batch).await
|
||||
}
|
||||
|
||||
/// Track the end of the latest digested WAL record.
|
||||
@@ -4541,11 +4591,11 @@ impl<'a> TimelineWriter<'a> {
|
||||
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
|
||||
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
|
||||
/// the latest and can be read.
|
||||
pub fn finish_write(&self, new_lsn: Lsn) {
|
||||
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
|
||||
self.tl.finish_write(new_lsn);
|
||||
}
|
||||
|
||||
pub fn update_current_logical_size(&self, delta: i64) {
|
||||
pub(crate) fn update_current_logical_size(&self, delta: i64) {
|
||||
self.tl.update_current_logical_size(delta)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,6 +58,7 @@ pub struct WalReceiverConf {
|
||||
pub max_lsn_wal_lag: NonZeroU64,
|
||||
pub auth_token: Option<Arc<String>>,
|
||||
pub availability_zone: Option<String>,
|
||||
pub ingest_batch_size: u64,
|
||||
}
|
||||
|
||||
pub struct WalReceiver {
|
||||
|
||||
@@ -411,6 +411,7 @@ impl ConnectionManagerState {
|
||||
|
||||
let node_id = new_sk.safekeeper_id;
|
||||
let connect_timeout = self.conf.wal_connect_timeout;
|
||||
let ingest_batch_size = self.conf.ingest_batch_size;
|
||||
let timeline = Arc::clone(&self.timeline);
|
||||
let ctx = ctx.detached_child(
|
||||
TaskKind::WalReceiverConnectionHandler,
|
||||
@@ -430,6 +431,7 @@ impl ConnectionManagerState {
|
||||
connect_timeout,
|
||||
ctx,
|
||||
node_id,
|
||||
ingest_batch_size,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -1345,6 +1347,7 @@ mod tests {
|
||||
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
|
||||
auth_token: None,
|
||||
availability_zone: None,
|
||||
ingest_batch_size: 1,
|
||||
},
|
||||
wal_connection: None,
|
||||
wal_stream_candidates: HashMap::new(),
|
||||
|
||||
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
|
||||
use super::TaskStateUpdate;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
|
||||
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::WALRECEIVER_RUNTIME,
|
||||
@@ -106,6 +106,7 @@ impl From<WalDecodeError> for WalReceiverError {
|
||||
|
||||
/// Open a connection to the given safekeeper and receive WAL, sending back progress
|
||||
/// messages as we go.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) async fn handle_walreceiver_connection(
|
||||
timeline: Arc<Timeline>,
|
||||
wal_source_connconf: PgConnectionConfig,
|
||||
@@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
connect_timeout: Duration,
|
||||
ctx: RequestContext,
|
||||
node: NodeId,
|
||||
ingest_batch_size: u64,
|
||||
) -> Result<(), WalReceiverError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
{
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
let mut modification = timeline.begin_modification(endlsn);
|
||||
let mut modification = timeline.begin_modification(startlsn);
|
||||
let mut uncommitted_records = 0;
|
||||
let mut filtered_records = 0;
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
@@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
|
||||
}
|
||||
|
||||
walingest
|
||||
// Ingest the records without immediately committing them.
|
||||
let ingested = walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
|
||||
.await
|
||||
.with_context(|| format!("could not ingest record at {lsn}"))?;
|
||||
if !ingested {
|
||||
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
|
||||
WAL_INGEST.records_filtered.inc();
|
||||
filtered_records += 1;
|
||||
}
|
||||
|
||||
fail_point!("walreceiver-after-ingest");
|
||||
|
||||
last_rec_lsn = lsn;
|
||||
|
||||
// Commit every ingest_batch_size records. Even if we filtered out
|
||||
// all records, we still need to call commit to advance the LSN.
|
||||
uncommitted_records += 1;
|
||||
if uncommitted_records >= ingest_batch_size {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(uncommitted_records - filtered_records);
|
||||
modification.commit(&ctx).await?;
|
||||
uncommitted_records = 0;
|
||||
filtered_records = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Commit the remaining records.
|
||||
if uncommitted_records > 0 {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(uncommitted_records - filtered_records);
|
||||
modification.commit(&ctx).await?;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -21,10 +21,13 @@ pub enum NeonWalRecord {
|
||||
/// Native PostgreSQL WAL record
|
||||
Postgres { will_init: bool, rec: Bytes },
|
||||
|
||||
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
|
||||
/// Clear the bits specified in `flags` in the heap visibility map for the given heap blocks.
|
||||
///
|
||||
/// For example, for `{ heap_blkno_1: None, heap_blkno_2: Some(23), flags: 0b0010_0000}`
|
||||
/// redo will apply `&=0b0010_0000` on heap block 23's visibility map bitmask.
|
||||
ClearVisibilityMapFlags {
|
||||
new_heap_blkno: Option<u32>,
|
||||
old_heap_blkno: Option<u32>,
|
||||
heap_blkno_1: Option<u32>,
|
||||
heap_blkno_2: Option<u32>,
|
||||
flags: u8,
|
||||
},
|
||||
/// Mark transaction IDs as committed on a CLOG page
|
||||
|
||||
@@ -22,6 +22,7 @@ use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use nix::poll::*;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use serde::Serialize;
|
||||
use std::collections::VecDeque;
|
||||
use std::io;
|
||||
@@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::{
|
||||
WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
|
||||
@@ -92,7 +90,7 @@ struct ProcessOutput {
|
||||
/// records.
|
||||
///
|
||||
pub struct PostgresRedoManager {
|
||||
tenant_id: TenantId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
|
||||
@@ -186,10 +184,13 @@ impl PostgresRedoManager {
|
||||
///
|
||||
/// Create a new PostgresRedoManager.
|
||||
///
|
||||
pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> PostgresRedoManager {
|
||||
// The actual process is launched lazily, on first request.
|
||||
PostgresRedoManager {
|
||||
tenant_id,
|
||||
tenant_shard_id,
|
||||
conf,
|
||||
last_redo_at: std::sync::Mutex::default(),
|
||||
redo_process: RwLock::new(None),
|
||||
@@ -244,8 +245,12 @@ impl PostgresRedoManager {
|
||||
let timer =
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
|
||||
let proc = Arc::new(
|
||||
WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
|
||||
.context("launch walredo process")?,
|
||||
WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
timer.observe_duration();
|
||||
*proc_guard = Some(Arc::clone(&proc));
|
||||
@@ -398,8 +403,8 @@ impl PostgresRedoManager {
|
||||
anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
|
||||
}
|
||||
NeonWalRecord::ClearVisibilityMapFlags {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno,
|
||||
heap_blkno_1,
|
||||
heap_blkno_2,
|
||||
flags,
|
||||
} => {
|
||||
// sanity check that this is modifying the correct relation
|
||||
@@ -409,7 +414,8 @@ impl PostgresRedoManager {
|
||||
"ClearVisibilityMapFlags record on unexpected rel {}",
|
||||
rel
|
||||
);
|
||||
if let Some(heap_blkno) = *new_heap_blkno {
|
||||
for heap_blkno in [heap_blkno_1, heap_blkno_2].into_iter().flatten() {
|
||||
let heap_blkno = *heap_blkno;
|
||||
// Calculate the VM block and offset that corresponds to the heap block.
|
||||
let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
|
||||
let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
|
||||
@@ -423,19 +429,6 @@ impl PostgresRedoManager {
|
||||
|
||||
map[map_byte as usize] &= !(flags << map_offset);
|
||||
}
|
||||
|
||||
// Repeat for 'old_heap_blkno', if any
|
||||
if let Some(heap_blkno) = *old_heap_blkno {
|
||||
let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
|
||||
let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
|
||||
let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
|
||||
|
||||
assert!(map_block == blknum);
|
||||
|
||||
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
|
||||
|
||||
map[map_byte as usize] &= !(flags << map_offset);
|
||||
}
|
||||
}
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
@@ -638,7 +631,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
|
||||
struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
@@ -652,10 +645,10 @@ impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
|
||||
#[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
|
||||
fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
@@ -680,7 +673,7 @@ impl WalRedoProcess {
|
||||
// as close-on-exec by default, but that's not enough, since we use
|
||||
// libraries that directly call libc open without setting that flag.
|
||||
.close_fds()
|
||||
.spawn_no_leak_child(tenant_id)
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
@@ -741,12 +734,12 @@ impl WalRedoProcess {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_id,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdin,
|
||||
@@ -772,7 +765,7 @@ impl WalRedoProcess {
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
fn apply_wal_records(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
@@ -966,11 +959,7 @@ impl WalRedoProcess {
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
// TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
|
||||
let path = self
|
||||
.conf
|
||||
.tenant_path(&TenantShardId::unsharded(self.tenant_id))
|
||||
.join(&filename);
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
@@ -1004,7 +993,7 @@ impl Drop for WalRedoProcess {
|
||||
/// Wrapper type around `std::process::Child` which guarantees that the child
|
||||
/// will be killed and waited-for by this process before being dropped.
|
||||
struct NoLeakChild {
|
||||
tenant_id: TenantId,
|
||||
tenant_id: TenantShardId,
|
||||
child: Option<Child>,
|
||||
}
|
||||
|
||||
@@ -1023,7 +1012,7 @@ impl DerefMut for NoLeakChild {
|
||||
}
|
||||
|
||||
impl NoLeakChild {
|
||||
fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
|
||||
fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
|
||||
let child = command.spawn()?;
|
||||
Ok(NoLeakChild {
|
||||
tenant_id,
|
||||
@@ -1078,7 +1067,7 @@ impl Drop for NoLeakChild {
|
||||
Some(child) => child,
|
||||
None => return,
|
||||
};
|
||||
let tenant_id = self.tenant_id;
|
||||
let tenant_shard_id = self.tenant_id;
|
||||
// Offload the kill+wait of the child process into the background.
|
||||
// If someone stops the runtime, we'll leak the child process.
|
||||
// We can ignore that case because we only stop the runtime on pageserver exit.
|
||||
@@ -1086,7 +1075,11 @@ impl Drop for NoLeakChild {
|
||||
tokio::task::spawn_blocking(move || {
|
||||
// Intentionally don't inherit the tracing context from whoever is dropping us.
|
||||
// This thread here is going to outlive of our dropper.
|
||||
let span = tracing::info_span!("walredo", %tenant_id);
|
||||
let span = tracing::info_span!(
|
||||
"walredo",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug()
|
||||
);
|
||||
let _entered = span.enter();
|
||||
Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
|
||||
})
|
||||
@@ -1096,11 +1089,11 @@ impl Drop for NoLeakChild {
|
||||
}
|
||||
|
||||
trait NoLeakChildCommandExt {
|
||||
fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
|
||||
fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
|
||||
}
|
||||
|
||||
impl NoLeakChildCommandExt for Command {
|
||||
fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
|
||||
fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
|
||||
NoLeakChild::spawn(tenant_id, self)
|
||||
}
|
||||
}
|
||||
@@ -1155,6 +1148,7 @@ mod tests {
|
||||
use crate::repository::Key;
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::str::FromStr;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
@@ -1264,9 +1258,9 @@ mod tests {
|
||||
let repo_dir = camino_tempfile::tempdir()?;
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_id);
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
|
||||
Ok(RedoHarness {
|
||||
_repo_dir: repo_dir,
|
||||
|
||||
@@ -35,7 +35,8 @@
|
||||
|
||||
#define PageStoreTrace DEBUG5
|
||||
|
||||
#define RECONNECT_INTERVAL_USEC 1000000
|
||||
#define MIN_RECONNECT_INTERVAL_USEC 1000
|
||||
#define MAX_RECONNECT_INTERVAL_USEC 1000000
|
||||
|
||||
bool connected = false;
|
||||
PGconn *pageserver_conn = NULL;
|
||||
@@ -133,6 +134,11 @@ pageserver_connect(int elevel)
|
||||
const char *values[3];
|
||||
int n;
|
||||
|
||||
static TimestampTz last_connect_time = 0;
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
|
||||
Assert(!connected);
|
||||
|
||||
if (CheckConnstringUpdated())
|
||||
@@ -140,6 +146,22 @@ pageserver_connect(int elevel)
|
||||
ReloadConnstring();
|
||||
}
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_connect = now - last_connect_time;
|
||||
if (us_since_last_connect < delay_us)
|
||||
{
|
||||
pg_usleep(delay_us - us_since_last_connect);
|
||||
delay_us *= 2;
|
||||
if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
|
||||
delay_us = MAX_RECONNECT_INTERVAL_USEC;
|
||||
last_connect_time = GetCurrentTimestamp();
|
||||
}
|
||||
else
|
||||
{
|
||||
delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
last_connect_time = now;
|
||||
}
|
||||
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
@@ -333,7 +355,6 @@ pageserver_send(NeonRequest *request)
|
||||
{
|
||||
HandleMainLoopInterrupts();
|
||||
n_reconnect_attempts += 1;
|
||||
pg_usleep(RECONNECT_INTERVAL_USEC);
|
||||
}
|
||||
n_reconnect_attempts = 0;
|
||||
}
|
||||
|
||||
@@ -99,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
port = strchr(host, ':');
|
||||
if (port == NULL)
|
||||
{
|
||||
walprop_log(FATAL, "port is not specified");
|
||||
wp_log(FATAL, "port is not specified");
|
||||
}
|
||||
*port++ = '\0';
|
||||
sep = strchr(port, ',');
|
||||
@@ -107,7 +107,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
*sep++ = '\0';
|
||||
if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
|
||||
{
|
||||
walprop_log(FATAL, "Too many safekeepers");
|
||||
wp_log(FATAL, "too many safekeepers");
|
||||
}
|
||||
wp->safekeeper[wp->n_safekeepers].host = host;
|
||||
wp->safekeeper[wp->n_safekeepers].port = port;
|
||||
@@ -123,7 +123,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
"host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
}
|
||||
|
||||
initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
|
||||
@@ -133,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
}
|
||||
if (wp->n_safekeepers < 1)
|
||||
{
|
||||
walprop_log(FATAL, "Safekeepers addresses are not specified");
|
||||
wp_log(FATAL, "safekeepers addresses are not specified");
|
||||
}
|
||||
wp->quorum = wp->n_safekeepers / 2 + 1;
|
||||
|
||||
@@ -144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
|
||||
wp->greetRequest.systemId = wp->config->systemId;
|
||||
if (!wp->config->neon_timeline)
|
||||
walprop_log(FATAL, "neon.timeline_id is not provided");
|
||||
wp_log(FATAL, "neon.timeline_id is not provided");
|
||||
if (*wp->config->neon_timeline != '\0' &&
|
||||
!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
|
||||
walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
|
||||
wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
|
||||
if (!wp->config->neon_tenant)
|
||||
walprop_log(FATAL, "neon.tenant_id is not provided");
|
||||
wp_log(FATAL, "neon.tenant_id is not provided");
|
||||
if (*wp->config->neon_tenant != '\0' &&
|
||||
!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
|
||||
walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);
|
||||
wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
|
||||
|
||||
wp->greetRequest.timeline = wp->config->pgTimeline;
|
||||
wp->greetRequest.walSegSize = wp->config->wal_segment_size;
|
||||
@@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp)
|
||||
if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
|
||||
wp->config->safekeeper_connection_timeout))
|
||||
{
|
||||
walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
|
||||
wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
|
||||
ShutdownConnection(sk);
|
||||
}
|
||||
}
|
||||
@@ -356,8 +356,8 @@ ResetConnection(Safekeeper *sk)
|
||||
*
|
||||
* https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
|
||||
*/
|
||||
walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
|
||||
/*
|
||||
* Even though the connection failed, we still need to clean up the
|
||||
@@ -380,7 +380,7 @@ ResetConnection(Safekeeper *sk)
|
||||
* (see libpqrcv_connect, defined in
|
||||
* src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
|
||||
*/
|
||||
walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
|
||||
wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
|
||||
|
||||
sk->state = SS_CONNECTING_WRITE;
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
@@ -434,7 +434,7 @@ ReconnectSafekeepers(WalProposer *wp)
|
||||
static void
|
||||
AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */
|
||||
#ifdef WALPROPOSER_LIB /* wp_log needs wp in lib build */
|
||||
WalProposer *wp = sk->wp;
|
||||
#endif
|
||||
|
||||
@@ -452,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* ResetConnection
|
||||
*/
|
||||
case SS_OFFLINE:
|
||||
walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
|
||||
sk->host, sk->port);
|
||||
wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline",
|
||||
sk->host, sk->port);
|
||||
break; /* actually unreachable, but prevents
|
||||
* -Wimplicit-fallthrough */
|
||||
|
||||
@@ -488,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* requests.
|
||||
*/
|
||||
case SS_VOTING:
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
|
||||
@@ -517,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
* Idle state for waiting votes from quorum.
|
||||
*/
|
||||
case SS_IDLE:
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
|
||||
@@ -543,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
switch (result)
|
||||
{
|
||||
case WP_CONN_POLLING_OK:
|
||||
walprop_log(LOG, "connected with node %s:%s", sk->host,
|
||||
sk->port);
|
||||
wp_log(LOG, "connected with node %s:%s", sk->host,
|
||||
sk->port);
|
||||
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
|
||||
|
||||
/*
|
||||
@@ -567,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
break;
|
||||
|
||||
case WP_CONN_POLLING_FAILED:
|
||||
walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to connect to node '%s:%s': %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
|
||||
/*
|
||||
* If connecting failed, we don't want to restart the connection
|
||||
@@ -604,8 +604,8 @@ SendStartWALPush(Safekeeper *sk)
|
||||
|
||||
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
|
||||
{
|
||||
walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
}
|
||||
@@ -641,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
break;
|
||||
|
||||
case WP_EXEC_FAILED:
|
||||
walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s",
|
||||
sk->host, sk->port, wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
|
||||
@@ -652,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk)
|
||||
* wrong"
|
||||
*/
|
||||
case WP_EXEC_UNEXPECTED_SUCCESS:
|
||||
walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
|
||||
sk->host, sk->port);
|
||||
wp_log(WARNING, "received bad response from safekeeper %s:%s query execution",
|
||||
sk->host, sk->port);
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
}
|
||||
@@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
|
||||
return;
|
||||
|
||||
walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
|
||||
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
|
||||
|
||||
/* Protocol is all good, move to voting. */
|
||||
sk->state = SS_VOTING;
|
||||
@@ -708,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
if (wp->n_connected == wp->quorum)
|
||||
{
|
||||
wp->propTerm++;
|
||||
walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
|
||||
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
|
||||
|
||||
wp->voteRequest = (VoteRequest)
|
||||
{
|
||||
@@ -721,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
else if (sk->greetResponse.term > wp->propTerm)
|
||||
{
|
||||
/* Another compute with higher term is running. */
|
||||
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->greetResponse.term, wp->propTerm);
|
||||
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->greetResponse.term, wp->propTerm);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -763,7 +763,7 @@ SendVoteRequest(Safekeeper *sk)
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
/* We have quorum for voting, send our vote request */
|
||||
walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
|
||||
wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
|
||||
/* On failure, logging & resetting is handled */
|
||||
if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
|
||||
return;
|
||||
@@ -780,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
|
||||
return;
|
||||
|
||||
walprop_log(LOG,
|
||||
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
|
||||
wp_log(LOG,
|
||||
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
|
||||
|
||||
/*
|
||||
* In case of acceptor rejecting our vote, bail out, but only if either it
|
||||
@@ -795,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
if ((!sk->voteResponse.voteGiven) &&
|
||||
(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
|
||||
{
|
||||
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->voteResponse.term, wp->propTerm);
|
||||
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->voteResponse.term, wp->propTerm);
|
||||
}
|
||||
Assert(sk->voteResponse.term == wp->propTerm);
|
||||
|
||||
@@ -841,7 +841,7 @@ HandleElectedProposer(WalProposer *wp)
|
||||
*/
|
||||
if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
|
||||
{
|
||||
walprop_log(FATAL, "failed to download WAL for logical replicaiton");
|
||||
wp_log(FATAL, "failed to download WAL for logical replicaiton");
|
||||
}
|
||||
|
||||
if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
|
||||
@@ -948,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (wp->timelineStartLsn != InvalidXLogRecPtr &&
|
||||
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
|
||||
{
|
||||
walprop_log(WARNING,
|
||||
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->timelineStartLsn),
|
||||
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
|
||||
wp_log(WARNING,
|
||||
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->timelineStartLsn),
|
||||
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
|
||||
}
|
||||
wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
|
||||
}
|
||||
@@ -969,7 +969,7 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
{
|
||||
wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
|
||||
}
|
||||
walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -996,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
|
||||
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;
|
||||
|
||||
walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
|
||||
wp->quorum,
|
||||
wp->propTerm,
|
||||
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
|
||||
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn));
|
||||
wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
|
||||
wp->quorum,
|
||||
wp->propTerm,
|
||||
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
|
||||
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn));
|
||||
|
||||
/*
|
||||
* Ensure the basebackup we are running (at RedoStartLsn) matches LSN
|
||||
@@ -1034,10 +1034,10 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
* scenario.
|
||||
*/
|
||||
disable_core_dump();
|
||||
walprop_log(PANIC,
|
||||
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
|
||||
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
|
||||
wp_log(PANIC,
|
||||
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
|
||||
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
|
||||
}
|
||||
}
|
||||
walprop_shared->mineLastElectedTerm = wp->propTerm;
|
||||
@@ -1091,34 +1091,10 @@ SendProposerElected(Safekeeper *sk)
|
||||
{
|
||||
/* safekeeper is empty or no common point, start from the beginning */
|
||||
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
|
||||
|
||||
if (sk->startStreamingAt < wp->truncateLsn)
|
||||
{
|
||||
/*
|
||||
* There's a gap between the WAL starting point and a truncateLsn,
|
||||
* which can't appear in a normal working cluster. That gap means
|
||||
* that all safekeepers reported that they have persisted WAL up
|
||||
* to the truncateLsn before, but now current safekeeper tells
|
||||
* otherwise.
|
||||
*
|
||||
* Also we have a special condition here, which is empty
|
||||
* safekeeper with no history. In combination with a gap, that can
|
||||
* happen when we introduce a new safekeeper to the cluster. This
|
||||
* is a rare case, which is triggered manually for now, and should
|
||||
* be treated with care.
|
||||
*/
|
||||
|
||||
/*
|
||||
* truncateLsn will not change without ack from current
|
||||
* safekeeper, and it's aligned to the WAL record, so we can
|
||||
* safely start streaming from this point.
|
||||
*/
|
||||
sk->startStreamingAt = wp->truncateLsn;
|
||||
|
||||
walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
|
||||
LSN_FORMAT_ARGS(sk->startStreamingAt));
|
||||
}
|
||||
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
|
||||
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
|
||||
/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
|
||||
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1141,7 +1117,7 @@ SendProposerElected(Safekeeper *sk)
|
||||
}
|
||||
}
|
||||
|
||||
Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn);
|
||||
Assert(sk->startStreamingAt <= wp->availableLsn);
|
||||
|
||||
msg.tag = 'e';
|
||||
msg.term = wp->propTerm;
|
||||
@@ -1150,9 +1126,9 @@ SendProposerElected(Safekeeper *sk)
|
||||
msg.timelineStartLsn = wp->timelineStartLsn;
|
||||
|
||||
lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
|
||||
walprop_log(LOG,
|
||||
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
|
||||
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
|
||||
wp_log(LOG,
|
||||
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
|
||||
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
pq_sendint64_le(&sk->outbuf, msg.tag);
|
||||
@@ -1261,8 +1237,8 @@ HandleActiveState(Safekeeper *sk, uint32 events)
|
||||
/* expected never to happen, c.f. walprop_pg_active_state_update_event_set */
|
||||
if (events & WL_SOCKET_CLOSED)
|
||||
{
|
||||
walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
|
||||
sk->host, sk->port);
|
||||
wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
|
||||
sk->host, sk->port);
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
}
|
||||
@@ -1323,12 +1299,12 @@ SendAppendRequests(Safekeeper *sk)
|
||||
req = &sk->appendRequest;
|
||||
PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
|
||||
|
||||
walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
|
||||
req->endLsn - req->beginLsn,
|
||||
LSN_FORMAT_ARGS(req->beginLsn),
|
||||
LSN_FORMAT_ARGS(req->endLsn),
|
||||
LSN_FORMAT_ARGS(req->commitLsn),
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
|
||||
wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
|
||||
req->endLsn - req->beginLsn,
|
||||
LSN_FORMAT_ARGS(req->beginLsn),
|
||||
LSN_FORMAT_ARGS(req->endLsn),
|
||||
LSN_FORMAT_ARGS(req->commitLsn),
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
|
||||
@@ -1355,8 +1331,8 @@ SendAppendRequests(Safekeeper *sk)
|
||||
case NEON_WALREAD_WOULDBLOCK:
|
||||
return true;
|
||||
case NEON_WALREAD_ERROR:
|
||||
walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
|
||||
sk->host, sk->port, errmsg);
|
||||
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
|
||||
sk->host, sk->port, errmsg);
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1388,9 +1364,9 @@ SendAppendRequests(Safekeeper *sk)
|
||||
return true;
|
||||
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1429,11 +1405,11 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse))
|
||||
break;
|
||||
|
||||
walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
|
||||
sk->appendResponse.term,
|
||||
LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
|
||||
LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
|
||||
sk->host, sk->port);
|
||||
wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
|
||||
sk->appendResponse.term,
|
||||
LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
|
||||
LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
|
||||
sk->host, sk->port);
|
||||
|
||||
if (sk->appendResponse.term > wp->propTerm)
|
||||
{
|
||||
@@ -1443,9 +1419,9 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
* core as this is kinda expected scenario.
|
||||
*/
|
||||
disable_core_dump();
|
||||
walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->appendResponse.term, wp->propTerm);
|
||||
wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
|
||||
sk->host, sk->port,
|
||||
sk->appendResponse.term, wp->propTerm);
|
||||
}
|
||||
|
||||
readAnything = true;
|
||||
@@ -1489,32 +1465,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->currentClusterSize = pq_getmsgint64(reply_message);
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
|
||||
rf->currentClusterSize);
|
||||
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
|
||||
rf->currentClusterSize);
|
||||
}
|
||||
else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->last_received_lsn = pq_getmsgint64(reply_message);
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn));
|
||||
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
|
||||
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
|
||||
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
|
||||
{
|
||||
@@ -1526,8 +1502,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
|
||||
|
||||
/* Copy because timestamptz_to_str returns a static buffer */
|
||||
replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
|
||||
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
|
||||
rf->replytime, replyTimeStr);
|
||||
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
|
||||
rf->replytime, replyTimeStr);
|
||||
|
||||
pfree(replyTimeStr);
|
||||
}
|
||||
@@ -1541,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
|
||||
* Skip unknown keys to support backward compatibile protocol
|
||||
* changes
|
||||
*/
|
||||
walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
pq_getmsgbytes(reply_message, len);
|
||||
};
|
||||
}
|
||||
@@ -1606,7 +1582,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
|
||||
|
||||
if (wp->n_votes < wp->quorum)
|
||||
{
|
||||
walprop_log(WARNING, "GetDonor called before elections are won");
|
||||
wp_log(WARNING, "GetDonor called before elections are won");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -1734,9 +1710,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
return false;
|
||||
|
||||
case PG_ASYNC_READ_FAIL:
|
||||
walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
}
|
||||
@@ -1774,8 +1750,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
tag = pq_getmsgint64_le(&s);
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
@@ -1851,9 +1827,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
|
||||
|
||||
if (!wp->api.conn_blocking_write(sk, msg, msg_size))
|
||||
{
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
}
|
||||
@@ -1904,9 +1880,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
|
||||
wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
|
||||
return false;
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1943,9 +1919,9 @@ AsyncFlush(Safekeeper *sk)
|
||||
/* Nothing to do; try again when the socket's ready */
|
||||
return false;
|
||||
case -1:
|
||||
walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
@@ -1974,11 +1950,11 @@ CompareLsn(const void *a, const void *b)
|
||||
*
|
||||
* The strings are intended to be used as a prefix to "state", e.g.:
|
||||
*
|
||||
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
|
||||
* wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
|
||||
*
|
||||
* If this sort of phrasing doesn't fit the message, instead use something like:
|
||||
*
|
||||
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
|
||||
* wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
|
||||
*/
|
||||
static char *
|
||||
FormatSafekeeperState(Safekeeper *sk)
|
||||
@@ -2059,8 +2035,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
* To give a descriptive message in the case of failure, we use elog
|
||||
* and then an assertion that's guaranteed to fail.
|
||||
*/
|
||||
walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
|
||||
wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
|
||||
Assert(events_ok_for_state);
|
||||
}
|
||||
}
|
||||
@@ -2199,8 +2175,8 @@ FormatEvents(WalProposer *wp, uint32 events)
|
||||
|
||||
if (events & (~all_flags))
|
||||
{
|
||||
walprop_log(WARNING, "Event formatting found unexpected component %d",
|
||||
events & (~all_flags));
|
||||
wp_log(WARNING, "event formatting found unexpected component %d",
|
||||
events & (~all_flags));
|
||||
return_str[6] = '*';
|
||||
return_str[7] = '\0';
|
||||
}
|
||||
|
||||
@@ -707,11 +707,23 @@ extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
|
||||
#define WPEVENT 1337 /* special log level for walproposer internal
|
||||
* events */
|
||||
|
||||
#define WP_LOG_PREFIX "[WP] "
|
||||
|
||||
/*
|
||||
* wp_log is used in pure wp code (walproposer.c), allowing API callback to
|
||||
* catch logging.
|
||||
*/
|
||||
#ifdef WALPROPOSER_LIB
|
||||
extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
|
||||
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
|
||||
#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__)
|
||||
#else
|
||||
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
|
||||
#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* And wpg_log is used all other (postgres specific) walproposer code, just
|
||||
* adding prefix.
|
||||
*/
|
||||
#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
|
||||
|
||||
#endif /* __NEON_WALPROPOSER_H__ */
|
||||
|
||||
@@ -424,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
|
||||
{
|
||||
StartReplicationCmd cmd;
|
||||
|
||||
elog(LOG, "WAL proposer starts streaming at %X/%X",
|
||||
LSN_FORMAT_ARGS(startpos));
|
||||
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
|
||||
LSN_FORMAT_ARGS(startpos));
|
||||
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
|
||||
cmd.timeline = wp->greetRequest.timeline;
|
||||
cmd.startpoint = startpos;
|
||||
@@ -549,7 +549,7 @@ walprop_pg_load_libpqwalreceiver(void)
|
||||
{
|
||||
load_file("libpqwalreceiver", false);
|
||||
if (WalReceiverFunctions == NULL)
|
||||
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
}
|
||||
|
||||
/* Helper function */
|
||||
@@ -630,7 +630,7 @@ libpqwp_connect_start(char *conninfo)
|
||||
* PGconn structure"
|
||||
*/
|
||||
if (!pg_conn)
|
||||
elog(FATAL, "failed to allocate new PGconn object");
|
||||
wpg_log(FATAL, "failed to allocate new PGconn object");
|
||||
|
||||
/*
|
||||
* And in theory this allocation can fail as well, but it's incredibly
|
||||
@@ -680,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk)
|
||||
* unused. We'll expect it's never returned.
|
||||
*/
|
||||
case PGRES_POLLING_ACTIVE:
|
||||
elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
|
||||
wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
|
||||
|
||||
/*
|
||||
* This return is never actually reached, but it's here to make
|
||||
@@ -745,7 +745,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
|
||||
*/
|
||||
if (!result)
|
||||
{
|
||||
elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
|
||||
wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
|
||||
return WP_EXEC_UNEXPECTED_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -793,7 +793,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
|
||||
}
|
||||
|
||||
if (unexpected_success)
|
||||
elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
|
||||
wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
|
||||
|
||||
return return_val;
|
||||
}
|
||||
@@ -872,7 +872,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
|
||||
|
||||
if (status != PGRES_FATAL_ERROR)
|
||||
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
||||
wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
||||
|
||||
/*
|
||||
* If there was actually an error, it'll be properly reported
|
||||
@@ -937,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
case -1:
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
default:
|
||||
elog(FATAL, "invalid return %d from PQputCopyData", result);
|
||||
wpg_log(FATAL, "invalid return %d from PQputCopyData", result);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -958,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
case -1:
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
default:
|
||||
elog(FATAL, "invalid return %d from PQflush", result);
|
||||
wpg_log(FATAL, "invalid return %d from PQflush", result);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1237,19 +1237,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
||||
return true; /* recovery not needed */
|
||||
endpos = wp->propEpochStartLsn;
|
||||
|
||||
/*
|
||||
* If we need to download more than a max_slot_wal_keep_size, cap to it to
|
||||
* avoid risk of exploding pg_wal. Logical replication won't work until
|
||||
* recreated, but at least compute would start; this also follows
|
||||
* max_slot_wal_keep_size semantics.
|
||||
*/
|
||||
download_range_mb = (endpos - startpos) / 1024 / 1024;
|
||||
if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
|
||||
{
|
||||
startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024;
|
||||
walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
|
||||
LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
|
||||
}
|
||||
timeline = wp->greetRequest.timeline;
|
||||
|
||||
if (!neon_auth_token)
|
||||
@@ -1262,7 +1249,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
||||
|
||||
written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
elog(FATAL, "could not append password to the safekeeper connection string");
|
||||
wpg_log(FATAL, "could not append password to the safekeeper connection string");
|
||||
}
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
@@ -1279,11 +1266,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
||||
err)));
|
||||
return false;
|
||||
}
|
||||
elog(LOG,
|
||||
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
|
||||
"%d",
|
||||
sk->host, sk->port, (uint32) (startpos >> 32),
|
||||
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
|
||||
wpg_log(LOG,
|
||||
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
|
||||
"%d",
|
||||
sk->host, sk->port, (uint32) (startpos >> 32),
|
||||
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
|
||||
|
||||
options.logical = false;
|
||||
options.startpoint = startpos;
|
||||
@@ -1481,11 +1468,11 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
|
||||
{
|
||||
char log_prefix[64];
|
||||
|
||||
snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
|
||||
snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
|
||||
Assert(!sk->xlogreader);
|
||||
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
|
||||
if (sk->xlogreader == NULL)
|
||||
elog(FATAL, "Failed to allocate xlog reader");
|
||||
wpg_log(FATAL, "failed to allocate xlog reader");
|
||||
}
|
||||
|
||||
static NeonWALReadResult
|
||||
@@ -1549,7 +1536,7 @@ static void
|
||||
walprop_pg_init_event_set(WalProposer *wp)
|
||||
{
|
||||
if (waitEvents)
|
||||
elog(FATAL, "double-initialization of event set");
|
||||
wpg_log(FATAL, "double-initialization of event set");
|
||||
|
||||
/* for each sk, we have socket plus potentially socket for neon walreader */
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
|
||||
@@ -1581,7 +1568,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events)
|
||||
Assert(sk->nwrEventPos == -1);
|
||||
sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
|
||||
sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
|
||||
elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
|
||||
wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -1680,8 +1667,8 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
|
||||
{
|
||||
WalProposer *wp = to_remove->wp;
|
||||
|
||||
elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
|
||||
to_remove->host, to_remove->port, is_sk);
|
||||
wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d",
|
||||
to_remove->host, to_remove->port, is_sk);
|
||||
|
||||
/*
|
||||
* Shortpath for exiting if have nothing to do. We never call this
|
||||
@@ -1835,13 +1822,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
|
||||
rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
|
||||
rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;
|
||||
|
||||
elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
|
||||
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
|
||||
rf->currentClusterSize,
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn),
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
|
||||
rf->replytime);
|
||||
wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
|
||||
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
|
||||
rf->currentClusterSize,
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn),
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
|
||||
rf->replytime);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1987,7 +1974,7 @@ GetLogRepRestartLSN(WalProposer *wp)
|
||||
{
|
||||
uint64 download_range_mb;
|
||||
|
||||
elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
|
||||
wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
|
||||
|
||||
/*
|
||||
* If we need to download more than a max_slot_wal_keep_size,
|
||||
@@ -1999,8 +1986,8 @@ GetLogRepRestartLSN(WalProposer *wp)
|
||||
download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
|
||||
if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
|
||||
{
|
||||
walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
|
||||
LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
|
||||
wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
|
||||
LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
|
||||
return InvalidXLogRecPtr;
|
||||
}
|
||||
|
||||
|
||||
49
poetry.lock
generated
49
poetry.lock
generated
@@ -339,19 +339,19 @@ uvloop = ["uvloop (>=0.15.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "boto3"
|
||||
version = "1.26.16"
|
||||
version = "1.34.11"
|
||||
description = "The AWS SDK for Python"
|
||||
optional = false
|
||||
python-versions = ">= 3.7"
|
||||
python-versions = ">= 3.8"
|
||||
files = [
|
||||
{file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"},
|
||||
{file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"},
|
||||
{file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"},
|
||||
{file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
botocore = ">=1.29.16,<1.30.0"
|
||||
botocore = ">=1.34.11,<1.35.0"
|
||||
jmespath = ">=0.7.1,<2.0.0"
|
||||
s3transfer = ">=0.6.0,<0.7.0"
|
||||
s3transfer = ">=0.10.0,<0.11.0"
|
||||
|
||||
[package.extras]
|
||||
crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
|
||||
@@ -702,22 +702,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "botocore"
|
||||
version = "1.29.16"
|
||||
version = "1.34.11"
|
||||
description = "Low-level, data-driven core of boto 3."
|
||||
optional = false
|
||||
python-versions = ">= 3.7"
|
||||
python-versions = ">= 3.8"
|
||||
files = [
|
||||
{file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"},
|
||||
{file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"},
|
||||
{file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"},
|
||||
{file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
jmespath = ">=0.7.1,<2.0.0"
|
||||
python-dateutil = ">=2.1,<3.0.0"
|
||||
urllib3 = ">=1.25.4,<1.27"
|
||||
urllib3 = [
|
||||
{version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
|
||||
{version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
crt = ["awscrt (==0.14.0)"]
|
||||
crt = ["awscrt (==0.19.19)"]
|
||||
|
||||
[[package]]
|
||||
name = "botocore-stubs"
|
||||
@@ -1889,13 +1892,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "pytest"
|
||||
version = "7.3.1"
|
||||
version = "7.4.4"
|
||||
description = "pytest: simple powerful testing with Python"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
|
||||
{file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
|
||||
{file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
|
||||
{file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -1907,7 +1910,7 @@ pluggy = ">=0.12,<2.0"
|
||||
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
|
||||
|
||||
[package.extras]
|
||||
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
|
||||
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-asyncio"
|
||||
@@ -2230,20 +2233,20 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "s3transfer"
|
||||
version = "0.6.0"
|
||||
version = "0.10.0"
|
||||
description = "An Amazon S3 Transfer Manager"
|
||||
optional = false
|
||||
python-versions = ">= 3.7"
|
||||
python-versions = ">= 3.8"
|
||||
files = [
|
||||
{file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"},
|
||||
{file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"},
|
||||
{file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"},
|
||||
{file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
botocore = ">=1.12.36,<2.0a.0"
|
||||
botocore = ">=1.33.2,<2.0a.0"
|
||||
|
||||
[package.extras]
|
||||
crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]
|
||||
crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "sarif-om"
|
||||
@@ -2740,4 +2743,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
|
||||
content-hash = "8de8b05a9b35a6f76da7d7e3652ddbb521f1eca53fce7b933f537080a9d6eada"
|
||||
|
||||
@@ -6,7 +6,7 @@ authors = []
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
pytest = "^7.3.1"
|
||||
pytest = "^7.4.4"
|
||||
psycopg2-binary = "^2.9.6"
|
||||
typing-extensions = "^4.6.1"
|
||||
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
|
||||
@@ -17,7 +17,7 @@ aiopg = "^1.4.0"
|
||||
Jinja2 = "^3.0.2"
|
||||
types-requests = "^2.31.0.0"
|
||||
types-psycopg2 = "^2.9.21.10"
|
||||
boto3 = "^1.26.16"
|
||||
boto3 = "^1.34.11"
|
||||
boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
|
||||
moto = {extras = ["server"], version = "^4.1.2"}
|
||||
backoff = "^2.2.1"
|
||||
|
||||
@@ -6,6 +6,7 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
aws-sdk-s3.workspace = true
|
||||
aws-smithy-async.workspace = true
|
||||
either.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
anyhow.workspace = true
|
||||
@@ -39,3 +40,5 @@ tracing-subscriber.workspace = true
|
||||
clap.workspace = true
|
||||
tracing-appender = "0.2"
|
||||
histogram = "0.7"
|
||||
|
||||
futures.workspace = true
|
||||
|
||||
@@ -16,10 +16,12 @@ use aws_config::environment::EnvironmentVariableCredentialsProvider;
|
||||
use aws_config::imds::credentials::ImdsCredentialsProvider;
|
||||
use aws_config::meta::credentials::CredentialsProviderChain;
|
||||
use aws_config::profile::ProfileFileCredentialsProvider;
|
||||
use aws_config::retry::RetryConfig;
|
||||
use aws_config::sso::SsoCredentialsProvider;
|
||||
use aws_config::BehaviorVersion;
|
||||
use aws_sdk_s3::config::Region;
|
||||
use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep};
|
||||
use aws_sdk_s3::{Client, Config};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use clap::ValueEnum;
|
||||
use pageserver::tenant::TENANTS_SEGMENT_NAME;
|
||||
@@ -283,9 +285,13 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
|
||||
)
|
||||
};
|
||||
|
||||
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
|
||||
|
||||
let mut builder = Config::builder()
|
||||
.behavior_version(BehaviorVersion::v2023_11_09())
|
||||
.region(bucket_region)
|
||||
.retry_config(RetryConfig::adaptive().with_max_attempts(3))
|
||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
|
||||
.credentials_provider(credentials_provider);
|
||||
|
||||
if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
|
||||
use s3_scrubber::scan_metadata::scan_metadata;
|
||||
use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
|
||||
@@ -34,6 +35,8 @@ enum Command {
|
||||
ScanMetadata {
|
||||
#[arg(short, long, default_value_t = false)]
|
||||
json: bool,
|
||||
#[arg(long = "tenant-id", num_args = 0..)]
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -57,35 +60,37 @@ async fn main() -> anyhow::Result<()> {
|
||||
));
|
||||
|
||||
match cli.command {
|
||||
Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
Err(e)
|
||||
}
|
||||
Ok(summary) => {
|
||||
if json {
|
||||
println!("{}", serde_json::to_string(&summary).unwrap())
|
||||
} else {
|
||||
println!("{}", summary.summary_string());
|
||||
Command::ScanMetadata { json, tenant_ids } => {
|
||||
match scan_metadata(bucket_config.clone(), tenant_ids).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
Err(e)
|
||||
}
|
||||
if summary.is_fatal() {
|
||||
Err(anyhow::anyhow!("Fatal scrub errors detected"))
|
||||
} else if summary.is_empty() {
|
||||
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
|
||||
// scrubber they were likely expecting to scan something, and if we see no timelines
|
||||
// at all then it's likely due to some configuration issues like a bad prefix
|
||||
Err(anyhow::anyhow!(
|
||||
"No timelines found in bucket {} prefix {}",
|
||||
bucket_config.bucket,
|
||||
bucket_config
|
||||
.prefix_in_bucket
|
||||
.unwrap_or("<none>".to_string())
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
Ok(summary) => {
|
||||
if json {
|
||||
println!("{}", serde_json::to_string(&summary).unwrap())
|
||||
} else {
|
||||
println!("{}", summary.summary_string());
|
||||
}
|
||||
if summary.is_fatal() {
|
||||
Err(anyhow::anyhow!("Fatal scrub errors detected"))
|
||||
} else if summary.is_empty() {
|
||||
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
|
||||
// scrubber they were likely expecting to scan something, and if we see no timelines
|
||||
// at all then it's likely due to some configuration issues like a bad prefix
|
||||
Err(anyhow::anyhow!(
|
||||
"No timelines found in bucket {} prefix {}",
|
||||
bucket_config.bucket,
|
||||
bucket_config
|
||||
.prefix_in_bucket
|
||||
.unwrap_or("<none>".to_string())
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
Command::FindGarbage {
|
||||
node_kind,
|
||||
depth,
|
||||
|
||||
@@ -187,10 +187,17 @@ Timeline layer count: {6}
|
||||
}
|
||||
|
||||
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
|
||||
pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<MetadataSummary> {
|
||||
pub async fn scan_metadata(
|
||||
bucket_config: BucketConfig,
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
) -> anyhow::Result<MetadataSummary> {
|
||||
let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?;
|
||||
|
||||
let tenants = stream_tenants(&s3_client, &target);
|
||||
let tenants = if tenant_ids.is_empty() {
|
||||
futures::future::Either::Left(stream_tenants(&s3_client, &target))
|
||||
} else {
|
||||
futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
|
||||
};
|
||||
|
||||
// How many tenants to process in parallel. We need to be mindful of pageservers
|
||||
// accessing the same per tenant prefixes, so use a lower setting than pageservers.
|
||||
|
||||
@@ -4,6 +4,12 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
async-stream.workspace = true
|
||||
anyhow.workspace = true
|
||||
@@ -16,6 +22,7 @@ chrono.workspace = true
|
||||
clap = { workspace = true, features = ["derive"] }
|
||||
const_format.workspace = true
|
||||
crc32c.workspace = true
|
||||
fail.workspace = true
|
||||
fs2.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
|
||||
@@ -54,6 +54,19 @@ const ID_FILE_NAME: &str = "safekeeper.id";
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
];
|
||||
|
||||
fn version() -> String {
|
||||
format!(
|
||||
"{GIT_VERSION} failpoints: {}, features: {:?}",
|
||||
fail::has_failpoints(),
|
||||
FEATURES,
|
||||
)
|
||||
}
|
||||
|
||||
const ABOUT: &str = r#"
|
||||
A fleet of safekeepers is responsible for reliably storing WAL received from
|
||||
compute, passing it through consensus (mitigating potential computes brain
|
||||
@@ -167,7 +180,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
// getting 'argument cannot be used multiple times' error. This seems to be
|
||||
// impossible with pure Derive API, so convert struct to Command, modify it,
|
||||
// parse arguments, and then fill the struct back.
|
||||
let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
|
||||
let cmd = <Args as clap::CommandFactory>::command()
|
||||
.args_override_self(true)
|
||||
.version(version());
|
||||
let mut matches = cmd.get_matches();
|
||||
let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
|
||||
|
||||
|
||||
@@ -12,6 +12,8 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
@@ -444,6 +446,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
.data(Arc::new(conf))
|
||||
.data(auth)
|
||||
.get("/v1/status", |r| request_span(r, status_handler))
|
||||
.put("/v1/failpoints", |r| {
|
||||
request_span(r, move |r| async {
|
||||
let cancel = CancellationToken::new();
|
||||
failpoints_handler(r, cancel).await
|
||||
})
|
||||
})
|
||||
// Will be used in the future instead of implicit timeline creation
|
||||
.post("/v1/tenant/timeline", |r| {
|
||||
request_span(r, timeline_create_handler)
|
||||
|
||||
@@ -17,6 +17,7 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
|
||||
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use utils::failpoint_support;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::AtomicLsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
@@ -391,15 +392,8 @@ impl SafekeeperPostgresHandler {
|
||||
// application_name: give only committed WAL (used by pageserver) or all
|
||||
// existing WAL (up to flush_lsn, used by walproposer or peer recovery).
|
||||
// The second case is always driven by a consensus leader which term
|
||||
// must generally be also supplied. However we're sloppy to do this in
|
||||
// walproposer recovery which will be removed soon. So TODO is to make
|
||||
// it not Option'al then.
|
||||
//
|
||||
// Fetching WAL without term in recovery creates a small risk of this
|
||||
// WAL getting concurrently garbaged if another compute rises which
|
||||
// collects majority and starts fixing log on this safekeeper itself.
|
||||
// That's ok as (old) proposer will never be able to commit such WAL.
|
||||
let end_watch = if self.is_walproposer_recovery() {
|
||||
// must be supplied.
|
||||
let end_watch = if term.is_some() {
|
||||
EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
|
||||
} else {
|
||||
EndWatch::Commit(tli.get_commit_lsn_watch_rx())
|
||||
@@ -535,12 +529,19 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
);
|
||||
|
||||
// try to send as much as available, capped by MAX_SEND_SIZE
|
||||
let mut send_size = self
|
||||
.end_pos
|
||||
.checked_sub(self.start_pos)
|
||||
.context("reading wal without waiting for it first")?
|
||||
.0 as usize;
|
||||
send_size = min(send_size, self.send_buf.len());
|
||||
let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64;
|
||||
// if we went behind available WAL, back off
|
||||
if chunk_end_pos >= self.end_pos {
|
||||
chunk_end_pos = self.end_pos;
|
||||
} else {
|
||||
// If sending not up to end pos, round down to page boundary to
|
||||
// avoid breaking WAL record not at page boundary, as protocol
|
||||
// demands. See walsender.c (XLogSendPhysical).
|
||||
chunk_end_pos = chunk_end_pos
|
||||
.checked_sub(chunk_end_pos.block_offset())
|
||||
.unwrap();
|
||||
}
|
||||
let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize;
|
||||
let send_buf = &mut self.send_buf[..send_size];
|
||||
let send_size: usize;
|
||||
{
|
||||
@@ -551,7 +552,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// read wal into buffer
|
||||
// Read WAL into buffer. send_size can be additionally capped to
|
||||
// segment boundary here.
|
||||
send_size = self.wal_reader.read(send_buf).await?
|
||||
};
|
||||
let send_buf = &send_buf[..send_size];
|
||||
@@ -566,6 +568,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
}))
|
||||
.await?;
|
||||
|
||||
if let Some(appname) = &self.appname {
|
||||
if appname == "replica" {
|
||||
failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep");
|
||||
}
|
||||
}
|
||||
trace!(
|
||||
"sent {} bytes of WAL {}-{}",
|
||||
send_size,
|
||||
|
||||
@@ -565,6 +565,9 @@ impl WalReader {
|
||||
})
|
||||
}
|
||||
|
||||
/// Read WAL at current position into provided buf, returns number of bytes
|
||||
/// read. It can be smaller than buf size only if segment boundary is
|
||||
/// reached.
|
||||
pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
|
||||
// If this timeline is new, we may not have a full segment yet, so
|
||||
// we pad the first bytes of the timeline's first WAL segment with 0s
|
||||
|
||||
@@ -347,7 +347,9 @@ class PgProtocol:
|
||||
"""
|
||||
return self.safe_psql_many([query], **kwargs)[0]
|
||||
|
||||
def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
|
||||
def safe_psql_many(
|
||||
self, queries: List[str], log_query=True, **kwargs: Any
|
||||
) -> List[List[Tuple[Any, ...]]]:
|
||||
"""
|
||||
Execute queries against the node and return all rows.
|
||||
This method passes all extra params to connstr.
|
||||
@@ -356,7 +358,8 @@ class PgProtocol:
|
||||
with closing(self.connect(**kwargs)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
for query in queries:
|
||||
log.info(f"Executing query: {query}")
|
||||
if log_query:
|
||||
log.info(f"Executing query: {query}")
|
||||
cur.execute(query)
|
||||
|
||||
if cur.description is None:
|
||||
@@ -365,11 +368,11 @@ class PgProtocol:
|
||||
result.append(cur.fetchall())
|
||||
return result
|
||||
|
||||
def safe_psql_scalar(self, query) -> Any:
|
||||
def safe_psql_scalar(self, query, log_query=True) -> Any:
|
||||
"""
|
||||
Execute query returning single row with single column.
|
||||
"""
|
||||
return self.safe_psql(query)[0][0]
|
||||
return self.safe_psql(query, log_query=log_query)[0][0]
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -890,8 +893,8 @@ class NeonEnv:
|
||||
"""Get list of safekeeper endpoints suitable for safekeepers GUC"""
|
||||
return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
|
||||
|
||||
def get_pageserver_version(self) -> str:
|
||||
bin_pageserver = str(self.neon_binpath / "pageserver")
|
||||
def get_binary_version(self, binary_name: str) -> str:
|
||||
bin_pageserver = str(self.neon_binpath / binary_name)
|
||||
res = subprocess.run(
|
||||
[bin_pageserver, "--version"],
|
||||
check=True,
|
||||
@@ -1656,7 +1659,7 @@ class NeonPageserver(PgProtocol):
|
||||
self.running = False
|
||||
self.service_port = port
|
||||
self.config_override = config_override
|
||||
self.version = env.get_pageserver_version()
|
||||
self.version = env.get_binary_version("pageserver")
|
||||
|
||||
# After a test finishes, we will scrape the log to see if there are any
|
||||
# unexpected error messages. If your test expects an error, add it to
|
||||
@@ -2924,7 +2927,10 @@ class Safekeeper:
|
||||
return res
|
||||
|
||||
def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient:
|
||||
return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token)
|
||||
is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper")
|
||||
return SafekeeperHttpClient(
|
||||
port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled
|
||||
)
|
||||
|
||||
def data_dir(self) -> str:
|
||||
return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
|
||||
@@ -2975,10 +2981,11 @@ class SafekeeperMetrics:
|
||||
class SafekeeperHttpClient(requests.Session):
|
||||
HTTPError = requests.HTTPError
|
||||
|
||||
def __init__(self, port: int, auth_token: Optional[str] = None):
|
||||
def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
|
||||
super().__init__()
|
||||
self.port = port
|
||||
self.auth_token = auth_token
|
||||
self.is_testing_enabled = is_testing_enabled
|
||||
|
||||
if auth_token is not None:
|
||||
self.headers["Authorization"] = f"Bearer {auth_token}"
|
||||
@@ -2986,6 +2993,30 @@ class SafekeeperHttpClient(requests.Session):
|
||||
def check_status(self):
|
||||
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
|
||||
|
||||
def is_testing_enabled_or_skip(self):
|
||||
if not self.is_testing_enabled:
|
||||
pytest.skip("safekeeper was built without 'testing' feature")
|
||||
|
||||
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
if isinstance(config_strings, tuple):
|
||||
pairs = [config_strings]
|
||||
else:
|
||||
pairs = config_strings
|
||||
|
||||
log.info(f"Requesting config failpoints: {repr(pairs)}")
|
||||
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/failpoints",
|
||||
json=[{"name": name, "actions": actions} for name, actions in pairs],
|
||||
)
|
||||
log.info(f"Got failpoints request response code {res.status_code}")
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
return res_json
|
||||
|
||||
def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
|
||||
params = params or {}
|
||||
res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
|
||||
|
||||
@@ -714,7 +714,7 @@ class PageserverHttpClient(requests.Session):
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
assert res.status_code == 200
|
||||
assert res.status_code in (200, 304)
|
||||
|
||||
def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
info = self.layer_map_info(tenant_id, timeline_id)
|
||||
|
||||
@@ -1,19 +1,59 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import Endpoint, NeonEnv
|
||||
|
||||
|
||||
def wait_caughtup(primary: Endpoint, secondary: Endpoint):
|
||||
primary_lsn = primary.safe_psql_scalar(
|
||||
"SELECT pg_current_wal_insert_lsn()::text", log_query=False
|
||||
)
|
||||
while True:
|
||||
secondary_lsn = secondary.safe_psql_scalar(
|
||||
"SELECT pg_last_wal_replay_lsn()", log_query=False
|
||||
)
|
||||
caught_up = secondary_lsn >= primary_lsn
|
||||
log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}")
|
||||
if caught_up:
|
||||
return
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
# Check for corrupted WAL messages which might otherwise go unnoticed if
|
||||
# reconnection fixes this.
|
||||
def scan_standby_log_for_errors(secondary):
|
||||
log_path = secondary.endpoint_path() / "compute.log"
|
||||
with log_path.open("r") as f:
|
||||
markers = re.compile(
|
||||
r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr"
|
||||
)
|
||||
for line in f:
|
||||
if markers.search(line):
|
||||
log.info(f"bad error in standby log: {line}")
|
||||
raise AssertionError()
|
||||
|
||||
|
||||
def test_hot_standby(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
# We've had a bug caused by WAL records split across multiple XLogData
|
||||
# messages resulting in corrupted WAL complains on standby. It reproduced
|
||||
# only when sending from safekeeper is slow enough to grab full
|
||||
# MAX_SEND_SIZE messages. So insert sleep through failpoints, but only in
|
||||
# one conf to decrease test time.
|
||||
slow_down_send = "[debug-pg16]" in os.environ.get("PYTEST_CURRENT_TEST", "")
|
||||
if slow_down_send:
|
||||
sk_http = env.safekeepers[0].http_client()
|
||||
sk_http.configure_failpoints([("sk-send-wal-replica-sleep", "return(100)")])
|
||||
|
||||
with env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
endpoint_id="primary",
|
||||
) as primary:
|
||||
time.sleep(1)
|
||||
with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
|
||||
primary_lsn = None
|
||||
caught_up = False
|
||||
queries = [
|
||||
"SHOW neon.timeline_id",
|
||||
"SHOW neon.tenant_id",
|
||||
@@ -26,23 +66,6 @@ def test_hot_standby(neon_simple_env: NeonEnv):
|
||||
with p_con.cursor() as p_cur:
|
||||
p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i")
|
||||
|
||||
# Explicit commit to make sure other connections (and replicas) can
|
||||
# see the changes of this commit.
|
||||
p_con.commit()
|
||||
|
||||
with p_con.cursor() as p_cur:
|
||||
p_cur.execute("SELECT pg_current_wal_insert_lsn()::text")
|
||||
res = p_cur.fetchone()
|
||||
assert res is not None
|
||||
(lsn,) = res
|
||||
primary_lsn = lsn
|
||||
|
||||
# Explicit commit to make sure other connections (and replicas) can
|
||||
# see the changes of this commit.
|
||||
# Note that this may generate more WAL if the transaction has changed
|
||||
# things, but we don't care about that.
|
||||
p_con.commit()
|
||||
|
||||
for query in queries:
|
||||
with p_con.cursor() as p_cur:
|
||||
p_cur.execute(query)
|
||||
@@ -51,30 +74,28 @@ def test_hot_standby(neon_simple_env: NeonEnv):
|
||||
response = res
|
||||
responses[query] = response
|
||||
|
||||
# insert more data to make safekeeper send MAX_SEND_SIZE messages
|
||||
if slow_down_send:
|
||||
primary.safe_psql("create table t(key int, value text)")
|
||||
primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'")
|
||||
|
||||
wait_caughtup(primary, secondary)
|
||||
|
||||
with secondary.connect() as s_con:
|
||||
with s_con.cursor() as s_cur:
|
||||
s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()")
|
||||
res = s_cur.fetchone()
|
||||
assert res is not None
|
||||
|
||||
while not caught_up:
|
||||
with s_con.cursor() as secondary_cursor:
|
||||
secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()")
|
||||
res = secondary_cursor.fetchone()
|
||||
assert res is not None
|
||||
(secondary_lsn,) = res
|
||||
# There may be more changes on the primary after we got our LSN
|
||||
# due to e.g. autovacuum, but that shouldn't impact the content
|
||||
# of the tables, so we check whether we've replayed up to at
|
||||
# least after the commit of the `test` table.
|
||||
caught_up = secondary_lsn >= primary_lsn
|
||||
|
||||
# Explicit commit to flush any transient transaction-level state.
|
||||
s_con.commit()
|
||||
|
||||
for query in queries:
|
||||
with s_con.cursor() as secondary_cursor:
|
||||
secondary_cursor.execute(query)
|
||||
response = secondary_cursor.fetchone()
|
||||
assert response is not None
|
||||
assert response == responses[query]
|
||||
|
||||
scan_standby_log_for_errors(secondary)
|
||||
|
||||
# clean up
|
||||
if slow_down_send:
|
||||
sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off"))
|
||||
|
||||
@@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
|
||||
)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
|
||||
@@ -566,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
|
||||
f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb"
|
||||
)
|
||||
|
||||
endpoint.stop_and_destroy()
|
||||
endpoint.stop()
|
||||
timeline_delete_wait_completed(ps_http, tenant_id, timeline_id)
|
||||
|
||||
# Also delete and manually create timeline on safekeepers -- this tests
|
||||
|
||||
@@ -475,6 +475,46 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder):
|
||||
asyncio.run(run_unavailability(env, endpoint))
|
||||
|
||||
|
||||
async def run_recovery_uncommitted(env: NeonEnv):
|
||||
(sk1, sk2, _) = env.safekeepers
|
||||
|
||||
env.neon_cli.create_branch("test_recovery_uncommitted")
|
||||
ep = env.endpoints.create_start("test_recovery_uncommitted")
|
||||
ep.safe_psql("create table t(key int, value text)")
|
||||
ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
|
||||
|
||||
# insert with only one safekeeper up to create tail of flushed but not committed WAL
|
||||
sk1.stop()
|
||||
sk2.stop()
|
||||
conn = await ep.connect_async()
|
||||
# query should hang, so execute in separate task
|
||||
bg_query = asyncio.create_task(
|
||||
conn.execute("insert into t select generate_series(1, 2000), 'payload'")
|
||||
)
|
||||
sleep_sec = 2
|
||||
await asyncio.sleep(sleep_sec)
|
||||
# it must still be not finished
|
||||
assert not bg_query.done()
|
||||
# note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
|
||||
ep.stop_and_destroy()
|
||||
|
||||
# Start one of sks to make quorum online plus compute and ensure they can
|
||||
# sync.
|
||||
sk2.start()
|
||||
ep = env.endpoints.create_start(
|
||||
"test_recovery_uncommitted",
|
||||
)
|
||||
ep.safe_psql("insert into t select generate_series(1, 2000), 'payload'")
|
||||
|
||||
|
||||
# Test pulling uncommitted WAL (up to flush_lsn) during recovery.
|
||||
def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
asyncio.run(run_recovery_uncommitted(env))
|
||||
|
||||
|
||||
@dataclass
|
||||
class RaceConditionTest:
|
||||
iteration: int
|
||||
|
||||
@@ -167,22 +167,21 @@ build: |
|
||||
&& apt-get update \
|
||||
&& apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
git \
|
||||
libevent-dev \
|
||||
libssl-dev \
|
||||
patchutils \
|
||||
libtool \
|
||||
pkg-config
|
||||
|
||||
ENV PGBOUNCER_VERSION 1.21.0
|
||||
ENV PGBOUNCER_GITPATH 1_21_0
|
||||
# Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits.
|
||||
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
|
||||
ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1
|
||||
RUN set -e \
|
||||
&& curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \
|
||||
&& tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \
|
||||
&& cd pgbouncer-${PGBOUNCER_VERSION} \
|
||||
&& curl https://github.com/pgbouncer/pgbouncer/commit/a7b3c0a5f4caa9dbe92743d04cf1e28c4c05806c.patch | filterdiff --include a/src/server.c | patch -p1 \
|
||||
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \
|
||||
&& cd pgbouncer \
|
||||
&& ./autogen.sh \
|
||||
&& LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
|
||||
&& make -j $(nproc) \
|
||||
&& make install
|
||||
&& make -j $(nproc) dist_man_MANS= \
|
||||
&& make install dist_man_MANS=
|
||||
merge: |
|
||||
# tweak nofile limits
|
||||
RUN set -e \
|
||||
|
||||
Reference in New Issue
Block a user