mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 03:30:36 +00:00
Compare commits
7 Commits
log-ttid
...
problame/p
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1863a04fb0 | ||
|
|
f83a71ca6a | ||
|
|
74a634c9fa | ||
|
|
fc3f8a65b3 | ||
|
|
9f03dd24c2 | ||
|
|
dc96a7604a | ||
|
|
d7c94e67ce |
4
.github/workflows/build_and_test.yml
vendored
4
.github/workflows/build_and_test.yml
vendored
@@ -1092,10 +1092,8 @@ jobs:
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
|
||||
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
|
||||
27
Cargo.lock
generated
27
Cargo.lock
generated
@@ -158,6 +158,17 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-channel"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
|
||||
dependencies = [
|
||||
"concurrent-queue",
|
||||
"event-listener",
|
||||
"futures-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-compression"
|
||||
version = "0.4.0"
|
||||
@@ -1031,6 +1042,15 @@ dependencies = [
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "concurrent-queue"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f057a694a54f12365049b0958a1685bb52d567f5593b355fbf685838e873d400"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_format"
|
||||
version = "0.2.30"
|
||||
@@ -1452,6 +1472,12 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "2.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
|
||||
|
||||
[[package]]
|
||||
name = "fail"
|
||||
version = "0.5.1"
|
||||
@@ -2674,6 +2700,7 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-channel",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
|
||||
@@ -35,6 +35,7 @@ license = "Apache-2.0"
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-channel = "1.9.0"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
|
||||
@@ -44,7 +44,7 @@ use std::{thread, time::Duration};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use tracing::{error, info, info_span};
|
||||
use tracing::{error, info};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
@@ -57,7 +57,6 @@ use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
// in-case of not-set environment var
|
||||
@@ -250,20 +249,11 @@ fn main() -> Result<()> {
|
||||
|
||||
state.status = ComputeStatus::Init;
|
||||
compute.state_changed.notify_all();
|
||||
let pspec = state.pspec.as_ref().expect("spec must be set");
|
||||
let ttid = TenantTimelineId {
|
||||
tenant_id: pspec.tenant_id,
|
||||
timeline_id: pspec.timeline_id,
|
||||
};
|
||||
drop(state);
|
||||
|
||||
// Log ttid everywhere for easier log identification (e.g. loki agent can
|
||||
// create label on that).
|
||||
let _guard = info_span!("", ttid = %ttid).entered();
|
||||
|
||||
// Launch remaining service threads
|
||||
let _monitor_handle = launch_monitor(&compute, ttid);
|
||||
let _configurator_handle = launch_configurator(&compute, ttid);
|
||||
let _monitor_handle = launch_monitor(&compute);
|
||||
let _configurator_handle = launch_configurator(&compute);
|
||||
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
|
||||
@@ -4,13 +4,11 @@ use std::thread;
|
||||
use tracing::{error, info, instrument};
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
// Log ttid everywhere
|
||||
#[instrument(name = "", fields(ttid = %ttid), skip_all)]
|
||||
fn configurator_main_loop(compute: &Arc<ComputeNode>, ttid: TenantTimelineId) {
|
||||
#[instrument(skip_all)]
|
||||
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
info!("waiting for reconfiguration requests");
|
||||
loop {
|
||||
let state = compute.state.lock().unwrap();
|
||||
@@ -43,16 +41,13 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>, ttid: TenantTimelineId) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn launch_configurator(
|
||||
compute: &Arc<ComputeNode>,
|
||||
ttid: TenantTimelineId,
|
||||
) -> thread::JoinHandle<()> {
|
||||
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
let compute = Arc::clone(compute);
|
||||
|
||||
thread::Builder::new()
|
||||
.name("compute-configurator".into())
|
||||
.spawn(move || {
|
||||
configurator_main_loop(&compute, ttid);
|
||||
configurator_main_loop(&compute);
|
||||
info!("configurator thread is exited");
|
||||
})
|
||||
.expect("cannot launch configurator thread")
|
||||
|
||||
@@ -3,8 +3,7 @@ use std::{thread, time::Duration};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, info, instrument};
|
||||
use utils::id::TenantTimelineId;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
@@ -13,8 +12,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
// Then update it in the shared state. This function never errors out.
|
||||
// XXX: the only expected panic is at `RwLock` unwrap().
|
||||
#[instrument(name = "", fields(ttid = %ttid), skip_all)]
|
||||
fn watch_compute_activity(compute: &ComputeNode, ttid: TenantTimelineId) {
|
||||
fn watch_compute_activity(compute: &ComputeNode) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = compute.connstr.as_str();
|
||||
// Define `client` outside of the loop to reuse existing connection if it's active.
|
||||
@@ -105,11 +103,11 @@ fn watch_compute_activity(compute: &ComputeNode, ttid: TenantTimelineId) {
|
||||
}
|
||||
|
||||
/// Launch a separate compute monitor thread and return its `JoinHandle`.
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>, ttid: TenantTimelineId) -> thread::JoinHandle<()> {
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
thread::Builder::new()
|
||||
.name("compute-monitor".into())
|
||||
.spawn(move || watch_compute_activity(&state, ttid))
|
||||
.spawn(move || watch_compute_activity(&state))
|
||||
.expect("cannot launch compute monitor thread")
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-channel.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
|
||||
@@ -86,15 +86,18 @@
|
||||
//! [`RequestContext`] argument. Functions in the middle of the call chain
|
||||
//! only need to pass it on.
|
||||
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone)]
|
||||
pub struct RequestContext {
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
access_stats_behavior: AccessStatsBehavior,
|
||||
page_content_kind: PageContentKind,
|
||||
page_cache_permit: Option<Arc<crate::page_cache::PinnedSlotsPermit>>,
|
||||
}
|
||||
|
||||
/// The kind of access to the page cache.
|
||||
@@ -150,6 +153,7 @@ impl RequestContextBuilder {
|
||||
download_behavior: DownloadBehavior::Download,
|
||||
access_stats_behavior: AccessStatsBehavior::Update,
|
||||
page_content_kind: PageContentKind::Unknown,
|
||||
page_cache_permit: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -163,6 +167,7 @@ impl RequestContextBuilder {
|
||||
download_behavior: original.download_behavior,
|
||||
access_stats_behavior: original.access_stats_behavior,
|
||||
page_content_kind: original.page_content_kind,
|
||||
page_cache_permit: original.page_cache_permit.clone(),
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -186,6 +191,11 @@ impl RequestContextBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
pub(crate) fn page_cache_permit(mut self, p: Arc<crate::page_cache::PinnedSlotsPermit>) -> Self {
|
||||
self.inner.page_cache_permit = Some(p);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> RequestContext {
|
||||
self.inner
|
||||
}
|
||||
@@ -286,4 +296,8 @@ impl RequestContext {
|
||||
pub(crate) fn page_content_kind(&self) -> PageContentKind {
|
||||
self.page_content_kind
|
||||
}
|
||||
|
||||
pub(crate) fn permit(&self) -> Option<&crate::page_cache::PinnedSlotsPermit> {
|
||||
self.page_cache_permit.as_ref().map(|p| &**p)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,8 +133,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
node_id: self.node_id,
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-re-attach");
|
||||
|
||||
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants",
|
||||
@@ -170,8 +168,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
.collect(),
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-validate");
|
||||
|
||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
|
||||
Ok(response
|
||||
|
||||
@@ -220,8 +220,6 @@ where
|
||||
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
|
||||
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
|
||||
mutated = true;
|
||||
} else {
|
||||
metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
|
||||
}
|
||||
this_list_valid
|
||||
});
|
||||
|
||||
@@ -314,7 +314,6 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum PageCacheErrorKind {
|
||||
AcquirePinnedSlotTimeout,
|
||||
EvictIterLimit,
|
||||
}
|
||||
|
||||
pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
|
||||
@@ -967,7 +966,6 @@ pub(crate) struct DeletionQueueMetrics {
|
||||
pub(crate) keys_submitted: IntCounter,
|
||||
pub(crate) keys_dropped: IntCounter,
|
||||
pub(crate) keys_executed: IntCounter,
|
||||
pub(crate) keys_validated: IntCounter,
|
||||
pub(crate) dropped_lsn_updates: IntCounter,
|
||||
pub(crate) unexpected_errors: IntCounter,
|
||||
pub(crate) remote_errors: IntCounterVec,
|
||||
@@ -989,13 +987,7 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
||||
|
||||
keys_executed: register_int_counter!(
|
||||
"pageserver_deletion_queue_executed_total",
|
||||
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
keys_validated: register_int_counter!(
|
||||
"pageserver_deletion_queue_validated_total",
|
||||
"Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
|
||||
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
|
||||
@@ -66,8 +66,7 @@
|
||||
//! inserted to the mapping, but you must hold the write-lock on the slot until
|
||||
//! the contents are valid. If you need to release the lock without initializing
|
||||
//! the contents, you must remove the mapping first. We make that easy for the
|
||||
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
|
||||
//! page, the caller must explicitly call guard.mark_valid() after it has
|
||||
//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
|
||||
//! initialized it. If the guard is dropped without calling mark_valid(), the
|
||||
//! mapping is automatically removed and the slot is marked free.
|
||||
//!
|
||||
@@ -79,6 +78,7 @@ use std::{
|
||||
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
|
||||
Arc, Weak,
|
||||
},
|
||||
task::Poll,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
@@ -215,16 +215,21 @@ impl Slot {
|
||||
|
||||
impl SlotInner {
|
||||
/// If there is aready a reader, drop our permit and share its permit, just like we share read access.
|
||||
fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
|
||||
let mut guard = self.permit.lock().unwrap();
|
||||
if let Some(existing_permit) = guard.upgrade() {
|
||||
drop(guard);
|
||||
drop(permit);
|
||||
existing_permit
|
||||
} else {
|
||||
let permit = Arc::new(permit);
|
||||
*guard = Arc::downgrade(&permit);
|
||||
permit
|
||||
fn coalesce_readers_permit<'c>(&self, permit: PermitKind<'c>) -> PermitKindReadGuard<'c> {
|
||||
match permit {
|
||||
PermitKind::CtxProvided(permit) => PermitKindReadGuard::CtxProvided(permit),
|
||||
PermitKind::Acquired(permit) => {
|
||||
let mut guard = self.permit.lock().unwrap();
|
||||
if let Some(existing_permit) = guard.upgrade() {
|
||||
drop(guard);
|
||||
drop(permit);
|
||||
existing_permit
|
||||
} else {
|
||||
let permit = Arc::new(permit);
|
||||
*guard = Arc::downgrade(&permit);
|
||||
permit
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -252,21 +257,36 @@ pub struct PageCache {
|
||||
/// This is interpreted modulo the page cache size.
|
||||
next_evict_slot: AtomicUsize,
|
||||
|
||||
find_victim_sender:
|
||||
async_channel::Sender<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
|
||||
find_victim_waiters:
|
||||
async_channel::Receiver<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
|
||||
|
||||
size_metrics: &'static PageCacheSizeMetrics,
|
||||
}
|
||||
|
||||
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
|
||||
pub(crate) struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
|
||||
|
||||
enum PermitKind<'c> {
|
||||
CtxProvided(&'c PinnedSlotsPermit),
|
||||
Acquired(PinnedSlotsPermit),
|
||||
}
|
||||
|
||||
enum PermitKindReadGuard<'c> {
|
||||
CtxProvided(&'c PinnedSlotsPermit),
|
||||
Coalesced(Arc<PinnedSlotsPermit>),
|
||||
}
|
||||
|
||||
///
|
||||
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
|
||||
/// until the guard is dropped.
|
||||
///
|
||||
pub struct PageReadGuard<'i> {
|
||||
_permit: Arc<PinnedSlotsPermit>,
|
||||
pub struct PageReadGuard<'c, 'i> {
|
||||
_permit: PermitKindReadGuard<'c>,
|
||||
slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for PageReadGuard<'_> {
|
||||
impl std::ops::Deref for PageReadGuard<'_, '_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
@@ -274,7 +294,7 @@ impl std::ops::Deref for PageReadGuard<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
|
||||
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_, '_> {
|
||||
fn as_ref(&self) -> &[u8; PAGE_SZ] {
|
||||
self.slot_guard.buf
|
||||
}
|
||||
@@ -286,78 +306,89 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
|
||||
///
|
||||
/// Counterintuitively, this is used even for a read, if the requested page is not
|
||||
/// currently found in the page cache. In that case, the caller of lock_for_read()
|
||||
/// is expected to fill in the page contents and call mark_valid(). Similarly
|
||||
/// lock_for_write() can return an invalid buffer that the caller is expected to
|
||||
/// to initialize.
|
||||
///
|
||||
pub struct PageWriteGuard<'i> {
|
||||
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
|
||||
|
||||
_permit: PinnedSlotsPermit,
|
||||
|
||||
// Are the page contents currently valid?
|
||||
// Used to mark pages as invalid that are assigned but not yet filled with data.
|
||||
valid: bool,
|
||||
/// is expected to fill in the page contents and call mark_valid().
|
||||
pub struct PageWriteGuard<'c, 'i> {
|
||||
state: PageWriteGuardState<'c, 'i>,
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for PageWriteGuard<'_> {
|
||||
enum PageWriteGuardState<'c, 'i> {
|
||||
Invalid {
|
||||
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
|
||||
_permit: PermitKindReadGuard<'c>,
|
||||
},
|
||||
Downgraded,
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for PageWriteGuard<'_, '_> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.inner.buf
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for PageWriteGuard<'_> {
|
||||
impl std::ops::Deref for PageWriteGuard<'_, '_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.inner.buf
|
||||
match &self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
|
||||
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_, '_> {
|
||||
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
|
||||
self.inner.buf
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => todo!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageWriteGuard<'_> {
|
||||
impl<'c, 'a> PageWriteGuard<'c, 'a> {
|
||||
/// Mark that the buffer contents are now valid.
|
||||
pub fn mark_valid(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
assert!(
|
||||
!self.valid,
|
||||
"mark_valid called on a buffer that was already valid"
|
||||
);
|
||||
self.valid = true;
|
||||
#[must_use]
|
||||
pub fn mark_valid(mut self) -> PageReadGuard<'c, 'a> {
|
||||
let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
|
||||
match prev {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => {
|
||||
assert!(inner.key.is_some());
|
||||
PageReadGuard {
|
||||
_permit,
|
||||
slot_guard: inner.downgrade(),
|
||||
}
|
||||
}
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageWriteGuard<'_> {
|
||||
impl Drop for PageWriteGuard<'_, '_> {
|
||||
///
|
||||
/// If the buffer was allocated for a page that was not already in the
|
||||
/// cache, but the lock_for_read/write() caller dropped the buffer without
|
||||
/// initializing it, remove the mapping from the page cache.
|
||||
///
|
||||
fn drop(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
if !self.valid {
|
||||
let self_key = self.inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
self.inner.key = None;
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => {
|
||||
assert!(inner.key.is_some());
|
||||
let self_key = inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
inner.key = None;
|
||||
}
|
||||
PageWriteGuardState::Downgraded => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// lock_for_read() return value
|
||||
pub enum ReadBufResult<'a> {
|
||||
Found(PageReadGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
/// lock_for_write() return value
|
||||
pub enum WriteBufResult<'a> {
|
||||
Found(PageWriteGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
pub enum ReadBufResult<'c, 'a> {
|
||||
Found(PageReadGuard<'c, 'a>),
|
||||
NotFound(PageWriteGuard<'c, 'a>),
|
||||
}
|
||||
|
||||
impl PageCache {
|
||||
@@ -379,10 +410,9 @@ impl PageCache {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Option<(Lsn, PageReadGuard)> {
|
||||
let Ok(permit) = self.try_get_pinned_slot_permit().await else {
|
||||
let Ok(permit) = self.try_get_pinned_slot_permit(ctx).await else {
|
||||
return None;
|
||||
};
|
||||
|
||||
crate::metrics::PAGE_CACHE
|
||||
.for_ctx(ctx)
|
||||
.read_accesses_materialized_page
|
||||
@@ -430,12 +460,13 @@ impl PageCache {
|
||||
/// Store an image of the given page in the cache.
|
||||
///
|
||||
pub async fn memorize_materialized_page(
|
||||
&self,
|
||||
&'static self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
img: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
@@ -446,30 +477,87 @@ impl PageCache {
|
||||
lsn,
|
||||
};
|
||||
|
||||
match self.lock_for_write(&cache_key).await? {
|
||||
WriteBufResult::Found(write_guard) => {
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(*write_guard == img);
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit(ctx).await?);
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().await;
|
||||
if inner.key.as_ref() == Some(&cache_key) {
|
||||
slot.inc_usage_count();
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
debug_assert_eq!(inner.buf.len(), img.len());
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(inner.buf == img);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
WriteBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(img);
|
||||
write_guard.mark_valid();
|
||||
}
|
||||
}
|
||||
debug_assert!(permit.is_some());
|
||||
|
||||
Ok(())
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self
|
||||
.find_victim(permit.as_ref().unwrap())
|
||||
.await
|
||||
.context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
// Create a write guard for the slot so we go through the expected motions.
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
let mut write_guard = PageWriteGuard {
|
||||
state: PageWriteGuardState::Invalid {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
},
|
||||
};
|
||||
write_guard.copy_from_slice(img);
|
||||
let _ = write_guard.mark_valid();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with immutable file pages.
|
||||
|
||||
pub async fn read_immutable_buf(
|
||||
&self,
|
||||
pub async fn read_immutable_buf<'c>(
|
||||
&'static self,
|
||||
file_id: FileId,
|
||||
blkno: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
ctx: &'c RequestContext,
|
||||
) -> anyhow::Result<ReadBufResult<'c, 'static>> {
|
||||
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key, ctx).await
|
||||
@@ -483,7 +571,22 @@ impl PageCache {
|
||||
// "mappings" after this section. But the routines in this section should
|
||||
// not require changes.
|
||||
|
||||
async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
|
||||
pub(crate) async fn get_permit(&self) -> Arc<PinnedSlotsPermit> {
|
||||
Arc::new(PinnedSlotsPermit(
|
||||
Arc::clone(&self.pinned_slots)
|
||||
.acquire_owned()
|
||||
.await
|
||||
.expect("the semaphore is never closed"),
|
||||
))
|
||||
}
|
||||
|
||||
async fn try_get_pinned_slot_permit<'c>(
|
||||
&self,
|
||||
ctx: &'c RequestContext,
|
||||
) -> anyhow::Result<PermitKind<'c>> {
|
||||
if let Some(permit) = ctx.permit() {
|
||||
return Ok(PermitKind::CtxProvided(permit));
|
||||
};
|
||||
let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
|
||||
match tokio::time::timeout(
|
||||
// Choose small timeout, neon_smgr does its own retries.
|
||||
@@ -493,9 +596,9 @@ impl PageCache {
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(res) => Ok(PinnedSlotsPermit(
|
||||
Ok(res) => Ok(PermitKind::Acquired(PinnedSlotsPermit(
|
||||
res.expect("this semaphore is never closed"),
|
||||
)),
|
||||
))),
|
||||
Err(_timeout) => {
|
||||
timer.stop_and_discard();
|
||||
crate::metrics::page_cache_errors_inc(
|
||||
@@ -515,10 +618,10 @@ impl PageCache {
|
||||
///
|
||||
/// If no page is found, returns None and *cache_key is left unmodified.
|
||||
///
|
||||
async fn try_lock_for_read(
|
||||
async fn try_lock_for_read<'c>(
|
||||
&self,
|
||||
cache_key: &mut CacheKey,
|
||||
permit: &mut Option<PinnedSlotsPermit>,
|
||||
permit: &mut Option<PermitKind<'c>>,
|
||||
) -> Option<PageReadGuard> {
|
||||
let cache_key_orig = cache_key.clone();
|
||||
if let Some(slot_idx) = self.search_mapping(cache_key) {
|
||||
@@ -571,11 +674,11 @@ impl PageCache {
|
||||
/// ```
|
||||
///
|
||||
async fn lock_for_read(
|
||||
&self,
|
||||
&'static self,
|
||||
cache_key: &mut CacheKey,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit(ctx).await?);
|
||||
|
||||
let (read_access, hit) = match cache_key {
|
||||
CacheKey::MaterializedPage { .. } => {
|
||||
@@ -638,99 +741,10 @@ impl PageCache {
|
||||
);
|
||||
|
||||
return Ok(ReadBufResult::NotFound(PageWriteGuard {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: false,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a page in the cache and lock it in write mode. If it's not
|
||||
/// found, returns None.
|
||||
///
|
||||
/// When locking a page for writing, the search criteria is always "exact".
|
||||
async fn try_lock_for_write(
|
||||
&self,
|
||||
cache_key: &CacheKey,
|
||||
permit: &mut Option<PinnedSlotsPermit>,
|
||||
) -> Option<PageWriteGuard> {
|
||||
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().await;
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
return Some(PageWriteGuard {
|
||||
state: PageWriteGuardState::Invalid {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a write-locked buffer for given block.
|
||||
///
|
||||
/// Similar to lock_for_read(), but the returned buffer is write-locked and
|
||||
/// may be modified by the caller even if it's already found in the cache.
|
||||
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
|
||||
debug_assert!(permit.is_none());
|
||||
return Ok(WriteBufResult::Found(write_guard));
|
||||
}
|
||||
debug_assert!(permit.is_some());
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self
|
||||
.find_victim(permit.as_ref().unwrap())
|
||||
.await
|
||||
.context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
|
||||
return Ok(WriteBufResult::NotFound(PageWriteGuard {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: false,
|
||||
}));
|
||||
}
|
||||
}
|
||||
@@ -775,7 +789,7 @@ impl PageCache {
|
||||
///
|
||||
/// Like 'search_mapping, but performs an "exact" search. Used for
|
||||
/// allocating a new buffer.
|
||||
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
|
||||
fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
|
||||
match key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
@@ -882,10 +896,12 @@ impl PageCache {
|
||||
///
|
||||
/// On return, the slot is empty and write-locked.
|
||||
async fn find_victim(
|
||||
&self,
|
||||
&'static self,
|
||||
_permit_witness: &PinnedSlotsPermit,
|
||||
) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
|
||||
let iter_limit = self.slots.len() * 10;
|
||||
// Get in line.
|
||||
let receiver = self.find_victim_waiters.recv();
|
||||
|
||||
let mut iters = 0;
|
||||
loop {
|
||||
iters += 1;
|
||||
@@ -897,41 +913,8 @@ impl PageCache {
|
||||
let mut inner = match slot.inner.try_write() {
|
||||
Ok(inner) => inner,
|
||||
Err(_err) => {
|
||||
if iters > iter_limit {
|
||||
// NB: Even with the permits, there's no hard guarantee that we will find a slot with
|
||||
// any particular number of iterations: other threads might race ahead and acquire and
|
||||
// release pins just as we're scanning the array.
|
||||
//
|
||||
// Imagine that nslots is 2, and as starting point, usage_count==1 on all
|
||||
// slots. There are two threads running concurrently, A and B. A has just
|
||||
// acquired the permit from the semaphore.
|
||||
//
|
||||
// A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
|
||||
// B: Acquire permit.
|
||||
// B: Look at slot 2, decrement its usage_count to zero and continue the search
|
||||
// B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
|
||||
// B: Release pin and permit again
|
||||
// B: Acquire permit.
|
||||
// B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
|
||||
// B: Release pin and permit again
|
||||
//
|
||||
// Now we're back in the starting situation that both slots have
|
||||
// usage_count 1, but A has now been through one iteration of the
|
||||
// find_victim() loop. This can repeat indefinitely and on each
|
||||
// iteration, A's iteration count increases by one.
|
||||
//
|
||||
// So, even though the semaphore for the permits is fair, the victim search
|
||||
// itself happens in parallel and is not fair.
|
||||
// Hence even with a permit, a task can theoretically be starved.
|
||||
// To avoid this, we'd need tokio to give priority to tasks that are holding
|
||||
// permits for longer.
|
||||
// Note that just yielding to tokio during iteration without such
|
||||
// priority boosting is likely counter-productive. We'd just give more opportunities
|
||||
// for B to bump usage count, further starving A.
|
||||
crate::metrics::page_cache_errors_inc(
|
||||
crate::metrics::PageCacheErrorKind::EvictIterLimit,
|
||||
);
|
||||
anyhow::bail!("exceeded evict iter limit");
|
||||
if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
|
||||
unreachable!("find_victim_waiters prevents starvation");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -942,7 +925,16 @@ impl PageCache {
|
||||
inner.key = None;
|
||||
}
|
||||
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
|
||||
return Ok((slot_idx, inner));
|
||||
self.find_victim_sender
|
||||
.try_send((slot_idx, inner))
|
||||
.expect("we always get in line first");
|
||||
match futures::poll!(receiver) {
|
||||
Poll::Ready(Ok(res)) => return Ok(res),
|
||||
Poll::Ready(Err(_closed)) => unreachable!("we never close"),
|
||||
Poll::Pending => {
|
||||
unreachable!("we just sent to the channel and got in line earlier")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -979,6 +971,7 @@ impl PageCache {
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (find_victim_sender, find_victim_waiters) = async_channel::bounded(num_pages);
|
||||
Self {
|
||||
materialized_page_map: Default::default(),
|
||||
immutable_page_map: Default::default(),
|
||||
@@ -986,6 +979,8 @@ impl PageCache {
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
size_metrics,
|
||||
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
|
||||
find_victim_sender,
|
||||
find_victim_waiters,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,10 +20,10 @@ use std::io::{Error, ErrorKind};
|
||||
|
||||
impl<'a> BlockCursor<'a> {
|
||||
/// Read a blob into a new buffer.
|
||||
pub async fn read_blob(
|
||||
pub async fn read_blob<'c>(
|
||||
&self,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_blob_into_buf(offset, &mut buf, ctx).await?;
|
||||
@@ -31,11 +31,11 @@ impl<'a> BlockCursor<'a> {
|
||||
}
|
||||
/// Read blob into the given buffer. Any previous contents in the buffer
|
||||
/// are overwritten.
|
||||
pub async fn read_blob_into_buf(
|
||||
pub async fn read_blob_into_buf<'c>(
|
||||
&self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut blknum = (offset / PAGE_SZ as u64) as u32;
|
||||
let mut off = (offset % PAGE_SZ as u64) as usize;
|
||||
|
||||
@@ -34,27 +34,27 @@ where
|
||||
}
|
||||
|
||||
/// Reference to an in-memory copy of an immutable on-disk block.
|
||||
pub enum BlockLease<'a> {
|
||||
PageReadGuard(PageReadGuard<'static>),
|
||||
pub enum BlockLease<'c, 'a> {
|
||||
PageReadGuard(PageReadGuard<'c, 'static>),
|
||||
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
|
||||
#[cfg(test)]
|
||||
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
|
||||
}
|
||||
|
||||
impl From<PageReadGuard<'static>> for BlockLease<'static> {
|
||||
fn from(value: PageReadGuard<'static>) -> BlockLease<'static> {
|
||||
impl<'c, 'a> From<PageReadGuard<'c, 'a>> for BlockLease<'c, 'a> {
|
||||
fn from(value: PageReadGuard<'c, 'a>) -> BlockLease<'c, 'a> {
|
||||
BlockLease::PageReadGuard(value)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
|
||||
impl<'c, 'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'c, 'a> {
|
||||
fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
|
||||
BlockLease::Arc(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for BlockLease<'a> {
|
||||
impl<'c, 'a> Deref for BlockLease<'c, 'a> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
@@ -83,11 +83,11 @@ pub(crate) enum BlockReaderRef<'a> {
|
||||
|
||||
impl<'a> BlockReaderRef<'a> {
|
||||
#[inline(always)]
|
||||
async fn read_blk(
|
||||
async fn read_blk<'c>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
ctx: &'c RequestContext,
|
||||
) -> Result<BlockLease<'c, '_>, std::io::Error> {
|
||||
use BlockReaderRef::*;
|
||||
match self {
|
||||
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
|
||||
@@ -141,11 +141,11 @@ impl<'a> BlockCursor<'a> {
|
||||
/// access to the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
#[inline(always)]
|
||||
pub async fn read_blk(
|
||||
pub async fn read_blk<'c>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
ctx: &'c RequestContext,
|
||||
) -> Result<BlockLease<'c, '_>, std::io::Error> {
|
||||
self.reader.read_blk(blknum, ctx).await
|
||||
}
|
||||
}
|
||||
@@ -180,32 +180,27 @@ impl FileBlockReader {
|
||||
/// Returns a "lease" object that can be used to
|
||||
/// access to the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
pub async fn read_blk(
|
||||
pub async fn read_blk<'c>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
ctx: &'c RequestContext,
|
||||
) -> Result<BlockLease<'c, 'static>, std::io::Error> {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => break Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
Ok(write_guard.mark_valid().into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,44 +64,40 @@ impl EphemeralFile {
|
||||
self.len
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
pub(crate) async fn read_blk<'c>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
ctx: &'c RequestContext,
|
||||
) -> Result<BlockLease<'c, '_>, io::Error> {
|
||||
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
|
||||
if flushed_blknums.contains(&(blknum as u64)) {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum, self.file.path, e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum, self.file.path, e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
return Ok(BlockLease::PageReadGuard(read_guard));
|
||||
}
|
||||
};
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
@@ -171,7 +167,7 @@ impl EphemeralFile {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
|
||||
write_guard.mark_valid();
|
||||
let _ = write_guard.mark_valid();
|
||||
// pre-warm successful
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -549,7 +549,7 @@ impl DeltaLayer {
|
||||
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
|
||||
pub(crate) async fn load_keys<'c>(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.await
|
||||
@@ -1038,9 +1038,9 @@ pub struct ValueRef<'a> {
|
||||
reader: BlockCursor<'a>,
|
||||
}
|
||||
|
||||
impl<'a> ValueRef<'a> {
|
||||
impl<'c, 'a> ValueRef<'a> {
|
||||
/// Loads the value from disk
|
||||
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
|
||||
pub async fn load(&self, ctx: &'c RequestContext) -> Result<Value> {
|
||||
// theoretically we *could* record an access time for each, but it does not really matter
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
@@ -1051,11 +1051,11 @@ impl<'a> ValueRef<'a> {
|
||||
pub(crate) struct Adapter<T>(T);
|
||||
|
||||
impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
|
||||
pub(crate) async fn read_blk(
|
||||
pub(crate) async fn read_blk<'c>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
ctx: &'c RequestContext,
|
||||
) -> Result<BlockLease<'c, '_>, std::io::Error> {
|
||||
self.0.as_ref().file.read_blk(blknum, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,7 +158,7 @@ pub struct Timeline {
|
||||
|
||||
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
|
||||
/// Never changes for the lifetime of this [`Timeline`] object.
|
||||
///
|
||||
///
|
||||
/// This duplicates the generation stored in LocationConf, but that structure is mutable:
|
||||
/// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
|
||||
generation: Generation,
|
||||
@@ -505,7 +505,7 @@ impl Timeline {
|
||||
timer.stop_and_record();
|
||||
|
||||
let start = Instant::now();
|
||||
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
|
||||
let res = self.reconstruct_value(key, lsn, reconstruct_state, ctx).await;
|
||||
let elapsed = start.elapsed();
|
||||
crate::metrics::RECONSTRUCT_TIME
|
||||
.for_result(&res)
|
||||
@@ -4279,6 +4279,7 @@ impl Timeline {
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
mut data: ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
// Perform WAL redo if needed
|
||||
data.records.reverse();
|
||||
@@ -4342,6 +4343,7 @@ impl Timeline {
|
||||
key,
|
||||
last_rec_lsn,
|
||||
&img,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("Materialized page memoization failed")
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
sync::{Arc, Mutex},
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
@@ -25,7 +25,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
context::{DownloadBehavior, RequestContext, RequestContextBuilder},
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
@@ -397,9 +397,14 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let permit = crate::page_cache::get().get_permit().await;
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_cache_permit(permit)
|
||||
.build();
|
||||
|
||||
// imitiate repartiting on first compactation
|
||||
if let Err(e) = self
|
||||
.collect_keyspace(lsn, ctx)
|
||||
.collect_keyspace(lsn, &ctx)
|
||||
.instrument(info_span!("collect_keyspace"))
|
||||
.await
|
||||
{
|
||||
|
||||
@@ -544,7 +544,7 @@ impl VirtualFile {
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
|
||||
) -> Result<crate::tenant::block_io::BlockLease<'_, '_>, std::io::Error> {
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
let mut buf = [0; PAGE_SZ];
|
||||
self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
|
||||
|
||||
@@ -7,12 +7,12 @@ OBJS = \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
libpagestore.o \
|
||||
libpqwalproposer.o \
|
||||
neon.o \
|
||||
neon_utils.o \
|
||||
pagestore_smgr.o \
|
||||
relsize_cache.o \
|
||||
walproposer.o \
|
||||
walproposer_pg.o \
|
||||
walproposer_utils.o \
|
||||
control_plane_connector.o
|
||||
|
||||
PG_CPPFLAGS = -I$(libpq_srcdir)
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
#include "neon_utils.h"
|
||||
#include "walproposer_utils.h"
|
||||
|
||||
#define PageStoreTrace DEBUG5
|
||||
|
||||
|
||||
424
pgxn/neon/libpqwalproposer.c
Normal file
424
pgxn/neon/libpqwalproposer.c
Normal file
@@ -0,0 +1,424 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
|
||||
struct WalProposerConn
|
||||
{
|
||||
PGconn *pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received data from
|
||||
* walprop_async_read */
|
||||
};
|
||||
|
||||
/* Helper function */
|
||||
static bool
|
||||
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
{
|
||||
/* If we're already correctly blocking or nonblocking, all good */
|
||||
if (is_nonblocking == conn->is_nonblocking)
|
||||
return true;
|
||||
|
||||
/* Otherwise, set it appropriately */
|
||||
if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1)
|
||||
return false;
|
||||
|
||||
conn->is_nonblocking = is_nonblocking;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Exported function definitions */
|
||||
char *
|
||||
walprop_error_message(WalProposerConn *conn)
|
||||
{
|
||||
return PQerrorMessage(conn->pg_conn);
|
||||
}
|
||||
|
||||
WalProposerConnStatusType
|
||||
walprop_status(WalProposerConn *conn)
|
||||
{
|
||||
switch (PQstatus(conn->pg_conn))
|
||||
{
|
||||
case CONNECTION_OK:
|
||||
return WP_CONNECTION_OK;
|
||||
case CONNECTION_BAD:
|
||||
return WP_CONNECTION_BAD;
|
||||
default:
|
||||
return WP_CONNECTION_IN_PROGRESS;
|
||||
}
|
||||
}
|
||||
|
||||
WalProposerConn *
|
||||
walprop_connect_start(char *conninfo, char *password)
|
||||
{
|
||||
WalProposerConn *conn;
|
||||
PGconn *pg_conn;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
|
||||
/*
|
||||
* Connect using the given connection string. If the
|
||||
* NEON_AUTH_TOKEN environment variable was set, use that as
|
||||
* the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so
|
||||
* when we set the password before the connection string, the
|
||||
* connection string can override the password from the env variable.
|
||||
* Seems useful, although we don't currently use that capability
|
||||
* anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (password)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = password;
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = conninfo;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
pg_conn = PQconnectStartParams(keywords, values, 1);
|
||||
|
||||
/*
|
||||
* Allocation of a PQconn can fail, and will return NULL. We want to fully
|
||||
* replicate the behavior of PQconnectStart here.
|
||||
*/
|
||||
if (!pg_conn)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* And in theory this allocation can fail as well, but it's incredibly
|
||||
* unlikely if we just successfully allocated a PGconn.
|
||||
*
|
||||
* palloc will exit on failure though, so there's not much we could do if
|
||||
* it *did* fail.
|
||||
*/
|
||||
conn = palloc(sizeof(WalProposerConn));
|
||||
conn->pg_conn = pg_conn;
|
||||
conn->is_nonblocking = false; /* connections always start in blocking
|
||||
* mode */
|
||||
conn->recvbuf = NULL;
|
||||
return conn;
|
||||
}
|
||||
|
||||
WalProposerConnectPollStatusType
|
||||
walprop_connect_poll(WalProposerConn *conn)
|
||||
{
|
||||
WalProposerConnectPollStatusType return_val;
|
||||
|
||||
switch (PQconnectPoll(conn->pg_conn))
|
||||
{
|
||||
case PGRES_POLLING_FAILED:
|
||||
return_val = WP_CONN_POLLING_FAILED;
|
||||
break;
|
||||
case PGRES_POLLING_READING:
|
||||
return_val = WP_CONN_POLLING_READING;
|
||||
break;
|
||||
case PGRES_POLLING_WRITING:
|
||||
return_val = WP_CONN_POLLING_WRITING;
|
||||
break;
|
||||
case PGRES_POLLING_OK:
|
||||
return_val = WP_CONN_POLLING_OK;
|
||||
break;
|
||||
|
||||
/*
|
||||
* There's a comment at its source about this constant being
|
||||
* unused. We'll expect it's never returned.
|
||||
*/
|
||||
case PGRES_POLLING_ACTIVE:
|
||||
elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
|
||||
|
||||
/*
|
||||
* This return is never actually reached, but it's here to make
|
||||
* the compiler happy
|
||||
*/
|
||||
return WP_CONN_POLLING_FAILED;
|
||||
|
||||
default:
|
||||
Assert(false);
|
||||
return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */
|
||||
}
|
||||
|
||||
return return_val;
|
||||
}
|
||||
|
||||
bool
|
||||
walprop_send_query(WalProposerConn *conn, char *query)
|
||||
{
|
||||
/*
|
||||
* We need to be in blocking mode for sending the query to run without
|
||||
* requiring a call to PQflush
|
||||
*/
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
return false;
|
||||
|
||||
/* PQsendQuery returns 1 on success, 0 on failure */
|
||||
if (!PQsendQuery(conn->pg_conn, query))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
WalProposerExecStatusType
|
||||
walprop_get_query_result(WalProposerConn *conn)
|
||||
{
|
||||
PGresult *result;
|
||||
WalProposerExecStatusType return_val;
|
||||
|
||||
/* Marker variable if we need to log an unexpected success result */
|
||||
char *unexpected_success = NULL;
|
||||
|
||||
/* Consume any input that we might be missing */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
return WP_EXEC_FAILED;
|
||||
|
||||
if (PQisBusy(conn->pg_conn))
|
||||
return WP_EXEC_NEEDS_INPUT;
|
||||
|
||||
|
||||
result = PQgetResult(conn->pg_conn);
|
||||
|
||||
/*
|
||||
* PQgetResult returns NULL only if getting the result was successful &
|
||||
* there's no more of the result to get.
|
||||
*/
|
||||
if (!result)
|
||||
{
|
||||
elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
|
||||
return WP_EXEC_UNEXPECTED_SUCCESS;
|
||||
}
|
||||
|
||||
/* Helper macro to reduce boilerplate */
|
||||
#define UNEXPECTED_SUCCESS(msg) \
|
||||
return_val = WP_EXEC_UNEXPECTED_SUCCESS; \
|
||||
unexpected_success = msg; \
|
||||
break;
|
||||
|
||||
|
||||
switch (PQresultStatus(result))
|
||||
{
|
||||
/* "true" success case */
|
||||
case PGRES_COPY_BOTH:
|
||||
return_val = WP_EXEC_SUCCESS_COPYBOTH;
|
||||
break;
|
||||
|
||||
/* Unexpected success case */
|
||||
case PGRES_EMPTY_QUERY:
|
||||
UNEXPECTED_SUCCESS("empty query return");
|
||||
case PGRES_COMMAND_OK:
|
||||
UNEXPECTED_SUCCESS("data-less command end");
|
||||
case PGRES_TUPLES_OK:
|
||||
UNEXPECTED_SUCCESS("tuples return");
|
||||
case PGRES_COPY_OUT:
|
||||
UNEXPECTED_SUCCESS("'Copy Out' response");
|
||||
case PGRES_COPY_IN:
|
||||
UNEXPECTED_SUCCESS("'Copy In' response");
|
||||
case PGRES_SINGLE_TUPLE:
|
||||
UNEXPECTED_SUCCESS("single tuple return");
|
||||
case PGRES_PIPELINE_SYNC:
|
||||
UNEXPECTED_SUCCESS("pipeline sync point");
|
||||
|
||||
/* Failure cases */
|
||||
case PGRES_BAD_RESPONSE:
|
||||
case PGRES_NONFATAL_ERROR:
|
||||
case PGRES_FATAL_ERROR:
|
||||
case PGRES_PIPELINE_ABORTED:
|
||||
return_val = WP_EXEC_FAILED;
|
||||
break;
|
||||
|
||||
default:
|
||||
Assert(false);
|
||||
return_val = WP_EXEC_FAILED; /* keep the compiler quiet */
|
||||
}
|
||||
|
||||
if (unexpected_success)
|
||||
elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
|
||||
|
||||
return return_val;
|
||||
}
|
||||
|
||||
pgsocket
|
||||
walprop_socket(WalProposerConn *conn)
|
||||
{
|
||||
return PQsocket(conn->pg_conn);
|
||||
}
|
||||
|
||||
int
|
||||
walprop_flush(WalProposerConn *conn)
|
||||
{
|
||||
return (PQflush(conn->pg_conn));
|
||||
}
|
||||
|
||||
void
|
||||
walprop_finish(WalProposerConn *conn)
|
||||
{
|
||||
if (conn->recvbuf != NULL)
|
||||
PQfreemem(conn->recvbuf);
|
||||
PQfinish(conn->pg_conn);
|
||||
pfree(conn);
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
PGAsyncReadResult
|
||||
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
{
|
||||
int result;
|
||||
|
||||
if (conn->recvbuf != NULL)
|
||||
{
|
||||
PQfreemem(conn->recvbuf);
|
||||
conn->recvbuf = NULL;
|
||||
}
|
||||
|
||||
/* Call PQconsumeInput so that we have the data we need */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
{
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
}
|
||||
|
||||
/*
|
||||
* The docs for PQgetCopyData list the return values as: 0 if the copy is
|
||||
* still in progress, but no "complete row" is available -1 if the copy is
|
||||
* done -2 if an error occurred (> 0) if it was successful; that value is
|
||||
* the amount transferred.
|
||||
*
|
||||
* The protocol we use between walproposer and safekeeper means that we
|
||||
* *usually* wouldn't expect to see that the copy is done, but this can
|
||||
* sometimes be triggered by the server returning an ErrorResponse (which
|
||||
* also happens to have the effect that the copy is done).
|
||||
*/
|
||||
switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
|
||||
{
|
||||
case 0:
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_TRY_AGAIN;
|
||||
case -1:
|
||||
{
|
||||
/*
|
||||
* If we get -1, it's probably because of a server error; the
|
||||
* safekeeper won't normally send a CopyDone message.
|
||||
*
|
||||
* We can check PQgetResult to make sure that the server
|
||||
* failed; it'll always result in PGRES_FATAL_ERROR
|
||||
*/
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
|
||||
|
||||
if (status != PGRES_FATAL_ERROR)
|
||||
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
||||
|
||||
/*
|
||||
* If there was actually an error, it'll be properly reported
|
||||
* by calls to PQerrorMessage -- we don't have to do anything
|
||||
* else
|
||||
*/
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
}
|
||||
case -2:
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
default:
|
||||
/* Positive values indicate the size of the returned result */
|
||||
*amount = result;
|
||||
*buf = conn->recvbuf;
|
||||
return PG_ASYNC_READ_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
PGAsyncWriteResult
|
||||
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* If we aren't in non-blocking mode, switch to it. */
|
||||
if (!ensure_nonblocking_status(conn, true))
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
|
||||
/*
|
||||
* The docs for PQputcopyData list the return values as: 1 if the data was
|
||||
* queued, 0 if it was not queued because of full buffers, or -1 if an
|
||||
* error occurred
|
||||
*/
|
||||
result = PQputCopyData(conn->pg_conn, buf, size);
|
||||
|
||||
/*
|
||||
* We won't get a result of zero because walproposer always empties the
|
||||
* connection's buffers before sending more
|
||||
*/
|
||||
Assert(result != 0);
|
||||
|
||||
switch (result)
|
||||
{
|
||||
case 1:
|
||||
/* good -- continue */
|
||||
break;
|
||||
case -1:
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
default:
|
||||
elog(FATAL, "invalid return %d from PQputCopyData", result);
|
||||
}
|
||||
|
||||
/*
|
||||
* After queueing the data, we still need to flush to get it to send. This
|
||||
* might take multiple tries, but we don't want to wait around until it's
|
||||
* done.
|
||||
*
|
||||
* PQflush has the following returns (directly quoting the docs): 0 if
|
||||
* sucessful, 1 if it was unable to send all the data in the send queue
|
||||
* yet -1 if it failed for some reason
|
||||
*/
|
||||
switch (result = PQflush(conn->pg_conn))
|
||||
{
|
||||
case 0:
|
||||
return PG_ASYNC_WRITE_SUCCESS;
|
||||
case 1:
|
||||
return PG_ASYNC_WRITE_TRY_FLUSH;
|
||||
case -1:
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
default:
|
||||
elog(FATAL, "invalid return %d from PQflush", result);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is very similar to walprop_async_write. For more
|
||||
* information, refer to the comments there.
|
||||
*/
|
||||
bool
|
||||
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* If we are in non-blocking mode, switch out of it. */
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
return false;
|
||||
|
||||
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
|
||||
return false;
|
||||
|
||||
Assert(result == 1);
|
||||
|
||||
/* Because the connection is non-blocking, flushing returns 0 or -1 */
|
||||
|
||||
if ((result = PQflush(conn->pg_conn)) == -1)
|
||||
return false;
|
||||
|
||||
Assert(result == 0);
|
||||
return true;
|
||||
}
|
||||
@@ -18,10 +18,6 @@ extern char *neon_auth_token;
|
||||
extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
|
||||
extern char *wal_acceptors_list;
|
||||
extern int wal_acceptor_reconnect_timeout;
|
||||
extern int wal_acceptor_connection_timeout;
|
||||
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
@@ -34,10 +30,4 @@ extern void pg_init_extension_server(void);
|
||||
extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
|
||||
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
|
||||
|
||||
extern uint64 BackpressureThrottlingTime(void);
|
||||
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
|
||||
|
||||
extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
|
||||
extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
|
||||
|
||||
#endif /* NEON_H */
|
||||
|
||||
@@ -1,116 +0,0 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/timeline.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "common/logging.h"
|
||||
#include "common/ip.h"
|
||||
#include "funcapi.h"
|
||||
#include "libpq/libpq.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "miscadmin.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "replication/slot.h"
|
||||
#include "replication/walsender_private.h"
|
||||
|
||||
#include "storage/ipc.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/ps_status.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
#include <netinet/tcp.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
#include "access/xlogutils.h"
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
#include "utils/guc.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Convert a character which represents a hexadecimal digit to an integer.
|
||||
*
|
||||
* Returns -1 if the character is not a hexadecimal digit.
|
||||
*/
|
||||
int
|
||||
HexDecodeChar(char c)
|
||||
{
|
||||
if (c >= '0' && c <= '9')
|
||||
return c - '0';
|
||||
if (c >= 'a' && c <= 'f')
|
||||
return c - 'a' + 10;
|
||||
if (c >= 'A' && c <= 'F')
|
||||
return c - 'A' + 10;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decode a hex string into a byte string, 2 hex chars per byte.
|
||||
*
|
||||
* Returns false if invalid characters are encountered; otherwise true.
|
||||
*/
|
||||
bool
|
||||
HexDecodeString(uint8 *result, char *input, int nbytes)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nbytes; ++i)
|
||||
{
|
||||
int n1 = HexDecodeChar(input[i * 2]);
|
||||
int n2 = HexDecodeChar(input[i * 2 + 1]);
|
||||
|
||||
if (n1 < 0 || n2 < 0)
|
||||
return false;
|
||||
result[i] = n1 * 16 + n2;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order
|
||||
* --------------------------------
|
||||
*/
|
||||
uint32
|
||||
pq_getmsgint32_le(StringInfo msg)
|
||||
{
|
||||
uint32 n32;
|
||||
|
||||
pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
|
||||
|
||||
return n32;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order
|
||||
* --------------------------------
|
||||
*/
|
||||
uint64
|
||||
pq_getmsgint64_le(StringInfo msg)
|
||||
{
|
||||
uint64 n64;
|
||||
|
||||
pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
|
||||
|
||||
return n64;
|
||||
}
|
||||
|
||||
/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
|
||||
void
|
||||
pq_sendint32_le(StringInfo buf, uint32 i)
|
||||
{
|
||||
enlargeStringInfo(buf, sizeof(uint32));
|
||||
memcpy(buf->data + buf->len, &i, sizeof(uint32));
|
||||
buf->len += sizeof(uint32);
|
||||
}
|
||||
|
||||
/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
|
||||
void
|
||||
pq_sendint64_le(StringInfo buf, uint64 i)
|
||||
{
|
||||
enlargeStringInfo(buf, sizeof(uint64));
|
||||
memcpy(buf->data + buf->len, &i, sizeof(uint64));
|
||||
buf->len += sizeof(uint64);
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
#ifndef __NEON_UTILS_H__
|
||||
#define __NEON_UTILS_H__
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
bool HexDecodeString(uint8 *result, char *input, int nbytes);
|
||||
uint32 pq_getmsgint32_le(StringInfo msg);
|
||||
uint64 pq_getmsgint64_le(StringInfo msg);
|
||||
void pq_sendint32_le(StringInfo buf, uint32 i);
|
||||
void pq_sendint64_le(StringInfo buf, uint64 i);
|
||||
|
||||
#endif /* __NEON_UTILS_H__ */
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,8 +1,8 @@
|
||||
#ifndef __NEON_WALPROPOSER_H__
|
||||
#define __NEON_WALPROPOSER_H__
|
||||
|
||||
#include "postgres.h"
|
||||
#include "access/xlogdefs.h"
|
||||
#include "postgres.h"
|
||||
#include "port.h"
|
||||
#include "access/xlog_internal.h"
|
||||
#include "access/transam.h"
|
||||
@@ -16,15 +16,29 @@
|
||||
#define MAX_SAFEKEEPERS 32
|
||||
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL
|
||||
* message */
|
||||
#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */
|
||||
#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender*
|
||||
* message header */
|
||||
#define XLOG_HDR_END_POS (1 + 8) /* offset of end position in wal sender*
|
||||
* message header */
|
||||
|
||||
/*
|
||||
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
|
||||
* because all WL_* events are given flags equal to some (1 << i), starting from i = 0
|
||||
*/
|
||||
#define WL_NO_EVENTS 0
|
||||
|
||||
struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */
|
||||
extern char *wal_acceptors_list;
|
||||
extern int wal_acceptor_reconnect_timeout;
|
||||
extern int wal_acceptor_connection_timeout;
|
||||
extern bool am_wal_proposer;
|
||||
|
||||
struct WalProposerConn; /* Defined in libpqwalproposer */
|
||||
typedef struct WalProposerConn WalProposerConn;
|
||||
|
||||
struct WalMessage;
|
||||
typedef struct WalMessage WalMessage;
|
||||
|
||||
/* Possible return values from ReadPGAsync */
|
||||
typedef enum
|
||||
{
|
||||
@@ -38,7 +52,7 @@ typedef enum
|
||||
PG_ASYNC_READ_TRY_AGAIN,
|
||||
/* Reading failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_READ_FAIL,
|
||||
} PGAsyncReadResult;
|
||||
} PGAsyncReadResult;
|
||||
|
||||
/* Possible return values from WritePGAsync */
|
||||
typedef enum
|
||||
@@ -57,7 +71,7 @@ typedef enum
|
||||
PG_ASYNC_WRITE_TRY_FLUSH,
|
||||
/* Writing failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_WRITE_FAIL,
|
||||
} PGAsyncWriteResult;
|
||||
} PGAsyncWriteResult;
|
||||
|
||||
/*
|
||||
* WAL safekeeper state, which is used to wait for some event.
|
||||
@@ -133,7 +147,7 @@ typedef enum
|
||||
* to read.
|
||||
*/
|
||||
SS_ACTIVE,
|
||||
} SafekeeperState;
|
||||
} SafekeeperState;
|
||||
|
||||
/* Consensus logical timestamp. */
|
||||
typedef uint64 term_t;
|
||||
@@ -157,12 +171,12 @@ typedef struct ProposerGreeting
|
||||
uint8 tenant_id[16];
|
||||
TimeLineID timeline;
|
||||
uint32 walSegSize;
|
||||
} ProposerGreeting;
|
||||
} ProposerGreeting;
|
||||
|
||||
typedef struct AcceptorProposerMessage
|
||||
{
|
||||
uint64 tag;
|
||||
} AcceptorProposerMessage;
|
||||
} AcceptorProposerMessage;
|
||||
|
||||
/*
|
||||
* Acceptor -> Proposer initial response: the highest term acceptor voted for.
|
||||
@@ -172,7 +186,7 @@ typedef struct AcceptorGreeting
|
||||
AcceptorProposerMessage apm;
|
||||
term_t term;
|
||||
NNodeId nodeId;
|
||||
} AcceptorGreeting;
|
||||
} AcceptorGreeting;
|
||||
|
||||
/*
|
||||
* Proposer -> Acceptor vote request.
|
||||
@@ -182,20 +196,20 @@ typedef struct VoteRequest
|
||||
uint64 tag;
|
||||
term_t term;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
} VoteRequest;
|
||||
} VoteRequest;
|
||||
|
||||
/* Element of term switching chain. */
|
||||
typedef struct TermSwitchEntry
|
||||
{
|
||||
term_t term;
|
||||
XLogRecPtr lsn;
|
||||
} TermSwitchEntry;
|
||||
} TermSwitchEntry;
|
||||
|
||||
typedef struct TermHistory
|
||||
{
|
||||
uint32 n_entries;
|
||||
TermSwitchEntry *entries;
|
||||
} TermHistory;
|
||||
} TermHistory;
|
||||
|
||||
/* Vote itself, sent from safekeeper to proposer */
|
||||
typedef struct VoteResponse
|
||||
@@ -213,7 +227,7 @@ typedef struct VoteResponse
|
||||
* recovery of some safekeeper */
|
||||
TermHistory termHistory;
|
||||
XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
|
||||
} VoteResponse;
|
||||
} VoteResponse;
|
||||
|
||||
/*
|
||||
* Proposer -> Acceptor message announcing proposer is elected and communicating
|
||||
@@ -229,7 +243,7 @@ typedef struct ProposerElected
|
||||
TermHistory *termHistory;
|
||||
/* timeline globally starts at this LSN */
|
||||
XLogRecPtr timelineStartLsn;
|
||||
} ProposerElected;
|
||||
} ProposerElected;
|
||||
|
||||
/*
|
||||
* Header of request with WAL message sent from proposer to safekeeper.
|
||||
@@ -254,7 +268,7 @@ typedef struct AppendRequestHeader
|
||||
*/
|
||||
XLogRecPtr truncateLsn;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
} AppendRequestHeader;
|
||||
} AppendRequestHeader;
|
||||
|
||||
/*
|
||||
* Hot standby feedback received from replica
|
||||
@@ -264,7 +278,7 @@ typedef struct HotStandbyFeedback
|
||||
TimestampTz ts;
|
||||
FullTransactionId xmin;
|
||||
FullTransactionId catalog_xmin;
|
||||
} HotStandbyFeedback;
|
||||
} HotStandbyFeedback;
|
||||
|
||||
typedef struct PageserverFeedback
|
||||
{
|
||||
@@ -275,7 +289,7 @@ typedef struct PageserverFeedback
|
||||
XLogRecPtr disk_consistent_lsn;
|
||||
XLogRecPtr remote_consistent_lsn;
|
||||
TimestampTz replytime;
|
||||
} PageserverFeedback;
|
||||
} PageserverFeedback;
|
||||
|
||||
typedef struct WalproposerShmemState
|
||||
{
|
||||
@@ -283,7 +297,7 @@ typedef struct WalproposerShmemState
|
||||
PageserverFeedback feedback;
|
||||
term_t mineLastElectedTerm;
|
||||
pg_atomic_uint64 backpressureThrottlingTime;
|
||||
} WalproposerShmemState;
|
||||
} WalproposerShmemState;
|
||||
|
||||
/*
|
||||
* Report safekeeper state to proposer
|
||||
@@ -307,22 +321,17 @@ typedef struct AppendResponse
|
||||
/* and custom neon feedback. */
|
||||
/* This part of the message is extensible. */
|
||||
PageserverFeedback rf;
|
||||
} AppendResponse;
|
||||
} AppendResponse;
|
||||
|
||||
/* PageserverFeedback is extensible part of the message that is parsed separately */
|
||||
/* Other fields are fixed part */
|
||||
#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
|
||||
|
||||
struct WalProposer;
|
||||
typedef struct WalProposer WalProposer;
|
||||
|
||||
/*
|
||||
* Descriptor of safekeeper
|
||||
*/
|
||||
typedef struct Safekeeper
|
||||
{
|
||||
WalProposer *wp;
|
||||
|
||||
char const *host;
|
||||
char const *port;
|
||||
|
||||
@@ -331,7 +340,7 @@ typedef struct Safekeeper
|
||||
*
|
||||
* May contain private information like password and should not be logged.
|
||||
*/
|
||||
char conninfo[MAXCONNINFO];
|
||||
char conninfo[MAXCONNINFO];
|
||||
|
||||
/*
|
||||
* postgres protocol connection to the WAL acceptor
|
||||
@@ -364,12 +373,27 @@ typedef struct Safekeeper
|
||||
int eventPos; /* position in wait event set. Equal to -1 if*
|
||||
* no event */
|
||||
SafekeeperState state; /* safekeeper state machine state */
|
||||
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
|
||||
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
|
||||
AcceptorGreeting greetResponse; /* acceptor greeting */
|
||||
VoteResponse voteResponse; /* the vote */
|
||||
AppendResponse appendResponse; /* feedback for master */
|
||||
} Safekeeper;
|
||||
|
||||
extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
|
||||
extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
|
||||
extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
extern void WalProposerPoll(void);
|
||||
extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
|
||||
PageserverFeedback *rf);
|
||||
extern void StartProposerReplication(StartReplicationCmd *cmd);
|
||||
|
||||
extern Size WalproposerShmemSize(void);
|
||||
extern bool WalproposerShmemInit(void);
|
||||
extern void replication_feedback_set(PageserverFeedback *rf);
|
||||
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
|
||||
|
||||
/* libpqwalproposer hooks & helper type */
|
||||
|
||||
/* Re-exported PostgresPollingStatusType */
|
||||
typedef enum
|
||||
{
|
||||
@@ -382,7 +406,7 @@ typedef enum
|
||||
* 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
|
||||
* We've removed it here to avoid clutter.
|
||||
*/
|
||||
} WalProposerConnectPollStatusType;
|
||||
} WalProposerConnectPollStatusType;
|
||||
|
||||
/* Re-exported and modified ExecStatusType */
|
||||
typedef enum
|
||||
@@ -407,7 +431,7 @@ typedef enum
|
||||
WP_EXEC_NEEDS_INPUT,
|
||||
/* Catch-all failure. Check PQerrorMessage. */
|
||||
WP_EXEC_FAILED,
|
||||
} WalProposerExecStatusType;
|
||||
} WalProposerExecStatusType;
|
||||
|
||||
/* Re-exported ConnStatusType */
|
||||
typedef enum
|
||||
@@ -421,252 +445,67 @@ typedef enum
|
||||
* that extra functionality, so we collect them into a single tag here.
|
||||
*/
|
||||
WP_CONNECTION_IN_PROGRESS,
|
||||
} WalProposerConnStatusType;
|
||||
} WalProposerConnStatusType;
|
||||
|
||||
/* Re-exported PQerrorMessage */
|
||||
extern char *walprop_error_message(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQstatus */
|
||||
extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQconnectStart */
|
||||
extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);
|
||||
|
||||
/* Re-exported PQconectPoll */
|
||||
extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
|
||||
|
||||
/* Blocking wrapper around PQsendQuery */
|
||||
extern bool walprop_send_query(WalProposerConn *conn, char *query);
|
||||
|
||||
/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
|
||||
extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQsocket */
|
||||
extern pgsocket walprop_socket(WalProposerConn *conn);
|
||||
|
||||
/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
|
||||
extern int walprop_flush(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQfinish */
|
||||
extern void walprop_finish(WalProposerConn *conn);
|
||||
|
||||
/*
|
||||
* Collection of hooks for walproposer, to call postgres functions,
|
||||
* read WAL and send it over the network.
|
||||
* Ergonomic wrapper around PGgetCopyData
|
||||
*
|
||||
* Reads a CopyData block from a safekeeper, setting *amount to the number
|
||||
* of bytes returned.
|
||||
*
|
||||
* This function is allowed to assume certain properties specific to the
|
||||
* protocol with the safekeepers, so it should not be used as-is for any
|
||||
* other purpose.
|
||||
*
|
||||
* Note: If possible, using <AsyncRead> is generally preferred, because it
|
||||
* performs a bit of extra checking work that's always required and is normally
|
||||
* somewhat verbose.
|
||||
*/
|
||||
typedef struct walproposer_api
|
||||
{
|
||||
/*
|
||||
* Get WalproposerShmemState. This is used to store information about last
|
||||
* elected term.
|
||||
*/
|
||||
WalproposerShmemState *(*get_shmem_state) (void);
|
||||
|
||||
/*
|
||||
* Start receiving notifications about new WAL. This is an infinite loop
|
||||
* which calls WalProposerBroadcast() and WalProposerPoll() to send the
|
||||
* WAL.
|
||||
*/
|
||||
void (*start_streaming) (WalProposer *wp, XLogRecPtr startpos);
|
||||
|
||||
/* Get pointer to the latest available WAL. */
|
||||
XLogRecPtr (*get_flush_rec_ptr) (void);
|
||||
|
||||
/* Get current time. */
|
||||
TimestampTz (*get_current_timestamp) (void);
|
||||
|
||||
/* Get postgres timeline. */
|
||||
TimeLineID (*get_timeline_id) (void);
|
||||
|
||||
/* Current error message, aka PQerrorMessage. */
|
||||
char *(*conn_error_message) (WalProposerConn *conn);
|
||||
|
||||
/* Connection status, aka PQstatus. */
|
||||
WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
|
||||
|
||||
/* Start the connection, aka PQconnectStart. */
|
||||
WalProposerConn *(*conn_connect_start) (char *conninfo);
|
||||
|
||||
/* Poll an asynchronous connection, aka PQconnectPoll. */
|
||||
WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
|
||||
|
||||
/* Send a blocking SQL query, aka PQsendQuery. */
|
||||
bool (*conn_send_query) (WalProposerConn *conn, char *query);
|
||||
|
||||
/* Read the query result, aka PQgetResult. */
|
||||
WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
|
||||
|
||||
/* Flush buffer to the network, aka PQflush. */
|
||||
int (*conn_flush) (WalProposerConn *conn);
|
||||
|
||||
/* Close the connection, aka PQfinish. */
|
||||
void (*conn_finish) (WalProposerConn *conn);
|
||||
|
||||
/* Try to read CopyData message, aka PQgetCopyData. */
|
||||
PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
|
||||
|
||||
/* Try to write CopyData message, aka PQputCopyData. */
|
||||
PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
|
||||
|
||||
/* Blocking CopyData write, aka PQputCopyData + PQflush. */
|
||||
bool (*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
|
||||
|
||||
/* Download WAL from startpos to endpos and make it available locally. */
|
||||
bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
|
||||
/* Read WAL from disk to buf. */
|
||||
void (*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
|
||||
|
||||
/* Allocate WAL reader. */
|
||||
XLogReaderState *(*wal_reader_allocate) (void);
|
||||
|
||||
/* Deallocate event set. */
|
||||
void (*free_event_set) (void);
|
||||
|
||||
/* Initialize event set. */
|
||||
void (*init_event_set) (int n_safekeepers);
|
||||
|
||||
/* Update events for an existing safekeeper connection. */
|
||||
void (*update_event_set) (Safekeeper *sk, uint32 events);
|
||||
|
||||
/* Add a new safekeeper connection to the event set. */
|
||||
void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
|
||||
|
||||
/*
|
||||
* Wait until some event happens: - timeout is reached - socket event for
|
||||
* safekeeper connection - new WAL is available
|
||||
*
|
||||
* Returns 0 if timeout is reached, 1 if some event happened. Updates
|
||||
* events mask to indicate events and sets sk to the safekeeper which has
|
||||
* an event.
|
||||
*/
|
||||
int (*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
|
||||
|
||||
/* Read random bytes. */
|
||||
bool (*strong_random) (void *buf, size_t len);
|
||||
|
||||
/*
|
||||
* Get a basebackup LSN. Used to cross-validate with the latest available
|
||||
* LSN on the safekeepers.
|
||||
*/
|
||||
XLogRecPtr (*get_redo_start_lsn) (void);
|
||||
|
||||
/*
|
||||
* Finish sync safekeepers with the given LSN. This function should not
|
||||
* return and should exit the program.
|
||||
*/
|
||||
void (*finish_sync_safekeepers) (XLogRecPtr lsn);
|
||||
|
||||
/*
|
||||
* Called after every new message from the safekeeper. Used to propagate
|
||||
* backpressure feedback and to confirm WAL persistence (has been commited
|
||||
* on the quorum of safekeepers).
|
||||
*/
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
|
||||
|
||||
/*
|
||||
* Called on peer_horizon_lsn updates. Used to advance replication slot
|
||||
* and to free up disk space by deleting unnecessary WAL.
|
||||
*/
|
||||
void (*confirm_wal_streamed) (XLogRecPtr lsn);
|
||||
} walproposer_api;
|
||||
extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount);
|
||||
|
||||
/*
|
||||
* Configuration of the WAL proposer.
|
||||
* Ergonomic wrapper around PQputCopyData + PQflush
|
||||
*
|
||||
* Starts to write a CopyData block to a safekeeper.
|
||||
*
|
||||
* For information on the meaning of return codes, refer to PGAsyncWriteResult.
|
||||
*/
|
||||
typedef struct WalProposerConfig
|
||||
{
|
||||
/* hex-encoded TenantId cstr */
|
||||
char *neon_tenant;
|
||||
|
||||
/* hex-encoded TimelineId cstr */
|
||||
char *neon_timeline;
|
||||
|
||||
/*
|
||||
* Comma-separated list of safekeepers, in the following format:
|
||||
* host1:port1,host2:port2,host3:port3
|
||||
*
|
||||
* This cstr should be editable.
|
||||
*/
|
||||
char *safekeepers_list;
|
||||
|
||||
/*
|
||||
* WalProposer reconnects to offline safekeepers once in this interval.
|
||||
* Time is in milliseconds.
|
||||
*/
|
||||
int safekeeper_reconnect_timeout;
|
||||
|
||||
/*
|
||||
* WalProposer terminates the connection if it doesn't receive any message
|
||||
* from the safekeeper in this interval. Time is in milliseconds.
|
||||
*/
|
||||
int safekeeper_connection_timeout;
|
||||
|
||||
/*
|
||||
* WAL segment size. Will be passed to safekeepers in greet request. Also
|
||||
* used to detect page headers.
|
||||
*/
|
||||
int wal_segment_size;
|
||||
|
||||
/*
|
||||
* If safekeeper was started in sync mode, walproposer will not subscribe
|
||||
* for new WAL and will exit when quorum of safekeepers will be synced to
|
||||
* the latest available LSN.
|
||||
*/
|
||||
bool syncSafekeepers;
|
||||
|
||||
/* Will be passed to safekeepers in greet request. */
|
||||
uint64 systemId;
|
||||
} WalProposerConfig;
|
||||
|
||||
extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size);
|
||||
|
||||
/*
|
||||
* WAL proposer state.
|
||||
* Blocking equivalent to walprop_async_write_fn
|
||||
*
|
||||
* Returns 'true' if successful, 'false' on failure.
|
||||
*/
|
||||
typedef struct WalProposer
|
||||
{
|
||||
WalProposerConfig *config;
|
||||
int n_safekeepers;
|
||||
extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size);
|
||||
|
||||
/* (n_safekeepers / 2) + 1 */
|
||||
int quorum;
|
||||
|
||||
Safekeeper safekeeper[MAX_SAFEKEEPERS];
|
||||
|
||||
/* WAL has been generated up to this point */
|
||||
XLogRecPtr availableLsn;
|
||||
|
||||
/* last commitLsn broadcasted to safekeepers */
|
||||
XLogRecPtr lastSentCommitLsn;
|
||||
|
||||
ProposerGreeting greetRequest;
|
||||
|
||||
/* Vote request for safekeeper */
|
||||
VoteRequest voteRequest;
|
||||
|
||||
/*
|
||||
* Minimal LSN which may be needed for recovery of some safekeeper,
|
||||
* record-aligned (first record which might not yet received by someone).
|
||||
*/
|
||||
XLogRecPtr truncateLsn;
|
||||
|
||||
/*
|
||||
* Term of the proposer. We want our term to be highest and unique, so we
|
||||
* collect terms from safekeepers quorum, choose max and +1. After that
|
||||
* our term is fixed and must not change. If we observe that some
|
||||
* safekeeper has higher term, it means that we have another running
|
||||
* compute, so we must stop immediately.
|
||||
*/
|
||||
term_t propTerm;
|
||||
|
||||
/* term history of the proposer */
|
||||
TermHistory propTermHistory;
|
||||
|
||||
/* epoch start lsn of the proposer */
|
||||
XLogRecPtr propEpochStartLsn;
|
||||
|
||||
/* Most advanced acceptor epoch */
|
||||
term_t donorEpoch;
|
||||
|
||||
/* Most advanced acceptor */
|
||||
int donor;
|
||||
|
||||
/* timeline globally starts at this LSN */
|
||||
XLogRecPtr timelineStartLsn;
|
||||
|
||||
/* number of votes collected from safekeepers */
|
||||
int n_votes;
|
||||
|
||||
/* number of successful connections over the lifetime of walproposer */
|
||||
int n_connected;
|
||||
|
||||
/*
|
||||
* Timestamp of the last reconnection attempt. Related to
|
||||
* config->safekeeper_reconnect_timeout
|
||||
*/
|
||||
TimestampTz last_reconnect_attempt;
|
||||
|
||||
walproposer_api api;
|
||||
} WalProposer;
|
||||
|
||||
extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api api);
|
||||
extern void WalProposerStart(WalProposer *wp);
|
||||
extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
extern void WalProposerPoll(WalProposer *wp);
|
||||
extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
|
||||
PageserverFeedback *rf);
|
||||
extern uint64 BackpressureThrottlingTime(void);
|
||||
|
||||
#endif /* __NEON_WALPROPOSER_H__ */
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
659
pgxn/neon/walproposer_utils.c
Normal file
659
pgxn/neon/walproposer_utils.c
Normal file
@@ -0,0 +1,659 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/timeline.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "common/logging.h"
|
||||
#include "common/ip.h"
|
||||
#include "funcapi.h"
|
||||
#include "libpq/libpq.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "miscadmin.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "replication/slot.h"
|
||||
#include "walproposer_utils.h"
|
||||
#include "replication/walsender_private.h"
|
||||
|
||||
#include "storage/ipc.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/ps_status.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
#include <netinet/tcp.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
#include "access/xlogutils.h"
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
#include "utils/guc.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* These variables are used similarly to openLogFile/SegNo,
|
||||
* but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
|
||||
* corresponding the filename of walpropFile.
|
||||
*/
|
||||
static int walpropFile = -1;
|
||||
static TimeLineID walpropFileTLI = 0;
|
||||
static XLogSegNo walpropSegNo = 0;
|
||||
|
||||
/* START cloned file-local variables and functions from walsender.c */
|
||||
|
||||
/*
|
||||
* How far have we sent WAL already? This is also advertised in
|
||||
* MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
|
||||
*/
|
||||
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
|
||||
|
||||
static void WalSndLoop(void);
|
||||
static void XLogBroadcastWalProposer(void);
|
||||
/* END cloned file-level variables and functions from walsender.c */
|
||||
|
||||
int
|
||||
CompareLsn(const void *a, const void *b)
|
||||
{
|
||||
XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
|
||||
XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
|
||||
|
||||
if (lsn1 < lsn2)
|
||||
return -1;
|
||||
else if (lsn1 == lsn2)
|
||||
return 0;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Returns a human-readable string corresonding to the SafekeeperState
|
||||
*
|
||||
* The string should not be freed.
|
||||
*
|
||||
* The strings are intended to be used as a prefix to "state", e.g.:
|
||||
*
|
||||
* elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
|
||||
*
|
||||
* If this sort of phrasing doesn't fit the message, instead use something like:
|
||||
*
|
||||
* elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
|
||||
*/
|
||||
char *
|
||||
FormatSafekeeperState(SafekeeperState state)
|
||||
{
|
||||
char *return_val = NULL;
|
||||
|
||||
switch (state)
|
||||
{
|
||||
case SS_OFFLINE:
|
||||
return_val = "offline";
|
||||
break;
|
||||
case SS_CONNECTING_READ:
|
||||
case SS_CONNECTING_WRITE:
|
||||
return_val = "connecting";
|
||||
break;
|
||||
case SS_WAIT_EXEC_RESULT:
|
||||
return_val = "receiving query result";
|
||||
break;
|
||||
case SS_HANDSHAKE_RECV:
|
||||
return_val = "handshake (receiving)";
|
||||
break;
|
||||
case SS_VOTING:
|
||||
return_val = "voting";
|
||||
break;
|
||||
case SS_WAIT_VERDICT:
|
||||
return_val = "wait-for-verdict";
|
||||
break;
|
||||
case SS_SEND_ELECTED_FLUSH:
|
||||
return_val = "send-announcement-flush";
|
||||
break;
|
||||
case SS_IDLE:
|
||||
return_val = "idle";
|
||||
break;
|
||||
case SS_ACTIVE:
|
||||
return_val = "active";
|
||||
break;
|
||||
}
|
||||
|
||||
Assert(return_val != NULL);
|
||||
|
||||
return return_val;
|
||||
}
|
||||
|
||||
/* Asserts that the provided events are expected for given safekeeper's state */
|
||||
void
|
||||
AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
{
|
||||
uint32 expected = SafekeeperStateDesiredEvents(sk->state);
|
||||
|
||||
/*
|
||||
* The events are in-line with what we're expecting, under two conditions:
|
||||
* (a) if we aren't expecting anything, `events` has no read- or
|
||||
* write-ready component. (b) if we are expecting something, there's
|
||||
* overlap (i.e. `events & expected != 0`)
|
||||
*/
|
||||
bool events_ok_for_state; /* long name so the `Assert` is more
|
||||
* clear later */
|
||||
|
||||
if (expected == WL_NO_EVENTS)
|
||||
events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0);
|
||||
else
|
||||
events_ok_for_state = ((events & expected) != 0);
|
||||
|
||||
if (!events_ok_for_state)
|
||||
{
|
||||
/*
|
||||
* To give a descriptive message in the case of failure, we use elog
|
||||
* and then an assertion that's guaranteed to fail.
|
||||
*/
|
||||
elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
|
||||
Assert(events_ok_for_state);
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns the set of events a safekeeper in this state should be waiting on
|
||||
*
|
||||
* This will return WL_NO_EVENTS (= 0) for some events. */
|
||||
uint32
|
||||
SafekeeperStateDesiredEvents(SafekeeperState state)
|
||||
{
|
||||
uint32 result = WL_NO_EVENTS;
|
||||
|
||||
/* If the state doesn't have a modifier, we can check the base state */
|
||||
switch (state)
|
||||
{
|
||||
/* Connecting states say what they want in the name */
|
||||
case SS_CONNECTING_READ:
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
case SS_CONNECTING_WRITE:
|
||||
result = WL_SOCKET_WRITEABLE;
|
||||
break;
|
||||
|
||||
/* Reading states need the socket to be read-ready to continue */
|
||||
case SS_WAIT_EXEC_RESULT:
|
||||
case SS_HANDSHAKE_RECV:
|
||||
case SS_WAIT_VERDICT:
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
|
||||
/*
|
||||
* Idle states use read-readiness as a sign that the connection
|
||||
* has been disconnected.
|
||||
*/
|
||||
case SS_VOTING:
|
||||
case SS_IDLE:
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
|
||||
/*
|
||||
* Flush states require write-ready for flushing. Active state
|
||||
* does both reading and writing.
|
||||
*
|
||||
* TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
|
||||
* should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
|
||||
*/
|
||||
case SS_SEND_ELECTED_FLUSH:
|
||||
case SS_ACTIVE:
|
||||
result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
|
||||
break;
|
||||
|
||||
/* The offline state expects no events. */
|
||||
case SS_OFFLINE:
|
||||
result = WL_NO_EVENTS;
|
||||
break;
|
||||
|
||||
default:
|
||||
Assert(false);
|
||||
break;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Returns a human-readable string corresponding to the event set
|
||||
*
|
||||
* If the events do not correspond to something set as the `events` field of a `WaitEvent`, the
|
||||
* returned string may be meaingless.
|
||||
*
|
||||
* The string should not be freed. It should also not be expected to remain the same between
|
||||
* function calls. */
|
||||
char *
|
||||
FormatEvents(uint32 events)
|
||||
{
|
||||
static char return_str[8];
|
||||
|
||||
/* Helper variable to check if there's extra bits */
|
||||
uint32 all_flags = WL_LATCH_SET
|
||||
| WL_SOCKET_READABLE
|
||||
| WL_SOCKET_WRITEABLE
|
||||
| WL_TIMEOUT
|
||||
| WL_POSTMASTER_DEATH
|
||||
| WL_EXIT_ON_PM_DEATH
|
||||
| WL_SOCKET_CONNECTED;
|
||||
|
||||
/*
|
||||
* The formatting here isn't supposed to be *particularly* useful -- it's
|
||||
* just to give an sense of what events have been triggered without
|
||||
* needing to remember your powers of two.
|
||||
*/
|
||||
|
||||
return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_';
|
||||
return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_';
|
||||
return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_';
|
||||
return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_';
|
||||
return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_';
|
||||
return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_';
|
||||
return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_';
|
||||
|
||||
if (events & (~all_flags))
|
||||
{
|
||||
elog(WARNING, "Event formatting found unexpected component %d",
|
||||
events & (~all_flags));
|
||||
return_str[6] = '*';
|
||||
return_str[7] = '\0';
|
||||
}
|
||||
else
|
||||
return_str[6] = '\0';
|
||||
|
||||
return (char *) &return_str;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a character which represents a hexadecimal digit to an integer.
|
||||
*
|
||||
* Returns -1 if the character is not a hexadecimal digit.
|
||||
*/
|
||||
static int
|
||||
HexDecodeChar(char c)
|
||||
{
|
||||
if (c >= '0' && c <= '9')
|
||||
return c - '0';
|
||||
if (c >= 'a' && c <= 'f')
|
||||
return c - 'a' + 10;
|
||||
if (c >= 'A' && c <= 'F')
|
||||
return c - 'A' + 10;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decode a hex string into a byte string, 2 hex chars per byte.
|
||||
*
|
||||
* Returns false if invalid characters are encountered; otherwise true.
|
||||
*/
|
||||
bool
|
||||
HexDecodeString(uint8 *result, char *input, int nbytes)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nbytes; ++i)
|
||||
{
|
||||
int n1 = HexDecodeChar(input[i * 2]);
|
||||
int n2 = HexDecodeChar(input[i * 2 + 1]);
|
||||
|
||||
if (n1 < 0 || n2 < 0)
|
||||
return false;
|
||||
result[i] = n1 * 16 + n2;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order
|
||||
* --------------------------------
|
||||
*/
|
||||
uint32
|
||||
pq_getmsgint32_le(StringInfo msg)
|
||||
{
|
||||
uint32 n32;
|
||||
|
||||
pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
|
||||
|
||||
return n32;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order
|
||||
* --------------------------------
|
||||
*/
|
||||
uint64
|
||||
pq_getmsgint64_le(StringInfo msg)
|
||||
{
|
||||
uint64 n64;
|
||||
|
||||
pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
|
||||
|
||||
return n64;
|
||||
}
|
||||
|
||||
/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
|
||||
void
|
||||
pq_sendint32_le(StringInfo buf, uint32 i)
|
||||
{
|
||||
enlargeStringInfo(buf, sizeof(uint32));
|
||||
memcpy(buf->data + buf->len, &i, sizeof(uint32));
|
||||
buf->len += sizeof(uint32);
|
||||
}
|
||||
|
||||
/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
|
||||
void
|
||||
pq_sendint64_le(StringInfo buf, uint64 i)
|
||||
{
|
||||
enlargeStringInfo(buf, sizeof(uint64));
|
||||
memcpy(buf->data + buf->len, &i, sizeof(uint64));
|
||||
buf->len += sizeof(uint64);
|
||||
}
|
||||
|
||||
/*
|
||||
* Write XLOG data to disk.
|
||||
*/
|
||||
void
|
||||
XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
|
||||
{
|
||||
int startoff;
|
||||
int byteswritten;
|
||||
|
||||
while (nbytes > 0)
|
||||
{
|
||||
int segbytes;
|
||||
|
||||
/* Close the current segment if it's completed */
|
||||
if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
|
||||
XLogWalPropClose(recptr);
|
||||
|
||||
if (walpropFile < 0)
|
||||
{
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
/* FIXME Is it ok to use hardcoded value here? */
|
||||
TimeLineID tli = 1;
|
||||
#else
|
||||
bool use_existent = true;
|
||||
#endif
|
||||
/* Create/use new log file */
|
||||
XLByteToSeg(recptr, walpropSegNo, wal_segment_size);
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
walpropFile = XLogFileInit(walpropSegNo, tli);
|
||||
walpropFileTLI = tli;
|
||||
#else
|
||||
walpropFile = XLogFileInit(walpropSegNo, &use_existent, false);
|
||||
walpropFileTLI = ThisTimeLineID;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Calculate the start offset of the received logs */
|
||||
startoff = XLogSegmentOffset(recptr, wal_segment_size);
|
||||
|
||||
if (startoff + nbytes > wal_segment_size)
|
||||
segbytes = wal_segment_size - startoff;
|
||||
else
|
||||
segbytes = nbytes;
|
||||
|
||||
/* OK to write the logs */
|
||||
errno = 0;
|
||||
|
||||
byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff);
|
||||
if (byteswritten <= 0)
|
||||
{
|
||||
char xlogfname[MAXFNAMELEN];
|
||||
int save_errno;
|
||||
|
||||
/* if write didn't set errno, assume no disk space */
|
||||
if (errno == 0)
|
||||
errno = ENOSPC;
|
||||
|
||||
save_errno = errno;
|
||||
XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
|
||||
errno = save_errno;
|
||||
ereport(PANIC,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not write to log segment %s "
|
||||
"at offset %u, length %lu: %m",
|
||||
xlogfname, startoff, (unsigned long) segbytes)));
|
||||
}
|
||||
|
||||
/* Update state for write */
|
||||
recptr += byteswritten;
|
||||
|
||||
nbytes -= byteswritten;
|
||||
buf += byteswritten;
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the current segment if it's fully written up in the last cycle of
|
||||
* the loop.
|
||||
*/
|
||||
if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
|
||||
{
|
||||
XLogWalPropClose(recptr);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Close the current segment.
|
||||
*/
|
||||
void
|
||||
XLogWalPropClose(XLogRecPtr recptr)
|
||||
{
|
||||
Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
|
||||
|
||||
if (close(walpropFile) != 0)
|
||||
{
|
||||
char xlogfname[MAXFNAMELEN];
|
||||
|
||||
XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
|
||||
|
||||
ereport(PANIC,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not close log segment %s: %m",
|
||||
xlogfname)));
|
||||
}
|
||||
|
||||
walpropFile = -1;
|
||||
}
|
||||
|
||||
/* START of cloned functions from walsender.c */
|
||||
|
||||
/*
|
||||
* Subscribe for new WAL and stream it in the loop to safekeepers.
|
||||
*
|
||||
* At the moment, this never returns, but an ereport(ERROR) will take us back
|
||||
* to the main loop.
|
||||
*/
|
||||
void
|
||||
StartProposerReplication(StartReplicationCmd *cmd)
|
||||
{
|
||||
XLogRecPtr FlushPtr;
|
||||
TimeLineID currTLI;
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (ThisTimeLineID == 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We assume here that we're logging enough information in the WAL for
|
||||
* log-shipping, since this is checked in PostmasterMain().
|
||||
*
|
||||
* NOTE: wal_level can only change at shutdown, so in most cases it is
|
||||
* difficult for there to be WAL data that we can still see that was
|
||||
* written at wal_level='minimal'.
|
||||
*/
|
||||
|
||||
if (cmd->slotname)
|
||||
{
|
||||
ReplicationSlotAcquire(cmd->slotname, true);
|
||||
if (SlotIsLogical(MyReplicationSlot))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("cannot use a logical replication slot for physical replication")));
|
||||
|
||||
/*
|
||||
* We don't need to verify the slot's restart_lsn here; instead we
|
||||
* rely on the caller requesting the starting point to use. If the
|
||||
* WAL segment doesn't exist, we'll fail later.
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
* Select the timeline. If it was given explicitly by the client, use
|
||||
* that. Otherwise use the timeline of the last replayed record, which is
|
||||
* kept in ThisTimeLineID.
|
||||
*
|
||||
* Neon doesn't currently use PG Timelines, but it may in the future, so
|
||||
* we keep this code around to lighten the load for when we need it.
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
FlushPtr = GetFlushRecPtr(&currTLI);
|
||||
#else
|
||||
FlushPtr = GetFlushRecPtr();
|
||||
currTLI = ThisTimeLineID;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When we first start replication the standby will be behind the
|
||||
* primary. For some applications, for example synchronous
|
||||
* replication, it is important to have a clear state for this initial
|
||||
* catchup mode, so we can trigger actions when we change streaming
|
||||
* state later. We may stay in this state for a long time, which is
|
||||
* exactly why we want to be able to monitor whether or not we are
|
||||
* still here.
|
||||
*/
|
||||
WalSndSetState(WALSNDSTATE_CATCHUP);
|
||||
|
||||
/*
|
||||
* Don't allow a request to stream from a future point in WAL that
|
||||
* hasn't been flushed to disk in this server yet.
|
||||
*/
|
||||
if (FlushPtr < cmd->startpoint)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
|
||||
LSN_FORMAT_ARGS(cmd->startpoint),
|
||||
LSN_FORMAT_ARGS(FlushPtr))));
|
||||
}
|
||||
|
||||
/* Start streaming from the requested point */
|
||||
sentPtr = cmd->startpoint;
|
||||
|
||||
/* Initialize shared memory status, too */
|
||||
SpinLockAcquire(&MyWalSnd->mutex);
|
||||
MyWalSnd->sentPtr = sentPtr;
|
||||
SpinLockRelease(&MyWalSnd->mutex);
|
||||
|
||||
SyncRepInitConfig();
|
||||
|
||||
/* Infinite send loop, never returns */
|
||||
WalSndLoop();
|
||||
|
||||
WalSndSetState(WALSNDSTATE_STARTUP);
|
||||
|
||||
if (cmd->slotname)
|
||||
ReplicationSlotRelease();
|
||||
}
|
||||
|
||||
/*
|
||||
* Main loop that waits for LSN updates and calls the walproposer.
|
||||
* Synchronous replication sets latch in WalSndWakeup at walsender.c
|
||||
*/
|
||||
static void
|
||||
WalSndLoop(void)
|
||||
{
|
||||
/* Clear any already-pending wakeups */
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
for (;;)
|
||||
{
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
XLogBroadcastWalProposer();
|
||||
|
||||
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
|
||||
WalSndSetState(WALSNDSTATE_STREAMING);
|
||||
WalProposerPoll();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify walproposer about the new WAL position.
|
||||
*/
|
||||
static void
|
||||
XLogBroadcastWalProposer(void)
|
||||
{
|
||||
XLogRecPtr startptr;
|
||||
XLogRecPtr endptr;
|
||||
|
||||
/* Start from the last sent position */
|
||||
startptr = sentPtr;
|
||||
|
||||
/*
|
||||
* Streaming the current timeline on a primary.
|
||||
*
|
||||
* Attempt to send all data that's already been written out and
|
||||
* fsync'd to disk. We cannot go further than what's been written out
|
||||
* given the current implementation of WALRead(). And in any case
|
||||
* it's unsafe to send WAL that is not securely down to disk on the
|
||||
* primary: if the primary subsequently crashes and restarts, standbys
|
||||
* must not have applied any WAL that got lost on the primary.
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
endptr = GetFlushRecPtr(NULL);
|
||||
#else
|
||||
endptr = GetFlushRecPtr();
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Record the current system time as an approximation of the time at which
|
||||
* this WAL location was written for the purposes of lag tracking.
|
||||
*
|
||||
* In theory we could make XLogFlush() record a time in shmem whenever WAL
|
||||
* is flushed and we could get that time as well as the LSN when we call
|
||||
* GetFlushRecPtr() above (and likewise for the cascading standby
|
||||
* equivalent), but rather than putting any new code into the hot WAL path
|
||||
* it seems good enough to capture the time here. We should reach this
|
||||
* after XLogFlush() runs WalSndWakeupProcessRequests(), and although that
|
||||
* may take some time, we read the WAL flush pointer and take the time
|
||||
* very close to together here so that we'll get a later position if it is
|
||||
* still moving.
|
||||
*
|
||||
* Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
|
||||
* this gives us a cheap approximation for the WAL flush time for this
|
||||
* LSN.
|
||||
*
|
||||
* Note that the LSN is not necessarily the LSN for the data contained in
|
||||
* the present message; it's the end of the WAL, which might be further
|
||||
* ahead. All the lag tracking machinery cares about is finding out when
|
||||
* that arbitrary LSN is eventually reported as written, flushed and
|
||||
* applied, so that it can measure the elapsed time.
|
||||
*/
|
||||
LagTrackerWrite(endptr, GetCurrentTimestamp());
|
||||
|
||||
/* Do we have any work to do? */
|
||||
Assert(startptr <= endptr);
|
||||
if (endptr <= startptr)
|
||||
return;
|
||||
|
||||
WalProposerBroadcast(startptr, endptr);
|
||||
sentPtr = endptr;
|
||||
|
||||
/* Update shared memory status */
|
||||
{
|
||||
WalSnd *walsnd = MyWalSnd;
|
||||
|
||||
SpinLockAcquire(&walsnd->mutex);
|
||||
walsnd->sentPtr = sentPtr;
|
||||
SpinLockRelease(&walsnd->mutex);
|
||||
}
|
||||
|
||||
/* Report progress of XLOG streaming in PS display */
|
||||
if (update_process_title)
|
||||
{
|
||||
char activitymsg[50];
|
||||
|
||||
snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
|
||||
LSN_FORMAT_ARGS(sentPtr));
|
||||
set_ps_display(activitymsg);
|
||||
}
|
||||
}
|
||||
19
pgxn/neon/walproposer_utils.h
Normal file
19
pgxn/neon/walproposer_utils.h
Normal file
@@ -0,0 +1,19 @@
|
||||
#ifndef __NEON_WALPROPOSER_UTILS_H__
|
||||
#define __NEON_WALPROPOSER_UTILS_H__
|
||||
|
||||
#include "walproposer.h"
|
||||
|
||||
int CompareLsn(const void *a, const void *b);
|
||||
char *FormatSafekeeperState(SafekeeperState state);
|
||||
void AssertEventsOkForState(uint32 events, Safekeeper *sk);
|
||||
uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
|
||||
char *FormatEvents(uint32 events);
|
||||
bool HexDecodeString(uint8 *result, char *input, int nbytes);
|
||||
uint32 pq_getmsgint32_le(StringInfo msg);
|
||||
uint64 pq_getmsgint64_le(StringInfo msg);
|
||||
void pq_sendint32_le(StringInfo buf, uint32 i);
|
||||
void pq_sendint64_le(StringInfo buf, uint64 i);
|
||||
void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
|
||||
void XLogWalPropClose(XLogRecPtr recptr);
|
||||
|
||||
#endif /* __NEON_WALPROPOSER_UTILS_H__ */
|
||||
@@ -1085,32 +1085,15 @@ class AbstractNeonCli(abc.ABC):
|
||||
stderr=subprocess.PIPE,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
indent = " "
|
||||
if not res.returncode:
|
||||
stripped = res.stdout.strip()
|
||||
lines = stripped.splitlines()
|
||||
if len(lines) < 2:
|
||||
log.debug(f"Run {res.args} success: {stripped}")
|
||||
else:
|
||||
log.debug("Run %s success:\n%s" % (res.args, textwrap.indent(stripped, indent)))
|
||||
log.info(f"Run {res.args} success: {res.stdout}")
|
||||
elif check_return_code:
|
||||
# this way command output will be in recorded and shown in CI in failure message
|
||||
indent = indent * 2
|
||||
msg = textwrap.dedent(
|
||||
"""\
|
||||
Run %s failed:
|
||||
stdout:
|
||||
%s
|
||||
stderr:
|
||||
%s
|
||||
msg = f"""\
|
||||
Run {res.args} failed:
|
||||
stdout: {res.stdout}
|
||||
stderr: {res.stderr}
|
||||
"""
|
||||
)
|
||||
msg = msg % (
|
||||
res.args,
|
||||
textwrap.indent(res.stdout.strip(), indent),
|
||||
textwrap.indent(res.stderr.strip(), indent),
|
||||
)
|
||||
log.info(msg)
|
||||
raise RuntimeError(msg) from subprocess.CalledProcessError(
|
||||
res.returncode, res.args, res.stdout, res.stderr
|
||||
|
||||
@@ -116,10 +116,6 @@ def get_deletion_queue_submitted(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
|
||||
|
||||
|
||||
def get_deletion_queue_validated(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_validated_total")
|
||||
|
||||
|
||||
def get_deletion_queue_dropped(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
|
||||
|
||||
@@ -277,15 +273,12 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keep_attachment", [True, False])
|
||||
@pytest.mark.parametrize("validate_before", [True, False])
|
||||
def test_deletion_queue_recovery(
|
||||
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool, validate_before: bool
|
||||
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
|
||||
):
|
||||
"""
|
||||
:param keep_attachment: If true, we re-attach after restart. Else, we act as if some other
|
||||
node took the attachment while we were restarting.
|
||||
:param validate_before: If true, we wait for deletions to be validated before restart. This
|
||||
makes them elegible to be executed after restart, if the same node keeps the attachment.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
@@ -295,20 +288,12 @@ def test_deletion_queue_recovery(
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
failpoints = [
|
||||
# Prevent deletion lists from being executed, to build up some backlog of deletions
|
||||
("deletion-queue-before-execute", "return"),
|
||||
]
|
||||
|
||||
if not validate_before:
|
||||
failpoints.append(
|
||||
# Prevent deletion lists from being validated, we will test that they are
|
||||
# dropped properly during recovery. 'pause' is okay here because we kill
|
||||
# the pageserver with immediate=true
|
||||
("control-plane-client-validate", "pause")
|
||||
)
|
||||
|
||||
ps_http.configure_failpoints(failpoints)
|
||||
# Prevent deletion lists from being executed, to build up some backlog of deletions
|
||||
ps_http.configure_failpoints(
|
||||
[
|
||||
("deletion-queue-before-execute", "return"),
|
||||
]
|
||||
)
|
||||
|
||||
generate_uploads_and_deletions(env)
|
||||
|
||||
@@ -320,16 +305,6 @@ def test_deletion_queue_recovery(
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
|
||||
if validate_before:
|
||||
|
||||
def assert_validation_complete():
|
||||
assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
|
||||
|
||||
wait_until(20, 1, assert_validation_complete)
|
||||
# A short wait to let the DeletionHeader get written out, as this happens after
|
||||
# the validated count gets incremented.
|
||||
time.sleep(1)
|
||||
|
||||
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
@@ -352,17 +327,14 @@ def test_deletion_queue_recovery(
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
|
||||
|
||||
if keep_attachment or validate_before:
|
||||
# - If we kept the attachment, then our pre-restart deletions should execute
|
||||
# because on re-attach they were from the immediately preceding generation
|
||||
# - If we validated before restart, then the deletions should execute because the
|
||||
# deletion queue header records a validated deletion list sequence number.
|
||||
if keep_attachment:
|
||||
# If we kept the attachment, then our pre-restart deletions should have executed
|
||||
# successfully
|
||||
assert get_deletion_queue_executed(ps_http) == before_restart_depth
|
||||
else:
|
||||
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
|
||||
|
||||
# If we lost the attachment, we should have dropped our pre-restart deletions.
|
||||
assert get_deletion_queue_dropped(ps_http) == before_restart_depth
|
||||
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
|
||||
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
|
||||
Reference in New Issue
Block a user