mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 14:02:55 +00:00
This is a preparation for https://github.com/neondatabase/neon/issues/6337. The idea is to add FullAccessTimeline, which will act as a guard for tasks requiring access to WAL files. Eviction will be blocked on these tasks and WAL won't be deleted from disk until there is at least one active FullAccessTimeline. To get FullAccessTimeline, tasks call `tli.full_access_guard().await?`. After eviction is implemented, this function will be responsible for downloading missing WAL file and waiting until the download finishes. This commit also contains other small refactorings: - Separate `get_tenant_dir` and `get_timeline_dir` functions for building a local path. This is useful for looking at usages and finding tasks requiring access to local filesystem. - `timeline_manager` is now responsible for spawning all background tasks - WAL removal task is now spawned instantly after horizon is updated
772 lines
29 KiB
Rust
//! This module has everything to deal with WAL -- reading and writing to disk.
|
|
//!
|
|
//! Safekeeper WAL is stored in the timeline directory, in format similar to pg_wal.
|
|
//! PG timeline is always 1, so WAL segments usually have names like this:
|
|
//! - 000000010000000000000001
|
|
//! - 000000010000000000000002.partial
|
|
//!
|
|
//! Note that last file has `.partial` suffix, that's different from postgres.
|
|
|
|
use anyhow::{bail, Context, Result};
|
|
use bytes::Bytes;
|
|
use camino::{Utf8Path, Utf8PathBuf};
|
|
use futures::future::BoxFuture;
|
|
use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName};
|
|
use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI};
|
|
use remote_storage::RemotePath;
|
|
use std::cmp::{max, min};
|
|
use std::io::{self, SeekFrom};
|
|
use std::pin::Pin;
|
|
use tokio::fs::{self, remove_file, File, OpenOptions};
|
|
use tokio::io::{AsyncRead, AsyncWriteExt};
|
|
use tokio::io::{AsyncReadExt, AsyncSeekExt};
|
|
use tracing::*;
|
|
use utils::crashsafe::durable_rename;
|
|
|
|
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
|
|
use crate::state::TimelinePersistentState;
|
|
use crate::wal_backup::{read_object, remote_timeline_path};
|
|
use crate::SafeKeeperConf;
|
|
use postgres_ffi::waldecoder::WalStreamDecoder;
|
|
use postgres_ffi::XLogFileName;
|
|
use postgres_ffi::XLOG_BLCKSZ;
|
|
use pq_proto::SystemId;
|
|
use utils::{id::TenantTimelineId, lsn::Lsn};
|
|
|
|
/// Interface for storing WAL of a single timeline. Implemented by
/// [`PhysicalStorage`] below; write methods are async because they may
/// perform disk I/O.
#[async_trait::async_trait]
pub trait Storage {
    /// LSN of last durably stored WAL record.
    fn flush_lsn(&self) -> Lsn;

    /// Initialize segment by creating proper long header at the beginning of
    /// the segment and short header at the page of given LSN. This is only used
    /// for timeline initialization because compute will stream data only since
    /// init_lsn. Other segment headers are included in compute stream.
    async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>;

    /// Write piece of WAL from buf to disk, but not necessarily sync it.
    async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;

    /// Truncate WAL at specified LSN, which must be the end of WAL record.
    async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;

    /// Durably store WAL on disk, up to the last written WAL record.
    async fn flush_wal(&mut self) -> Result<()>;

    /// Remove all segments <= given segno. Returns function doing that as we
    /// want to perform it without timeline lock.
    fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>>;

    /// Release resources associated with the storage -- technically, close FDs.
    /// Currently we don't remove timelines until restart (#3146), so need to
    /// spare descriptors. This would be useful for temporary tli detach as
    /// well.
    fn close(&mut self) {}

    /// Get metrics for this timeline.
    fn get_metrics(&self) -> WalStorageMetrics;
}
|
|
|
|
/// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes
/// for better performance. Storage is initialized in the constructor.
///
/// WAL is stored in segments, each segment is a file. Last segment has ".partial" suffix in
/// its filename and may be not fully flushed.
///
/// Relationship of LSNs:
/// `write_lsn` >= `write_record_lsn` >= `flush_record_lsn`
///
/// When storage is created first time, all LSNs are zeroes and there are no segments on disk.
pub struct PhysicalStorage {
    // Per-timeline metrics, exposed via get_metrics().
    metrics: WalStorageMetrics,
    // Directory holding this timeline's WAL segment files.
    timeline_dir: Utf8PathBuf,
    // Safekeeper configuration; no_sync is consulted before fdatasync calls.
    conf: SafeKeeperConf,

    /// Size of WAL segment in bytes.
    wal_seg_size: usize,
    // Full PostgreSQL version number (e.g. 140005); divided by 10000 where
    // only the major version is needed.
    pg_version: u32,
    // PostgreSQL system identifier, used when generating segment headers.
    system_id: u64,

    /// Written to disk, but possibly still in the cache and not fully persisted.
    /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
    write_lsn: Lsn,

    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
    write_record_lsn: Lsn,

    /// The LSN of the last WAL record flushed to disk.
    flush_record_lsn: Lsn,

    /// Decoder is required for detecting boundaries of WAL records.
    decoder: WalStreamDecoder,

    /// Cached open file for the last segment.
    ///
    /// If Some(file) is open, then it always:
    /// - has ".partial" suffix
    /// - points to write_lsn, so no seek is needed for writing
    /// - doesn't point to the end of the segment
    file: Option<File>,

    /// When false, we have just initialized storage using the LSN from find_end_of_wal().
    /// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular,
    /// there can be a case with unexpected .partial file.
    ///
    /// Imagine the following:
    /// - 000000010000000000000001
    ///   - it was fully written, but the last record is split between 2 segments
    ///   - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in the end of this segment
    ///   - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were initialized to 0/1FFFFF0
    /// - 000000010000000000000002.partial
    ///   - it has only 1 byte written, which is not enough to make a full WAL record
    ///
    /// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
    /// This flag will be set to true after the first truncate_wal() call.
    ///
    /// [`write_lsn`]: Self::write_lsn
    is_truncated_after_restart: bool,
}
|
|
|
|
impl PhysicalStorage {
    /// Create new storage. If commit_lsn is not zero, flush_lsn is tried to be restored from
    /// the disk. Otherwise, all LSNs are set to zero.
    pub fn new(
        ttid: &TenantTimelineId,
        timeline_dir: Utf8PathBuf,
        conf: &SafeKeeperConf,
        state: &TimelinePersistentState,
    ) -> Result<PhysicalStorage> {
        let wal_seg_size = state.server.wal_seg_size as usize;

        // Find out where stored WAL ends, starting at commit_lsn which is a
        // known recent record boundary (unless we don't have WAL at all).
        //
        // NB: find_end_of_wal MUST be backwards compatible with the previously
        // written WAL. If find_end_of_wal fails to read any WAL written by an
        // older version of the code, we could lose data forever.
        let write_lsn = if state.commit_lsn == Lsn(0) {
            Lsn(0)
        } else {
            // pg_version is e.g. 140005; the major version selects the
            // version-specific find_end_of_wal implementation.
            let version = state.server.pg_version / 10000;

            dispatch_pgversion!(
                version,
                pgv::xlog_utils::find_end_of_wal(
                    timeline_dir.as_std_path(),
                    wal_seg_size,
                    state.commit_lsn,
                )?,
                bail!("unsupported postgres version: {}", version)
            )
        };

        // TODO: do we really know that write_lsn is fully flushed to disk?
        // If not, maybe it's better to call fsync() here to be sure?
        let flush_lsn = write_lsn;

        debug!(
            "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}",
            ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn,
        );
        if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn {
            warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id);
        }

        Ok(PhysicalStorage {
            metrics: WalStorageMetrics::default(),
            timeline_dir,
            conf: conf.clone(),
            wal_seg_size,
            pg_version: state.server.pg_version,
            system_id: state.server.system_id,
            write_lsn,
            write_record_lsn: write_lsn,
            flush_record_lsn: flush_lsn,
            decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
            file: None,
            is_truncated_after_restart: false,
        })
    }

    /// Get all known state of the storage: (write_lsn, write_record_lsn,
    /// flush_record_lsn, whether a segment file is currently cached open).
    pub fn internal_state(&self) -> (Lsn, Lsn, Lsn, bool) {
        (
            self.write_lsn,
            self.write_record_lsn,
            self.flush_record_lsn,
            self.file.is_some(),
        )
    }

    /// Call fdatasync if config requires so. Flush duration is recorded in
    /// metrics; a no_sync config skips the sync entirely (for tests/benchmarks).
    async fn fdatasync_file(&mut self, file: &File) -> Result<()> {
        if !self.conf.no_sync {
            self.metrics
                .observe_flush_seconds(time_io_closure(file.sync_data()).await?);
        }
        Ok(())
    }

    /// Open or create WAL segment file. Caller must call seek to the wanted position.
    /// Returns `file` and `is_partial`.
    async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
        let (wal_file_path, wal_file_partial_path) =
            wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;

        // Try to open already completed segment
        if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await {
            Ok((file, false))
        } else if let Ok(file) = OpenOptions::new()
            .write(true)
            .open(&wal_file_partial_path)
            .await
        {
            // Try to open existing partial file
            Ok((file, true))
        } else {
            // Create and fill new partial file
            //
            // We're using fdatasync during WAL writing, so file size must not
            // change; to this end it is filled with zeros here. To avoid using
            // half initialized segment, first bake it under tmp filename and
            // then rename.
            let tmp_path = self.timeline_dir.join("waltmp");
            #[allow(clippy::suspicious_open_options)]
            let mut file = OpenOptions::new()
                .create(true)
                .write(true)
                .open(&tmp_path)
                .await
                .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?;

            write_zeroes(&mut file, self.wal_seg_size).await?;

            // Note: this doesn't get into observe_flush_seconds metric. But
            // segment init should be separate metric, if any.
            if let Err(e) =
                durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await
            {
                // Probably rename succeeded, but fsync of it failed. Remove
                // the file then to avoid using it.
                remove_file(wal_file_partial_path)
                    .await
                    .or_else(utils::fs_ext::ignore_not_found)?;
                return Err(e.into());
            }
            Ok((file, true))
        }
    }

    /// Write WAL bytes, which are known to be located in a single WAL segment.
    /// Reuses the cached open file if present, otherwise opens the (partial)
    /// segment and seeks to the write offset.
    async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
        let mut file = if let Some(file) = self.file.take() {
            file
        } else {
            let (mut file, is_partial) = self.open_or_create(segno).await?;
            assert!(is_partial, "unexpected write into non-partial segment file");
            file.seek(SeekFrom::Start(xlogoff as u64)).await?;
            file
        };

        file.write_all(buf).await?;
        // Note: flush just ensures write above reaches the OS (this is not
        // needed in case of sync IO as Write::write there calls directly write
        // syscall, but needed in case of async). It does *not* fsyncs the file.
        file.flush().await?;

        if xlogoff + buf.len() == self.wal_seg_size {
            // If we reached the end of a WAL segment, flush and close it.
            self.fdatasync_file(&file).await?;

            // Rename partial file to completed file
            let (wal_file_path, wal_file_partial_path) =
                wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
            fs::rename(wal_file_partial_path, wal_file_path).await?;
        } else {
            // otherwise, file can be reused later
            self.file = Some(file);
        }

        Ok(())
    }

    /// Writes WAL to the segment files, until everything is written. If some segments
    /// are fully written, they are flushed to disk. The last (partial) segment can
    /// be flushed separately later.
    ///
    /// Updates `write_lsn`.
    async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
        if self.write_lsn != pos {
            // need to flush the file before discarding it
            if let Some(file) = self.file.take() {
                self.fdatasync_file(&file).await?;
            }

            self.write_lsn = pos;
        }

        while !buf.is_empty() {
            // Extract WAL location for this block
            let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size);
            let segno = self.write_lsn.segment_number(self.wal_seg_size);

            // If crossing a WAL boundary, only write up until we reach wal segment size.
            let bytes_write = if xlogoff + buf.len() > self.wal_seg_size {
                self.wal_seg_size - xlogoff
            } else {
                buf.len()
            };

            self.write_in_segment(segno, xlogoff, &buf[..bytes_write])
                .await?;
            self.write_lsn += bytes_write as u64;
            buf = &buf[bytes_write..];
        }

        Ok(())
    }
}
|
|
|
|
#[async_trait::async_trait]
impl Storage for PhysicalStorage {
    /// flush_lsn returns LSN of last durably stored WAL record.
    fn flush_lsn(&self) -> Lsn {
        self.flush_record_lsn
    }

    /// Write a fully zero-initialized first segment with proper headers at
    /// init_lsn. Note: the file is *not* fsynced by this function.
    async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> {
        let segno = init_lsn.segment_number(self.wal_seg_size);
        let (mut file, _) = self.open_or_create(segno).await?;
        let major_pg_version = self.pg_version / 10000;
        let wal_seg =
            postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?;
        file.seek(SeekFrom::Start(0)).await?;
        file.write_all(&wal_seg).await?;
        file.flush().await?;
        info!("initialized segno {} at lsn {}", segno, init_lsn);
        // note: file is *not* fsynced
        Ok(())
    }

    /// Write WAL to disk.
    async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
        // Disallow any non-sequential writes, which can result in gaps or overwrites.
        // If we need to move the pointer, use truncate_wal() instead.
        if self.write_lsn > startpos {
            bail!(
                "write_wal rewrites WAL written before, write_lsn={}, startpos={}",
                self.write_lsn,
                startpos
            );
        }
        if self.write_lsn < startpos && self.write_lsn != Lsn(0) {
            bail!(
                "write_wal creates gap in written WAL, write_lsn={}, startpos={}",
                self.write_lsn,
                startpos
            );
        }

        let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?;
        // WAL is written, updating write metrics
        self.metrics.observe_write_seconds(write_seconds);
        self.metrics.observe_write_bytes(buf.len());

        // figure out last record's end lsn for reporting (if we got the
        // whole record)
        if self.decoder.available() != startpos {
            info!(
                "restart decoder from {} to {}",
                self.decoder.available(),
                startpos,
            );
            let pg_version = self.decoder.pg_version;
            self.decoder = WalStreamDecoder::new(startpos, pg_version);
        }
        self.decoder.feed_bytes(buf);
        // Advance write_record_lsn to the end of the last complete record seen.
        loop {
            match self.decoder.poll_decode()? {
                None => break, // no full record yet
                Some((lsn, _rec)) => {
                    self.write_record_lsn = lsn;
                }
            }
        }

        Ok(())
    }

    /// Durably persist written WAL up to write_record_lsn via fdatasync.
    async fn flush_wal(&mut self) -> Result<()> {
        if self.flush_record_lsn == self.write_record_lsn {
            // no need to do extra flush
            return Ok(());
        }

        if let Some(unflushed_file) = self.file.take() {
            self.fdatasync_file(&unflushed_file).await?;
            self.file = Some(unflushed_file);
        } else {
            // We have unflushed data (write_lsn != flush_lsn), but no file.
            // This should only happen if last file was fully written and flushed,
            // but haven't updated flush_lsn yet.
            if self.write_lsn.segment_offset(self.wal_seg_size) != 0 {
                bail!(
                    "unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}",
                    self.write_lsn,
                    self.flush_record_lsn
                );
            }
        }

        // everything is flushed now, let's update flush_lsn
        self.flush_record_lsn = self.write_record_lsn;
        Ok(())
    }

    /// Truncate written WAL by removing all WAL segments after the given LSN.
    /// end_pos must point to the end of the WAL record.
    async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
        // Streaming must not create a hole, so truncate cannot be called on non-written lsn
        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
            bail!(
                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
                self.write_lsn,
                end_pos
            );
        }

        // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
        // disk (this happens on each connect).
        if self.is_truncated_after_restart
            && end_pos == self.write_lsn
            && end_pos == self.flush_record_lsn
        {
            return Ok(());
        }

        // Close previously opened file, if any
        if let Some(unflushed_file) = self.file.take() {
            self.fdatasync_file(&unflushed_file).await?;
        }

        let xlogoff = end_pos.segment_offset(self.wal_seg_size);
        let segno = end_pos.segment_number(self.wal_seg_size);

        // Remove all segments after the given LSN.
        remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?;

        let (mut file, is_partial) = self.open_or_create(segno).await?;

        // Fill end with zeroes
        file.seek(SeekFrom::Start(xlogoff as u64)).await?;
        write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?;
        self.fdatasync_file(&file).await?;

        if !is_partial {
            // Make segment partial once again
            let (wal_file_path, wal_file_partial_path) =
                wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
            fs::rename(wal_file_path, wal_file_partial_path).await?;
        }

        // Update LSNs
        self.write_lsn = end_pos;
        self.write_record_lsn = end_pos;
        self.flush_record_lsn = end_pos;
        self.is_truncated_after_restart = true;
        Ok(())
    }

    /// Build a future removing all segments <= segno_up_to; captures owned
    /// copies of path/seg size so it can run without holding the timeline lock.
    fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
        let timeline_dir = self.timeline_dir.clone();
        let wal_seg_size = self.wal_seg_size;
        Box::pin(async move {
            remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to).await
        })
    }

    fn close(&mut self) {
        // close happens in destructor
        let _open_file = self.file.take();
    }

    fn get_metrics(&self) -> WalStorageMetrics {
        self.metrics.clone()
    }
}
|
|
|
|
/// Remove all WAL segments in timeline_dir that match the given predicate.
|
|
async fn remove_segments_from_disk(
|
|
timeline_dir: &Utf8Path,
|
|
wal_seg_size: usize,
|
|
remove_predicate: impl Fn(XLogSegNo) -> bool,
|
|
) -> Result<()> {
|
|
let mut n_removed = 0;
|
|
let mut min_removed = u64::MAX;
|
|
let mut max_removed = u64::MIN;
|
|
|
|
let mut entries = fs::read_dir(timeline_dir).await?;
|
|
while let Some(entry) = entries.next_entry().await? {
|
|
let entry_path = entry.path();
|
|
let fname = entry_path.file_name().unwrap();
|
|
|
|
if let Some(fname_str) = fname.to_str() {
|
|
/* Ignore files that are not XLOG segments */
|
|
if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) {
|
|
continue;
|
|
}
|
|
let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
|
|
if remove_predicate(segno) {
|
|
remove_file(entry_path).await?;
|
|
n_removed += 1;
|
|
min_removed = min(min_removed, segno);
|
|
max_removed = max(max_removed, segno);
|
|
REMOVED_WAL_SEGMENTS.inc();
|
|
}
|
|
}
|
|
}
|
|
|
|
if n_removed > 0 {
|
|
info!(
|
|
"removed {} WAL segments [{}; {}]",
|
|
n_removed, min_removed, max_removed
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Reader of WAL for a single timeline. Serves reads from local disk when the
/// data is available there, optionally falling back to remote storage (S3),
/// and synthesizes a zeroed segment prefix for positions before
/// timeline_start_lsn.
pub struct WalReader {
    // Remote (S3) path of this timeline, used for remote reads.
    remote_path: RemotePath,
    // Local directory holding this timeline's WAL segment files.
    timeline_dir: Utf8PathBuf,
    // Size of WAL segment in bytes.
    wal_seg_size: usize,
    // Current read position; advanced by read().
    pos: Lsn,
    // Cached reader for the currently open segment; dropped at segment boundary.
    wal_segment: Option<Pin<Box<dyn AsyncRead + Send + Sync>>>,

    // S3 will be used to read WAL if LSN is not available locally
    enable_remote_read: bool,

    // We don't have WAL locally if LSN is less than local_start_lsn
    local_start_lsn: Lsn,
    // We will respond with zero-ed bytes before this Lsn as long as
    // pos is in the same segment as timeline_start_lsn.
    timeline_start_lsn: Lsn,
    // integer version number of PostgreSQL, e.g. 14; 15; 16
    pg_version: u32,
    // PostgreSQL system identifier, used when generating the synthetic first segment.
    system_id: SystemId,
    // Lazily generated zero-filled first segment, cached between read() calls.
    timeline_start_segment: Option<Bytes>,
}
|
|
|
|
impl WalReader {
    /// Create a reader positioned at start_pos. Fails if persistent state is
    /// uninitialized or start_pos precedes the timeline's first segment.
    pub fn new(
        ttid: &TenantTimelineId,
        timeline_dir: Utf8PathBuf,
        state: &TimelinePersistentState,
        start_pos: Lsn,
        enable_remote_read: bool,
    ) -> Result<Self> {
        if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
            bail!("state uninitialized, no data to read");
        }

        // TODO: Upgrade to bail!() once we know this couldn't possibly happen
        if state.timeline_start_lsn == Lsn(0) {
            warn!("timeline_start_lsn uninitialized before initializing wal reader");
        }

        // Reads before timeline_start_lsn are only possible within its first
        // segment (served with zero padding, see read()).
        if start_pos
            < state
                .timeline_start_lsn
                .segment_lsn(state.server.wal_seg_size as usize)
        {
            bail!(
                "Requested streaming from {}, which is before the start of the timeline {}, and also doesn't start at the first segment of that timeline",
                start_pos,
                state.timeline_start_lsn
            );
        }

        Ok(Self {
            remote_path: remote_timeline_path(ttid)?,
            timeline_dir,
            wal_seg_size: state.server.wal_seg_size as usize,
            pos: start_pos,
            wal_segment: None,
            enable_remote_read,
            local_start_lsn: state.local_start_lsn,
            timeline_start_lsn: state.timeline_start_lsn,
            pg_version: state.server.pg_version / 10000,
            system_id: state.server.system_id,
            timeline_start_segment: None,
        })
    }

    /// Read WAL at current position into provided buf, returns number of bytes
    /// read. It can be smaller than buf size only if segment boundary is
    /// reached.
    pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
        // If this timeline is new, we may not have a full segment yet, so
        // we pad the first bytes of the timeline's first WAL segment with 0s
        if self.pos < self.timeline_start_lsn {
            debug_assert_eq!(
                self.pos.segment_number(self.wal_seg_size),
                self.timeline_start_lsn.segment_number(self.wal_seg_size)
            );

            // All bytes after timeline_start_lsn are in WAL, but those before
            // are not, so we manually construct an empty segment for the bytes
            // not available in this timeline.
            if self.timeline_start_segment.is_none() {
                let it = postgres_ffi::generate_wal_segment(
                    self.timeline_start_lsn.segment_number(self.wal_seg_size),
                    self.system_id,
                    self.pg_version,
                    self.timeline_start_lsn,
                )?;
                self.timeline_start_segment = Some(it);
            }

            assert!(self.timeline_start_segment.is_some());
            let segment = self.timeline_start_segment.take().unwrap();

            let seg_bytes = &segment[..];

            // How much of the current segment have we already consumed?
            let pos_seg_offset = self.pos.segment_offset(self.wal_seg_size);

            // How many bytes may we consume in total?
            let tl_start_seg_offset = self.timeline_start_lsn.segment_offset(self.wal_seg_size);

            debug_assert!(seg_bytes.len() > pos_seg_offset);
            debug_assert!(seg_bytes.len() > tl_start_seg_offset);

            // Copy as many bytes as possible into the buffer
            let len = (tl_start_seg_offset - pos_seg_offset).min(buf.len());
            buf[0..len].copy_from_slice(&seg_bytes[pos_seg_offset..pos_seg_offset + len]);

            self.pos += len as u64;

            // If we're done with the segment, we can release its memory.
            // However, if we're not yet done, store it so that we don't have to
            // construct the segment the next time this function is called.
            if self.pos < self.timeline_start_lsn {
                self.timeline_start_segment = Some(segment);
            }

            return Ok(len);
        }

        // Take the cached segment reader, or open the segment at current pos.
        let mut wal_segment = match self.wal_segment.take() {
            Some(reader) => reader,
            None => self.open_segment().await?,
        };

        // How much to read and send in message? We cannot cross the WAL file
        // boundary, and we don't want send more than provided buffer.
        let xlogoff = self.pos.segment_offset(self.wal_seg_size);
        let send_size = min(buf.len(), self.wal_seg_size - xlogoff);

        // Read some data from the file.
        let buf = &mut buf[0..send_size];
        let send_size = wal_segment.read_exact(buf).await?;
        self.pos += send_size as u64;

        // Decide whether to reuse this file. If we don't set wal_segment here
        // a new reader will be opened next time.
        if self.pos.segment_offset(self.wal_seg_size) != 0 {
            self.wal_segment = Some(wal_segment);
        }

        Ok(send_size)
    }

    /// Open WAL segment at the current position of the reader. Tries the local
    /// file first (when pos >= local_start_lsn), then falls back to remote
    /// storage if enabled; the returned reader is positioned at the segment
    /// offset of `pos`.
    async fn open_segment(&self) -> Result<Pin<Box<dyn AsyncRead + Send + Sync>>> {
        let xlogoff = self.pos.segment_offset(self.wal_seg_size);
        let segno = self.pos.segment_number(self.wal_seg_size);
        let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
        let wal_file_path = self.timeline_dir.join(&wal_file_name);

        // Try to open local file, if we may have WAL locally
        if self.pos >= self.local_start_lsn {
            let res = Self::open_wal_file(&wal_file_path).await;
            match res {
                Ok(mut file) => {
                    file.seek(SeekFrom::Start(xlogoff as u64)).await?;
                    return Ok(Box::pin(file));
                }
                Err(e) => {
                    // Only a NotFound anywhere in the error chain is tolerated;
                    // any other I/O error is propagated to the caller.
                    let is_not_found = e.chain().any(|e| {
                        if let Some(e) = e.downcast_ref::<io::Error>() {
                            e.kind() == io::ErrorKind::NotFound
                        } else {
                            false
                        }
                    });
                    if !is_not_found {
                        return Err(e);
                    }
                    // NotFound is expected, fall through to remote read
                }
            };
        }

        // Try to open remote file, if remote reads are enabled
        if self.enable_remote_read {
            let remote_wal_file_path = self.remote_path.join(&wal_file_name);
            return read_object(&remote_wal_file_path, xlogoff as u64).await;
        }

        bail!("WAL segment is not found")
    }

    /// Helper function for opening a wal file. Prefers the ".partial" variant,
    /// falling back to the completed segment name.
    async fn open_wal_file(wal_file_path: &Utf8Path) -> Result<tokio::fs::File> {
        // First try to open the .partial file.
        let mut partial_path = wal_file_path.to_owned();
        partial_path.set_extension("partial");
        if let Ok(opened_file) = tokio::fs::File::open(&partial_path).await {
            return Ok(opened_file);
        }

        // If that failed, try it without the .partial extension.
        tokio::fs::File::open(&wal_file_path)
            .await
            .with_context(|| format!("Failed to open WAL file {:?}", wal_file_path))
            .map_err(|e| {
                warn!("{}", e);
                e
            })
    }
}
|
|
|
|
/// Zero block for filling created WAL segments (one WAL page of zeros,
/// written repeatedly by write_zeroes()).
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
|
|
|
/// Helper for filling file with zeroes.
|
|
async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
|
|
fail::fail_point!("sk-write-zeroes", |_| {
|
|
info!("write_zeroes hit failpoint");
|
|
Err(anyhow::anyhow!("failpoint: sk-write-zeroes"))
|
|
});
|
|
|
|
while count >= XLOG_BLCKSZ {
|
|
file.write_all(ZERO_BLOCK).await?;
|
|
count -= XLOG_BLCKSZ;
|
|
}
|
|
file.write_all(&ZERO_BLOCK[0..count]).await?;
|
|
file.flush().await?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Helper returning full path to WAL segment file and its .partial brother.
|
|
pub fn wal_file_paths(
|
|
timeline_dir: &Utf8Path,
|
|
segno: XLogSegNo,
|
|
wal_seg_size: usize,
|
|
) -> Result<(Utf8PathBuf, Utf8PathBuf)> {
|
|
let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size);
|
|
let wal_file_path = timeline_dir.join(wal_file_name.clone());
|
|
let wal_file_partial_path = timeline_dir.join(wal_file_name + ".partial");
|
|
Ok((wal_file_path, wal_file_partial_path))
|
|
}
|