Compare commits

..

21 Commits

Author SHA1 Message Date
Alexey Kondratov
58ee590bfd cargo fmt 2021-10-20 14:01:00 +03:00
Alexey Kondratov
78773ee2c1 Bump vendor/postgres 2021-10-20 13:59:42 +03:00
Konstantin Knizhnik
a6d096a874 Merge with main 2021-10-20 13:19:50 +03:00
Konstantin Knizhnik
1ae336c356 Fix type in comment 2021-10-19 17:42:29 +03:00
Konstantin Knizhnik
57da296356 Fix race condition in add_replica 2021-10-19 12:01:24 +03:00
Konstantin Knizhnik
4ee30b6c36 Merge with main 2021-10-08 14:47:00 +03:00
Konstantin Knizhnik
4d90f2f316 Add separate back pressure threashold for write/flush LSNs 2021-09-30 19:55:38 +03:00
Konstantin Knizhnik
fc774c819e MReduce default checkpoint distance to 64Mb 2021-09-30 17:32:57 +03:00
Konstantin Knizhnik
04a6652f16 Bump postgres version 2021-09-22 16:02:12 +03:00
Konstantin Knizhnik
25ca847295 Remove extra tracing 2021-09-22 11:07:27 +03:00
Konstantin Knizhnik
b472e28f3a Fix indentation 2021-09-22 10:40:31 +03:00
Konstantin Knizhnik
e28960b816 Fix zerialization of KeepAlive message 2021-09-22 10:40:31 +03:00
Konstantin Knizhnik
379f6b8638 Fix clippy errors 2021-09-22 10:40:31 +03:00
Konstantin Knizhnik
018408e0af Fix clippy error 2021-09-22 10:40:31 +03:00
Konstantin Knizhnik
bf5d17cbaa Request disk consistent LSN through safekeeper 2021-09-22 10:40:31 +03:00
Konstantin Knizhnik
ae2232641d Eliminate warnings 2021-09-22 10:40:22 +03:00
Konstantin Knizhnik
891633f2cd Bump postgres version 2021-09-22 10:36:14 +03:00
Konstantin Knizhnik
6056ce7602 Implement backpressure for compute node to avoid WAL overflow 2021-09-22 10:36:14 +03:00
Konstantin Knizhnik
e53dd5240a Adjust default parameters for chekpointer to avoid OOM 2021-09-22 10:35:17 +03:00
Konstantin Knizhnik
611abc5299 Use checkpoint_distance instead of OLDEST_INMEM_DISTANCE 2021-09-22 10:35:17 +03:00
Konstantin Knizhnik
09cced0855 Delay adding new layers if checpointing is too slow to avoid OOM 2021-09-22 10:35:17 +03:00
31 changed files with 2702 additions and 3238 deletions

76
Cargo.lock generated
View File

@@ -180,9 +180,9 @@ dependencies = [
[[package]]
name = "bitflags"
version = "1.3.2"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "bitvec"
@@ -338,7 +338,7 @@ dependencies = [
"bytes",
"hex",
"lazy_static",
"nix 0.20.0",
"nix",
"pageserver",
"postgres",
"postgres_ffi",
@@ -886,9 +886,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.109"
version = "0.2.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f98a04dce437184842841303488f70d0188c5f51437d2a834dc097eafa909a01"
checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21"
[[package]]
name = "libloading"
@@ -902,9 +902,9 @@ dependencies = [
[[package]]
name = "lock_api"
version = "0.4.5"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109"
checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb"
dependencies = [
"scopeguard",
]
@@ -918,15 +918,6 @@ dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "lz4_flex"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "177c079243f6867429aca5af5053747f57e329d44f0c58bebca078cd14873ec2"
dependencies = [
"twox-hash",
]
[[package]]
name = "matchers"
version = "0.0.1"
@@ -1052,19 +1043,6 @@ dependencies = [
"libc",
]
[[package]]
name = "nix"
version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f305c2c2e4c39a82f7bf0bf65fb557f9070ce06781d4f2454295cc34b1c43188"
dependencies = [
"bitflags",
"cc",
"cfg-if 1.0.0",
"libc",
"memoffset",
]
[[package]]
name = "nom"
version = "6.1.2"
@@ -1202,9 +1180,6 @@ dependencies = [
"hyper",
"lazy_static",
"log",
"lz4_flex",
"nix 0.23.0",
"parking_lot",
"postgres",
"postgres-protocol",
"postgres-types",
@@ -1223,16 +1198,15 @@ dependencies = [
"toml",
"tracing",
"workspace_hack",
"yakv",
"zenith_metrics",
"zenith_utils",
]
[[package]]
name = "parking_lot"
version = "0.11.2"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb"
dependencies = [
"instant",
"lock_api",
@@ -1241,9 +1215,9 @@ dependencies = [
[[package]]
name = "parking_lot_core"
version = "0.8.5"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216"
checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018"
dependencies = [
"cfg-if 1.0.0",
"instant",
@@ -1934,12 +1908,6 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "stringprep"
version = "0.1.2"
@@ -2260,16 +2228,6 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"
[[package]]
name = "twox-hash"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f559b464de2e2bdabcac6a210d12e9b5a5973c251e102c44c585c71d51bd78e"
dependencies = [
"cfg-if 1.0.0",
"static_assertions",
]
[[package]]
name = "typenum"
version = "1.13.0"
@@ -2584,17 +2542,6 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"
[[package]]
name = "yakv"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17eba1abb31dda774cd901a9692a47aac716050975a30993c79826a08de47a34"
dependencies = [
"anyhow",
"fs2",
"parking_lot",
]
[[package]]
name = "zenith"
version = "0.1.0"
@@ -2634,7 +2581,6 @@ dependencies = [
"jsonwebtoken",
"lazy_static",
"log",
"nix 0.23.0",
"postgres",
"rand",
"routerify",

View File

@@ -15,7 +15,3 @@ members = [
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
debug = true
panic = 'abort'
[profile.dev]
panic = 'abort'

View File

@@ -298,10 +298,9 @@ impl PostgresNode {
conf.append("max_replication_slots", "10");
conf.append("hot_standby", "on");
conf.append("shared_buffers", "1MB");
conf.append("max_wal_size", "100GB");
conf.append("fsync", "off");
conf.append("max_connections", "100");
conf.append("wal_sender_timeout", "0");
conf.append("wal_sender_timeout", "10s");
conf.append("wal_level", "replica");
conf.append("listen_addresses", &self.address.ip().to_string());
conf.append("port", &self.address.port().to_string());

View File

@@ -37,16 +37,11 @@ async-trait = "0.1"
const_format = "0.2.21"
tracing = "0.1.27"
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
nix = "0.23"
#yakv = { path = "../../yakv" }
yakv = "0.2.7"
lz4_flex = "0.9.0"
postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }
parking_lot = "0.11.2"
[dev-dependencies]
hex-literal = "0.3"

View File

@@ -0,0 +1,25 @@
//! Main entry point for the dump_layerfile executable
//!
//! A handy tool for debugging, that's all.
use anyhow::Result;
use clap::{App, Arg};
use pageserver::layered_repository::dump_layerfile_from_path;
use std::path::PathBuf;
fn main() -> Result<()> {
let arg_matches = App::new("Zenith dump_layerfile utility")
.about("Dump contents of one layer file, for debugging")
.arg(
Arg::with_name("path")
.help("Path to file to dump")
.required(true)
.index(1),
)
.get_matches();
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
dump_layerfile_from_path(&path)?;
Ok(())
}

View File

@@ -42,9 +42,6 @@ struct CfgFileParams {
listen_http_addr: Option<String>,
checkpoint_distance: Option<String>,
checkpoint_period: Option<String>,
upload_distance: Option<String>,
upload_period: Option<String>,
reconstruct_threshold: Option<String>,
gc_horizon: Option<String>,
gc_period: Option<String>,
pg_distrib_dir: Option<String>,
@@ -106,9 +103,6 @@ impl CfgFileParams {
listen_http_addr: get_arg("listen-http"),
checkpoint_distance: get_arg("checkpoint_distance"),
checkpoint_period: get_arg("checkpoint_period"),
upload_distance: get_arg("upload_distance"),
upload_period: get_arg("upload_period"),
reconstruct_threshold: get_arg("reconstruct_threshold"),
gc_horizon: get_arg("gc_horizon"),
gc_period: get_arg("gc_period"),
pg_distrib_dir: get_arg("postgres-distrib"),
@@ -127,9 +121,6 @@ impl CfgFileParams {
listen_http_addr: self.listen_http_addr.or(other.listen_http_addr),
checkpoint_distance: self.checkpoint_distance.or(other.checkpoint_distance),
checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
upload_distance: self.upload_distance.or(other.upload_distance),
upload_period: self.upload_period.or(other.upload_period),
reconstruct_threshold: self.reconstruct_threshold.or(other.reconstruct_threshold),
gc_horizon: self.gc_horizon.or(other.gc_horizon),
gc_period: self.gc_period.or(other.gc_period),
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
@@ -167,20 +158,6 @@ impl CfgFileParams {
None => DEFAULT_CHECKPOINT_PERIOD,
};
let upload_distance: u64 = match self.upload_distance.as_ref() {
Some(upload_distance_str) => upload_distance_str.parse()?,
None => DEFAULT_UPLOAD_DISTANCE,
};
let upload_period = match self.upload_period.as_ref() {
Some(upload_period_str) => humantime::parse_duration(upload_period_str)?,
None => DEFAULT_UPLOAD_PERIOD,
};
let reconstruct_threshold: u64 = match self.reconstruct_threshold.as_ref() {
Some(reconstruct_threshold_str) => reconstruct_threshold_str.parse()?,
None => DEFAULT_RECONSTRUCT_THRESHOLD,
};
let gc_horizon: u64 = match self.gc_horizon.as_ref() {
Some(horizon_str) => horizon_str.parse()?,
None => DEFAULT_GC_HORIZON,
@@ -259,9 +236,6 @@ impl CfgFileParams {
listen_http_addr,
checkpoint_distance,
checkpoint_period,
upload_distance,
upload_period,
reconstruct_threshold,
gc_horizon,
gc_period,
@@ -322,24 +296,6 @@ fn main() -> Result<()> {
.takes_value(true)
.help("Interval between checkpoint iterations"),
)
.arg(
Arg::with_name("upload_distance")
.long("upload_distance")
.takes_value(true)
.help("Distance from current LSN to perform checkpoint of in-memory layers"),
)
.arg(
Arg::with_name("upload_period")
.long("upload_period")
.takes_value(true)
.help("Interval between upload iterations"),
)
.arg(
Arg::with_name("reconstruct_threshold")
.long("reconstruct_threshold")
.takes_value(true)
.help("Minimal size of deltas after which page reconstruction (materialization) can be performed"),
)
.arg(
Arg::with_name("gc_horizon")
.long("gc_horizon")
@@ -644,9 +600,6 @@ mod tests {
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
upload_distance: Some("upload_distance_VALUE".to_string()),
upload_period: Some("upload_period_VALUE".to_string()),
reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
@@ -670,9 +623,6 @@ mod tests {
listen_http_addr = 'listen_http_addr_VALUE'
checkpoint_distance = 'checkpoint_distance_VALUE'
checkpoint_period = 'checkpoint_period_VALUE'
upload_distance = 'upload_distance_VALUE'
upload_period = 'upload_period_VALUE'
reconstruct_threshold = 'reconstruct_threshold_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
@@ -707,9 +657,6 @@ local_path = 'relish_storage_local_VALUE'
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
upload_distance: Some("upload_distance_VALUE".to_string()),
upload_period: Some("upload_period_VALUE".to_string()),
reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
@@ -736,9 +683,6 @@ local_path = 'relish_storage_local_VALUE'
listen_http_addr = 'listen_http_addr_VALUE'
checkpoint_distance = 'checkpoint_distance_VALUE'
checkpoint_period = 'checkpoint_period_VALUE'
upload_distance = 'upload_distance_VALUE'
upload_period = 'upload_period_VALUE'
reconstruct_threshold = 'reconstruct_threshold_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'

View File

@@ -147,7 +147,7 @@ pub fn create_repo(
let tli = create_timeline(conf, None, &tenantid)?;
let repo = Arc::new(crate::buffered_repository::BufferedRepository::new(
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
conf,
wal_redo_manager,
tenantid,
@@ -182,7 +182,6 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
let initdb_output = Command::new(initdb_path)
.args(&["-D", initdbpath.to_str().unwrap()])
.args(&["-U", &conf.superuser])
.args(&["-E", "utf8"])
.arg("--no-instructions")
// This is only used for a temporary installation that is deleted shortly after,
// so no need to fsync it

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -39,8 +39,9 @@
//!
use crate::layered_repository::blob::BlobWriter;
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
use crate::repository::{PageReconstructData, PageReconstructResult, PageVersion};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
};
use crate::waldecoder;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
@@ -147,10 +148,6 @@ pub struct DeltaLayerInner {
}
impl Layer for DeltaLayer {
fn get_tenant_id(&self) -> ZTenantId {
self.tenantid
}
fn get_timeline_id(&self) -> ZTimelineId {
self.timelineid
}
@@ -204,22 +201,22 @@ impl Layer for DeltaLayer {
for ((_blknum, pv_lsn), blob_range) in iter {
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
match pv {
PageVersion::Page(img) => {
// Found a page image, return it
reconstruct_data.page_img = Some(img);
if let Some(img) = pv.page_image {
// Found a page image, return it
reconstruct_data.page_img = Some(img);
need_image = false;
break;
} else if let Some(rec) = pv.record {
let will_init = rec.will_init;
reconstruct_data.records.push((*pv_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
PageVersion::Wal(rec) => {
let will_init = rec.will_init;
reconstruct_data.records.push((*pv_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
}
@@ -229,7 +226,7 @@ impl Layer for DeltaLayer {
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
Ok(PageReconstructResult::Continue(self.start_lsn))
} else {
Ok(PageReconstructResult::Complete)
}
@@ -310,22 +307,19 @@ impl Layer for DeltaLayer {
let buf = read_blob(&chapter, blob_range)?;
let pv = PageVersion::des(&buf)?;
match pv {
PageVersion::Page(img) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
PageVersion::Wal(rec) => {
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
rec.rec.len(),
rec.will_init,
wal_desc
)?;
}
if let Some(img) = pv.page_image.as_ref() {
write!(&mut desc, " img {} bytes", img.len())?;
}
if let Some(rec) = pv.record.as_ref() {
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
rec.rec.len(),
rec.will_init,
wal_desc
)?;
}
println!(" blk {} at {}: {}", blk, lsn, desc);
}
@@ -334,19 +328,6 @@ impl Layer for DeltaLayer {
}
impl DeltaLayer {
/// debugging function to print out the contents of the layer
pub fn versions(&self) -> Result<Vec<(u32, Lsn, PageVersion)>> {
let mut versions: Vec<(u32, Lsn, PageVersion)> = Vec::new();
let inner = self.load()?;
let (_path, book) = self.open_book()?;
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
let buf = read_blob(&chapter, blob_range)?;
versions.push((*blk, *lsn, PageVersion::des(&buf)?));
}
Ok(versions)
}
fn path_for(
path_or_conf: &PathOrConf,
timelineid: ZTimelineId,
@@ -379,7 +360,6 @@ impl DeltaLayer {
dropped: bool,
page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
relsizes: VecMap<Lsn, u32>,
nosync: bool,
) -> Result<DeltaLayer> {
if seg.rel.is_blocky() {
assert!(!relsizes.is_empty());
@@ -452,9 +432,8 @@ impl DeltaLayer {
// This flushes the underlying 'buf_writer'.
let writer = book.close()?;
if !nosync {
writer.get_ref().sync_all()?;
}
writer.get_ref().sync_all()?;
trace!("saved {}", &path.display());
drop(inner);
@@ -463,7 +442,12 @@ impl DeltaLayer {
}
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
let path = self.path();
let path = Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&self.layer_name(),
);
let file = File::open(&path)?;
let book = Book::new(file)?;

View File

@@ -13,6 +13,8 @@ use anyhow::Result;
use log::*;
use zenith_utils::lsn::Lsn;
use super::METADATA_FILE_NAME;
// Note: LayeredTimeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct DeltaFileName {
@@ -290,7 +292,7 @@ pub fn list_files(
deltafiles.push(deltafilename);
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
imgfiles.push(imgfilename);
} else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") {
} else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
// ignore these
} else {
warn!("unrecognized filename in timeline dir: {}", fname);

View File

@@ -22,8 +22,11 @@
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//!
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{Layer, SegmentTag, RELISH_SEG_SIZE};
use crate::repository::{PageReconstructData, PageReconstructResult};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, SegmentTag,
};
use crate::layered_repository::LayeredTimeline;
use crate::layered_repository::RELISH_SEG_SIZE;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{anyhow, bail, ensure, Result};
@@ -114,10 +117,6 @@ impl Layer for ImageLayer {
PathBuf::from(self.layer_name().to_string())
}
fn get_tenant_id(&self) -> ZTenantId {
self.tenantid
}
fn get_timeline_id(&self) -> ZTimelineId {
self.timelineid
}
@@ -251,14 +250,13 @@ impl ImageLayer {
}
/// Create a new image file, using the given array of pages.
pub fn create(
fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
lsn: Lsn,
base_images: Vec<Bytes>,
nosync: bool,
) -> Result<ImageLayer> {
let image_type = if seg.rel.is_blocky() {
let num_blocks: u32 = base_images.len().try_into()?;
@@ -318,9 +316,8 @@ impl ImageLayer {
// This flushes the underlying 'buf_writer'.
let writer = book.close()?;
if !nosync {
writer.get_ref().sync_all()?;
}
writer.get_ref().sync_all()?;
trace!("saved {}", path.display());
drop(inner);
@@ -328,7 +325,6 @@ impl ImageLayer {
Ok(layer)
}
/*
// Create a new image file by materializing every page in a source layer
// at given LSN.
pub fn create_from_src(
@@ -366,7 +362,6 @@ impl ImageLayer {
Self::create(conf, timelineid, timeline.tenantid, seg, lsn, base_images)
}
*/
///
/// Load the contents of the file into memory

View File

@@ -3,9 +3,10 @@
//!
use crate::relish::RelishTag;
use crate::repository::{PageReconstructData, PageReconstructResult};
use crate::{ZTenantId, ZTimelineId};
use crate::repository::WALRecord;
use crate::ZTimelineId;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::path::PathBuf;
@@ -44,6 +45,56 @@ impl SegmentTag {
}
}
///
/// Represents a version of a page at a specific LSN. The LSN is the key of the
/// entry in the 'page_versions' hash, it is not duplicated here.
///
/// A page version can be stored as a full page image, or as WAL record that needs
/// to be applied over the previous page version to reconstruct this version.
///
/// It's also possible to have both a WAL record and a page image in the same
/// PageVersion. That happens if page version is originally stored as a WAL record
/// but it is later reconstructed by a GetPage@LSN request by performing WAL
/// redo. The get_page_at_lsn() code will store the reconstructed pag image next to
/// the WAL record in that case. TODO: That's pretty accidental, not the result
/// of any grand design. If we want to keep reconstructed page versions around, we
/// probably should have a separate buffer cache so that we could control the
/// replacement policy globally. Or if we keep a reconstructed page image, we
/// could throw away the WAL record.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageVersion {
/// an 8kb page image
pub page_image: Option<Bytes>,
/// WAL record to get from previous page version to this one.
pub record: Option<WALRecord>,
}
///
/// Data needed to reconstruct a page version
///
/// 'page_img' is the old base image of the page to start the WAL replay with.
/// It can be None, if the first WAL record initializes the page (will_init)
/// 'records' contains the records to apply over the base image.
///
pub struct PageReconstructData {
pub records: Vec<(Lsn, WALRecord)>,
pub page_img: Option<Bytes>,
}
/// Return value from Layer::get_page_reconstruct_data
pub enum PageReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue(Lsn),
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing(Lsn),
}
///
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
@@ -53,8 +104,6 @@ impl SegmentTag {
/// in-memory and on-disk layers.
///
pub trait Layer: Send + Sync {
fn get_tenant_id(&self) -> ZTenantId;
/// Identify the timeline this relish belongs to
fn get_timeline_id(&self) -> ZTimelineId;

View File

@@ -9,7 +9,6 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
pub mod basebackup;
pub mod branches;
pub mod buffered_repository;
pub mod http;
pub mod layered_repository;
pub mod page_service;
@@ -18,7 +17,6 @@ pub mod relish_storage;
pub mod repository;
pub mod restore_local_repo;
pub mod tenant_mgr;
pub mod toast_store;
pub mod waldecoder;
pub mod walreceiver;
pub mod walredo;
@@ -32,17 +30,14 @@ pub mod defaults {
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
// Minimal size of WAL records chain to trigger materialization of the page
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
// would be more appropriate. But a low value forces the code to be exercised more,
// which is good for now to trigger bugs.
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);
pub const DEFAULT_UPLOAD_DISTANCE: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_UPLOAD_PERIOD: Duration = Duration::from_secs(3600);
pub const DEFAULT_RECONSTRUCT_THRESHOLD: u64 = 0;
pub const DEFAULT_GC_HORIZON: u64 = 1024;
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
@@ -69,9 +64,6 @@ pub struct PageServerConf {
// page server crashes.
pub checkpoint_distance: u64,
pub checkpoint_period: Duration,
pub upload_period: Duration,
pub upload_distance: u64,
pub reconstruct_threshold: u64,
pub gc_horizon: u64,
pub gc_period: Duration,
@@ -157,9 +149,6 @@ impl PageServerConf {
daemonize: false,
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_period: Duration::from_secs(10),
upload_distance: defaults::DEFAULT_UPLOAD_DISTANCE,
upload_period: defaults::DEFAULT_UPLOAD_PERIOD,
reconstruct_threshold: defaults::DEFAULT_RECONSTRUCT_THRESHOLD,
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),

View File

@@ -693,21 +693,67 @@ impl postgres_backend::Handler for PageServerHandler {
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"meta_total"),
RowDescriptor::int8_col(b"meta_removed"),
RowDescriptor::int8_col(b"meta_dropped"),
RowDescriptor::int8_col(b"pages_total"),
RowDescriptor::int8_col(b"pages_removed"),
RowDescriptor::int8_col(b"pages_dropped"),
RowDescriptor::int8_col(b"layer_relfiles_total"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(result.meta_total.to_string().as_bytes()),
Some(result.meta_removed.to_string().as_bytes()),
Some(result.meta_dropped.to_string().as_bytes()),
Some(result.pages_total.to_string().as_bytes()),
Some(result.pages_removed.to_string().as_bytes()),
Some(result.pages_dropped.to_string().as_bytes()),
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
result
.ondisk_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
result
.ondisk_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;

View File

@@ -44,27 +44,45 @@ pub trait Repository: Send + Sync {
///
/// Result of performing GC
///
#[derive(Default, Debug)]
#[derive(Default)]
pub struct GcResult {
pub meta_removed: u64, // removed versions beyond PITR interval for which new page image exists
pub meta_dropped: u64, // removed versions beyond PITR interval of dropped relations
pub meta_total: u64, // total number of metaobject version histories
pub ondisk_relfiles_total: u64,
pub ondisk_relfiles_needed_by_cutoff: u64,
pub ondisk_relfiles_needed_by_branches: u64,
pub ondisk_relfiles_not_updated: u64,
pub ondisk_relfiles_needed_as_tombstone: u64,
pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped
pub pages_removed: u64, // removed versions beyond PITR interval for which new page image exists
pub pages_dropped: u64, // removed versions beyond PITR interval of dropped relations
pub pages_total: u64, // total number of page vaersion histories
pub ondisk_nonrelfiles_total: u64,
pub ondisk_nonrelfiles_needed_by_cutoff: u64,
pub ondisk_nonrelfiles_needed_by_branches: u64,
pub ondisk_nonrelfiles_not_updated: u64,
pub ondisk_nonrelfiles_needed_as_tombstone: u64,
pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped
pub elapsed: Duration,
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.meta_total += other.meta_total;
self.meta_removed += other.meta_removed;
self.meta_dropped += other.meta_dropped;
self.pages_total += other.pages_total;
self.pages_removed += other.pages_removed;
self.pages_dropped += other.pages_dropped;
self.ondisk_relfiles_total += other.ondisk_relfiles_total;
self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;
self.ondisk_nonrelfiles_total += other.ondisk_nonrelfiles_total;
self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone;
self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;
self.elapsed += other.elapsed;
}
}
@@ -93,18 +111,14 @@ pub trait Timeline: Send + Sync {
/// Get a list of all existing relations
/// Pass RelTag to get relation objects or None to get nonrels.
fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashSet<RelishTag>>;
/// Get a list of all existing relations in given tablespace and database.
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelishTag>>;
/// Get a list of all existing non-relational objects
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
///
/// Export data as delats and image layers between 'start_lsn' to 'end_lsn'. The
/// start is inclusive, and end is exclusive.
///
fn export_timeline(&self, start_lsn: Lsn, end_lsn: Lsn) -> Result<()>;
/// Get the LSN where this branch was created
fn get_ancestor_lsn(&self) -> Lsn;
@@ -120,6 +134,7 @@ pub trait Timeline: Send + Sync {
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
fn get_disk_consistent_lsn(&self) -> Lsn;
/// Mutate the timeline with a [`TimelineWriter`].
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
@@ -167,16 +182,6 @@ pub trait TimelineWriter: Deref<Target = dyn Timeline> {
/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
/// Previous last record LSN is stored alongside the latest and can be read.
fn advance_last_record_lsn(&self, lsn: Lsn);
///
/// Complete all delayed commits and advance disk_consistent_lsn
///
fn checkpoint(&self) -> Result<()>;
///
/// Import data from layer files
///
fn import_timeline(&self, snapshot_lsn: Lsn) -> Result<()>;
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -209,39 +214,6 @@ impl WALRecord {
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PageVersion {
/// an 8kb page image
Page(Bytes),
/// WAL record to get from previous page version to this one.
Wal(WALRecord),
}
///
/// Data needed to reconstruct a page version
///
/// 'page_img' is the old base image of the page to start the WAL replay with.
/// It can be None, if the first WAL record initializes the page (will_init)
/// 'records' contains the records to apply over the base image.
///
pub struct PageReconstructData {
pub records: Vec<(Lsn, WALRecord)>,
pub page_img: Option<Bytes>,
}
/// Return value from Layer::get_page_reconstruct_data
pub enum PageReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue(Lsn),
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing(Lsn),
}
///
/// Tests that should work the same with any Repository/Timeline implementation.
///
@@ -249,7 +221,7 @@ pub enum PageReconstructResult {
#[cfg(test)]
mod tests {
use super::*;
use crate::buffered_repository::{BufferedRepository, METADATA_FILE_NAME};
use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME};
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use hex_literal::hex;
@@ -325,7 +297,7 @@ mod tests {
fn load(&self) -> Box<dyn Repository> {
let walredo_mgr = Arc::new(TestRedoManager);
Box::new(BufferedRepository::new(
Box::new(LayeredRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,

View File

@@ -11,7 +11,7 @@ use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use anyhow::{anyhow, bail, Result};
use bytes::{Buf, Bytes, BytesMut};
use bytes::{Buf, Bytes};
use tracing::*;
use crate::relish::*;
@@ -126,6 +126,7 @@ pub fn import_timeline_from_postgres_datadir(
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
}
// TODO: Scan pg_tblspc
writer.advance_last_record_lsn(lsn);
// Import WAL. This is needed even when starting from a shutdown checkpoint, because
@@ -139,7 +140,6 @@ pub fn import_timeline_from_postgres_datadir(
lsn,
&mut pg_control.checkPointCopy.clone(),
)?;
writer.checkpoint()?;
Ok(())
}
@@ -416,6 +416,7 @@ pub fn save_decoded_record(
if checkpoint.update_next_xid(decoded.xl_xid) {
*checkpoint_modified = true;
}
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
@@ -426,43 +427,13 @@ pub fn save_decoded_record(
forknum: blk.forknum as u8,
});
//
// Instead of storing full-page-image WAL record,
// it is better to store extracted image: we can skip wal-redo
// in this case. Also some FPI records may contain multiple (up to 32) pages,
// so them have to be copied multiple times.
//
if blk.apply_image
&& blk.has_image
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0
{
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
let img_offs = blk.bimg_offset as usize;
let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize);
image.extend_from_slice(&recdata[img_offs..img_offs + img_len]);
let rec = WALRecord {
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
if blk.hole_length != 0 {
let tail = image.split_off(blk.hole_offset as usize);
image.resize(image.len() + blk.hole_length as usize, 0u8);
image.unsplit(tail);
}
image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?;
} else {
let rec = WALRecord {
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
}
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
}
let mut buf = decoded.record.clone();

View File

@@ -2,7 +2,7 @@
//! page server.
use crate::branches;
use crate::buffered_repository::BufferedRepository;
use crate::layered_repository::LayeredRepository;
use crate::repository::{Repository, Timeline};
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
@@ -62,7 +62,6 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
uploader_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
}
@@ -99,22 +98,20 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
// Set up an object repository, for actual data storage.
let repo = Arc::new(BufferedRepository::new(
let repo = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenant_id,
true,
));
let checkpointer_handle = BufferedRepository::launch_checkpointer_thread(conf, repo.clone());
let gc_handle = BufferedRepository::launch_gc_thread(conf, repo.clone());
let uploader_handle = BufferedRepository::launch_upload_thread(conf, repo.clone());
let checkpointer_handle = LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
let gc_handle = LayeredRepository::launch_gc_thread(conf, repo.clone());
let mut handles = TENANT_HANDLES.lock().unwrap();
let h = TenantHandleEntry {
checkpointer_handle: Some(checkpointer_handle),
gc_handle: Some(gc_handle),
uploader_handle: Some(uploader_handle),
};
handles.insert(tenant_id, h);
@@ -173,8 +170,6 @@ pub fn stop_tenant_threads(tenantid: ZTenantId) {
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
debug!("checkpointer for tenant {} has stopped", tenantid);
h.uploader_handle.take().map(JoinHandle::join);
debug!("uploader for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
debug!("gc for tenant {} has stopped", tenantid);
}

View File

@@ -1,268 +0,0 @@
use anyhow::{anyhow, Result};
use lz4_flex;
use std::convert::TryInto;
use std::ops::{Bound, RangeBounds};
use std::path::Path;
use yakv::storage::{
Key,
Select,
// ReadOnlyTransaction,
Snapshot,
Storage,
StorageConfig,
StorageIterator,
Transaction,
Value,
};
const TOAST_SEGMENT_SIZE: usize = 2000;
const CACHE_SIZE: usize = 1024; // 8Mb
///
/// Toast storage consistof two KV databases: one for storing main index
/// and second for storing sliced BLOB (values larger than 2kb).
/// BLOBs and main data are stored in different databases to improve
/// data locality and reduce key size for TOAST segments.
///
pub struct ToastStore {
db: Storage, // key-value database
}
pub struct ToastIterator<'a> {
iter: StorageIterator<'a>,
}
pub struct ToastSnapshot<'a> {
// tx: ReadOnlyTransaction<'a>,
tx: Snapshot<'a>,
}
impl<'a> ToastSnapshot<'a> {
pub fn range<R: RangeBounds<Key>>(&self, range: R) -> ToastIterator<'_> {
let from = match range.start_bound() {
Bound::Included(key) => {
let mut key = key.clone();
key.extend_from_slice(&[0u8; 4]);
Bound::Included(key)
}
Bound::Excluded(key) => {
let mut key = key.clone();
key.extend_from_slice(&[0u8; 4]);
Bound::Excluded(key)
}
_ => Bound::Unbounded,
};
let till = match range.end_bound() {
Bound::Included(key) => {
let mut key = key.clone();
key.extend_from_slice(&[0xFFu8; 4]);
Bound::Included(key)
}
Bound::Excluded(key) => {
let mut key = key.clone();
key.extend_from_slice(&[0xFFu8; 4]);
Bound::Excluded(key)
}
_ => Bound::Unbounded,
};
ToastIterator {
iter: self.tx.range((from, till)),
}
}
pub fn iter(&self) -> ToastIterator<'_> {
self.range(..)
}
}
impl<'a> Iterator for ToastIterator<'a> {
type Item = Result<(Key, Value)>;
fn next(&mut self) -> Option<Self::Item> {
let mut toast: Option<Vec<u8>> = None;
let mut next_segno = 0u16;
for elem in &mut self.iter {
let res = if let Ok((key, value)) = elem {
let key_len = key.len();
let n_segments =
u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
let key = key[..key_len - 4].to_vec();
if n_segments != 0 {
// TOAST
assert_eq!(segno, next_segno);
if next_segno == 0 {
toast = Some(Vec::with_capacity(n_segments as usize * TOAST_SEGMENT_SIZE))
}
toast.as_mut().unwrap().extend_from_slice(&value);
next_segno = segno + 1;
if next_segno != n_segments {
continue;
}
let res = lz4_flex::decompress_size_prepended(&toast.unwrap());
if let Ok(decompressed_data) = res {
Ok((key, decompressed_data))
} else {
Err(anyhow!(res.unwrap_err()))
}
} else {
Ok((key, value))
}
} else {
elem
};
return Some(res);
}
assert_eq!(next_segno, 0);
None
}
}
impl<'a> DoubleEndedIterator for ToastIterator<'a> {
fn next_back(&mut self) -> Option<Self::Item> {
let mut toast: Option<Vec<u8>> = None;
let mut next_segno = 0u16;
while let Some(elem) = self.iter.next_back() {
if let Ok((key, value)) = elem {
assert!(!value.is_empty());
let key_len = key.len();
let n_segments =
u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
let key = key[..key_len - 4].to_vec();
if n_segments != 0 {
// TOAST
assert!(segno + 1 == next_segno || next_segno == 0);
if next_segno == 0 {
let len = (n_segments - 1) as usize * TOAST_SEGMENT_SIZE + value.len();
let mut vec = vec![0u8; len];
vec[len - value.len()..].copy_from_slice(&value);
toast = Some(vec);
} else {
toast.as_mut().unwrap()[segno as usize * TOAST_SEGMENT_SIZE
..(segno + 1) as usize * TOAST_SEGMENT_SIZE]
.copy_from_slice(&value);
}
next_segno = segno;
if next_segno == 0 {
let toast = toast.unwrap();
assert!(!toast.is_empty());
let res = lz4_flex::decompress_size_prepended(&toast);
return Some(if let Ok(decompressed_data) = res {
Ok((key, decompressed_data))
} else {
Err(anyhow!(res.unwrap_err()))
});
}
} else {
return Some(Ok((key, value)));
}
} else {
return Some(elem);
}
}
assert_eq!(next_segno, 0);
None
}
}
//
// FIXME-KK: not using WAL now. Implement asynchronous or delayed commit.
//
impl ToastStore {
pub fn new(path: &Path) -> Result<ToastStore> {
Ok(ToastStore {
db: Storage::open(
&path.join("pageserver.db"),
StorageConfig {
cache_size: CACHE_SIZE,
nosync: false,
},
)?,
})
}
pub fn put(&self, key: Key, value: Value) -> Result<()> {
let mut tx = self.db.start_transaction();
self.tx_remove(&mut tx, &key)?;
let value_len = value.len();
let mut key = key;
if value_len >= TOAST_SEGMENT_SIZE {
let compressed_data = lz4_flex::compress_prepend_size(&value);
let compressed_data_len = compressed_data.len();
let mut offs: usize = 0;
let mut segno = 0u16;
let n_segments =
((compressed_data_len + TOAST_SEGMENT_SIZE - 1) / TOAST_SEGMENT_SIZE) as u16;
assert!(n_segments != 0);
key.extend_from_slice(&n_segments.to_be_bytes());
key.extend_from_slice(&[0u8; 2]);
let key_len = key.len();
while offs + TOAST_SEGMENT_SIZE < compressed_data_len {
key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
tx.put(
&key,
&compressed_data[offs..offs + TOAST_SEGMENT_SIZE].to_vec(),
)?;
offs += TOAST_SEGMENT_SIZE;
segno += 1;
}
key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
tx.put(&key, &compressed_data[offs..].to_vec())?;
} else {
key.extend_from_slice(&[0u8; 4]);
tx.put(&key, &value)?;
}
tx.subcommit()?;
//tx.delay();
Ok(())
}
pub fn commit(&self) -> Result<()> {
let tx = self.db.start_transaction();
tx.commit()?;
Ok(())
}
pub fn take_snapshot(&self) -> ToastSnapshot<'_> {
ToastSnapshot {
//tx: self.db.read_only_transaction(),
tx: self.db.take_snapshot(),
}
}
pub fn remove(&self, key: Key) -> Result<()> {
let mut tx = self.db.start_transaction();
self.tx_remove(&mut tx, &key)?;
tx.subcommit()?;
//tx.delay();
Ok(())
}
pub fn tx_remove(&self, tx: &mut Transaction, key: &[u8]) -> Result<()> {
let mut min_key = key.to_vec();
let mut max_key = key.to_vec();
min_key.extend_from_slice(&[0u8; 4]);
max_key.extend_from_slice(&[0xFFu8; 4]);
let mut iter = tx.range(&min_key..&max_key);
if let Some(entry) = iter.next() {
let mut key = entry?.0;
let key_len = key.len();
let n_segments = u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
if n_segments != 0 {
// TOAST
for i in 0..n_segments {
key[key_len - 2..].copy_from_slice(&i.to_be_bytes());
tx.remove(&key)?;
}
} else {
tx.remove(&key)?;
}
}
Ok(())
}
pub fn size(&self) -> u64 {
self.db.get_database_info().db_used
}
}

View File

@@ -229,18 +229,17 @@ pub struct DecodedBkpBlock {
pub blkno: u32,
/* copy of the fork_flags field from the XLogRecordBlockHeader */
pub flags: u8,
flags: u8,
/* Information on full-page image, if any */
pub has_image: bool, /* has image, even for consistency checking */
has_image: bool, /* has image, even for consistency checking */
pub apply_image: bool, /* has image that should be restored */
pub will_init: bool, /* record doesn't need previous page version to apply */
//char *bkp_image;
pub hole_offset: u16,
pub hole_length: u16,
pub bimg_offset: u32,
pub bimg_len: u16,
pub bimg_info: u8,
hole_offset: u16,
hole_length: u16,
bimg_len: u16,
bimg_info: u8,
/* Buffer holding the rmgr-specific data associated with this block */
has_data: bool,
@@ -860,19 +859,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
}
// 3. Decode blocks.
let mut ptr = record.len() - buf.remaining();
for blk in blocks.iter_mut() {
if blk.has_image {
blk.bimg_offset = ptr as u32;
ptr += blk.bimg_len as usize;
}
if blk.has_data {
ptr += blk.data_len as usize;
}
}
// We don't need them, so just skip blocks_total_len bytes
buf.advance(blocks_total_len as usize);
assert_eq!(ptr, record.len() - buf.remaining());
let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;

View File

@@ -284,12 +284,14 @@ fn walreceiver_main(
if let Some(last_lsn) = status_update {
// TODO: More thought should go into what values are sent here.
let last_lsn = PgLsn::from(u64::from(last_lsn));
let write_lsn = last_lsn;
// We are using disk consistent LSN as `write_lsn`, i.e. LSN at which page server
// may guarantee persistence of all received data. Safekeeper is not free to remove
// WAL preceding `write_lsn`: it should not be requested by this page server.
let write_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
let flush_lsn = last_lsn;
let apply_lsn = PgLsn::from(0);
let ts = SystemTime::now();
const NO_REPLY: u8 = 0;
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
}

View File

@@ -22,26 +22,23 @@ use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use nix::poll::*;
use serde::Serialize;
use std::fs;
use std::fs::OpenOptions;
use std::io::prelude::*;
use std::io::{Error, ErrorKind};
use std::os::unix::io::AsRawFd;
use std::io::Error;
use std::path::PathBuf;
use std::process::Stdio;
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::mpsc;
use std::sync::mpsc::{Receiver, Sender, SyncSender};
use std::sync::Mutex;
use std::time::Duration;
use std::time::Instant;
use tokio::io::AsyncBufReadExt;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::{ChildStdin, ChildStdout, Command};
use tokio::time::timeout;
use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::nonblock::set_nonblock;
use zenith_utils::zid::ZTenantId;
use crate::relish::*;
@@ -56,10 +53,6 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
use postgres_ffi::pg_constants;
use postgres_ffi::XLogRecord;
const N_CHANNELS: usize = 16;
const CHANNEL_SIZE: usize = 1024 * 1024;
type ChannelId = usize;
///
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
///
@@ -137,22 +130,17 @@ lazy_static! {
///
/// This is the real implementation that uses a Postgres process to
/// perform WAL replay. I multiplex requests from multiple threads
/// using `sender` channel and send them to the postgres wal-redo process
/// pipe by separate thread. Responses are returned through set of `receivers`
/// channels, used in round robin manner. Receiver thread is protected by mutex
/// to prevent it's usage by more than one thread
/// In the future, we might want to launch a pool of processes to allow concurrent
/// replay of multiple records.
/// perform WAL replay. Only one thread can use the processs at a time,
/// that is controlled by the Mutex. In the future, we might want to
/// launch a pool of processes to allow concurrent replay of multiple
/// records.
///
pub struct PostgresRedoManager {
// mutiplexor pipe: use sync_channel to allow sharing sender by multiple threads
// and limit size of buffer
sender: SyncSender<(ChannelId, Vec<u8>)>,
// set of receiver channels
receivers: Vec<Mutex<Receiver<Bytes>>>,
// atomicly incremented counter for choosing receiver
round_robin: AtomicUsize,
tenantid: ZTenantId,
conf: &'static PageServerConf,
runtime: tokio::runtime::Runtime,
process: Mutex<Option<PostgresRedoProcess>>,
}
#[derive(Debug)]
@@ -165,13 +153,6 @@ struct WalRedoRequest {
records: Vec<(Lsn, WALRecord)>,
}
impl WalRedoRequest {
// Can this request be served by zenith redo funcitons
// or we need to pass it to wal-redo postgres process?
fn can_apply_in_zenith(&self) -> bool {
!matches!(self.rel, RelishTag::Relation(_))
}
}
/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
@@ -180,8 +161,6 @@ pub enum WalRedoError {
#[error("cannot perform WAL redo now")]
InvalidState,
#[error("cannot perform WAL redo for this request")]
InvalidRequest,
}
///
@@ -202,6 +181,10 @@ impl WalRedoManager for PostgresRedoManager {
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let start_time;
let lock_time;
let end_time;
let request = WalRedoRequest {
rel,
blknum,
@@ -209,14 +192,28 @@ impl WalRedoManager for PostgresRedoManager {
base_img,
records,
};
let start_time = Instant::now();
let result = if request.can_apply_in_zenith() {
self.handle_apply_request_zenith(&request)
} else {
self.handle_apply_request_postgres(&request)
start_time = Instant::now();
let result = {
let mut process_guard = self.process.lock().unwrap();
lock_time = Instant::now();
// launch the WAL redo process on first use
if process_guard.is_none() {
let p = self
.runtime
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = process_guard.as_mut().unwrap();
self.runtime
.block_on(self.handle_apply_request(process, &request))
};
let end_time = Instant::now();
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
end_time = Instant::now();
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
result
}
@@ -226,109 +223,32 @@ impl PostgresRedoManager {
///
/// Create a new PostgresRedoManager.
///
pub fn new(conf: &PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
let (tx, rx): (
SyncSender<(ChannelId, Vec<u8>)>,
Receiver<(ChannelId, Vec<u8>)>,
) = mpsc::sync_channel(CHANNEL_SIZE);
let mut senders: Vec<Sender<Bytes>> = Vec::with_capacity(N_CHANNELS);
let mut receivers: Vec<Mutex<Receiver<Bytes>>> = Vec::with_capacity(N_CHANNELS);
for _ in 0..N_CHANNELS {
let (tx, rx) = mpsc::channel();
senders.push(tx);
receivers.push(Mutex::new(rx));
}
if let Ok(mut proc) = PostgresRedoProcess::launch(conf, &tenantid) {
let _proxy = std::thread::spawn(move || loop {
let (id, data) = rx.recv().unwrap();
match proc.apply_wal_records(data) {
Ok(page) => senders[id as usize].send(page).unwrap(),
Err(err) => {
info!("wal-redo failed with error {:?}", err);
proc.kill();
break;
}
}
});
PostgresRedoManager {
sender: tx,
receivers,
round_robin: AtomicUsize::new(0),
}
} else {
panic!("Failed to launch wal-redo postgres");
pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
// We block on waiting for requests on the walredo request channel, but
// use async I/O to communicate with the child process. Initialize the
// runtime for the async part.
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
// The actual process is launched lazily, on first request.
PostgresRedoManager {
runtime,
tenantid,
conf,
process: Mutex::new(None),
}
}
fn apply_wal_records(
&self,
tag: BufferTag,
base_img: Option<Bytes>,
records: &[(Lsn, WALRecord)],
) -> Result<Bytes, std::io::Error> {
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
let mut writebuf: Vec<u8> = Vec::new();
build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
build_push_page_msg(tag, &img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
build_apply_record_msg(*lsn, &rec.rec, &mut writebuf);
}
build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let id = self.round_robin.fetch_add(1, Ordering::Relaxed) % N_CHANNELS;
let rx = self.receivers[id].lock().unwrap();
self.sender.send((id, writebuf)).unwrap();
Ok(rx.recv().unwrap())
}
///
/// Process one request for WAL redo using wal-redo postgres
/// Process one request for WAL redo.
///
fn handle_apply_request_postgres(
async fn handle_apply_request(
&self,
process: &mut PostgresRedoProcess,
request: &WalRedoRequest,
) -> Result<Bytes, WalRedoError> {
let blknum = request.blknum;
let lsn = request.lsn;
let base_img = request.base_img.clone();
let records = &request.records;
let nrecords = records.len();
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
if let RelishTag::Relation(rel) = request.rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = self.apply_wal_records(buf_tag, base_img, records);
let duration = start.elapsed();
debug!(
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
nrecords,
duration.as_micros(),
lsn
);
apply_result.map_err(WalRedoError::IoError)
} else {
Err(WalRedoError::InvalidRequest)
}
}
///
/// Process one request for WAL redo using custom zenith code
///
fn handle_apply_request_zenith(&self, request: &WalRedoRequest) -> Result<Bytes, WalRedoError> {
let rel = request.rel;
let blknum = request.blknum;
let lsn = request.lsn;
@@ -340,158 +260,178 @@ impl PostgresRedoManager {
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
// If full-page image is provided, then use it...
page.extend_from_slice(&fpi[..]);
if let RelishTag::Relation(rel) = rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
} else {
// otherwise initialize page with zeros
page.extend_from_slice(&ZERO_PAGE);
}
// Apply all collected WAL records
for (_lsn, record) in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
// If full-page image is provided, then use it...
page.extend_from_slice(&fpi[..]);
} else {
// otherwise initialize page with zeros
page.extend_from_slice(&ZERO_PAGE);
}
// Apply all collected WAL records
for (_lsn, record) in records {
let mut buf = record.rec.clone();
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
assert!(
slru == SlruKind::Clog,
"Not valid XACT relish tag {:?}",
rel
);
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
}
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
}
}
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multixact operations
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if let RelishTag::Slru {
slru,
segno: rec_segno,
} = rel
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
assert!(
slru == SlruKind::Clog,
"Not valid XACT relish tag {:?}",
rel
);
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
if slru == SlruKind::MultiXactMembers {
for i in 0..xlrec.nmembers {
let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if segno == rec_segno && rpageno == blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &=
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
}
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multixact operations
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if let RelishTag::Slru {
slru,
segno: rec_segno,
} = rel
{
if slru == SlruKind::MultiXactMembers {
for i in 0..xlrec.nmembers {
let pageno =
i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if segno == rec_segno && rpageno == blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &= !(((1
<< pg_constants::MXACT_MEMBER_BITS_PER_XACT)
- 1)
<< bshift);
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
}
}
} else {
// Multixact offsets SLRU
let offs = (xlrec.mid
% pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
}
} else {
// Multixact offsets SLRU
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
panic!();
}
} else {
panic!();
}
} else {
panic!();
}
}
}
apply_result = Ok::<Bytes, Error>(page.freeze());
apply_result = Ok::<Bytes, Error>(page.freeze());
}
let duration = start.elapsed();
let result: Result<Bytes, WalRedoError>;
debug!(
"zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}",
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
apply_result.map_err(WalRedoError::IoError)
if let Err(e) = apply_result {
error!("could not apply WAL records: {:#}", e);
result = Err(WalRedoError::IoError(e));
} else {
let img = apply_result.unwrap();
result = Ok(img);
}
// The caller is responsible for sending the response
result
}
}
@@ -499,17 +439,18 @@ impl PostgresRedoManager {
/// Handle to the Postgres WAL redo process
///
struct PostgresRedoProcess {
child: Child,
stdin: ChildStdin,
stdout: ChildStdout,
stderr: ChildStderr,
}
impl PostgresRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
fn launch(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<PostgresRedoProcess, Error> {
async fn launch(
conf: &PageServerConf,
tenantid: &ZTenantId,
) -> Result<PostgresRedoProcess, Error> {
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
// just create one with constant name. That fails if you try to launch more than
// one WAL redo manager concurrently.
@@ -530,6 +471,7 @@ impl PostgresRedoProcess {
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.output()
.await
.expect("failed to execute initdb");
if !initdb.status.success() {
@@ -566,114 +508,102 @@ impl PostgresRedoProcess {
datadir.display()
);
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stdin = child.stdin.take().expect("failed to open child's stdin");
let stderr = child.stderr.take().expect("failed to open child's stderr");
let stdout = child.stdout.take().expect("failed to open child's stdout");
set_nonblock(stdin.as_raw_fd())?;
set_nonblock(stdout.as_raw_fd())?;
set_nonblock(stderr.as_raw_fd())?;
// This async block reads the child's stderr, and forwards it to the logger
let f_stderr = async {
let mut stderr_buffered = tokio::io::BufReader::new(stderr);
Ok(PostgresRedoProcess {
child,
stdin,
stdout,
stderr,
})
}
let mut line = String::new();
loop {
let res = stderr_buffered.read_line(&mut line).await;
if res.is_err() {
debug!("could not convert line to utf-8");
continue;
}
if res.unwrap() == 0 {
break;
}
error!("wal-redo-postgres: {}", line.trim());
line.clear();
}
Ok::<(), Error>(())
};
tokio::spawn(f_stderr);
fn kill(mut self) {
let _ = self.child.kill();
if let Ok(exit_status) = self.child.wait() {
error!("wal-redo-postgres exited with code {}", exit_status);
}
drop(self);
Ok(PostgresRedoProcess { stdin, stdout })
}
//
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
fn apply_wal_records(&mut self, writebuf: Vec<u8>) -> Result<Bytes, std::io::Error> {
let mut nwrite = self.stdin.write(&writebuf)?;
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()];
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
// Prepare for calling poll()
let mut pollfds = [
PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN),
PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN),
PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT),
];
async fn apply_wal_records(
&mut self,
tag: BufferTag,
base_img: Option<Bytes>,
records: &[(Lsn, WALRecord)],
) -> Result<Bytes, std::io::Error> {
let stdout = &mut self.stdout;
// Buffer the writes to avoid a lot of small syscalls.
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
// We do three things simultaneously: send the old base image and WAL records to
// the child process's stdin, read the result from child's stdout, and forward any logging
// information that the child writes to its stderr to the page server's log.
while nresult < pg_constants::BLCKSZ.into() {
// If we have more data to write, wake up if 'stdin' becomes writeable or
// we have data to read. Otherwise only wake up if there's data to read.
let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
let n = nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
if n == 0 {
return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
//
// 'f_stdin' handles writing the base image and WAL records to the child process.
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
// tokio runtime in the 'launch' function already, forwards the logging.
let f_stdin = async {
// Send base image, if any. (If the record initializes the page, previous page
// version is not needed.)
timeout(
TIMEOUT,
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
)
.await??;
if let Some(img) = base_img {
timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
}
// If we have some messages in stderr, forward them to the log.
let err_revents = pollfds[1].revents().unwrap();
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
let mut errbuf: [u8; 16384] = [0; 16384];
let n = self.stderr.read(&mut errbuf)?;
// Send WAL records.
for (lsn, rec) in records.iter() {
WAL_REDO_RECORD_COUNTER.inc();
// The message might not be split correctly into lines here. But this is
// good enough, the important thing is to get the message to the log.
if n > 0 {
error!(
"wal-redo-postgres: {}",
String::from_utf8_lossy(&errbuf[0..n])
);
stdin
.write_all(&build_apply_record_msg(*lsn, &rec.rec))
.await?;
// To make sure we capture all log from the process if it fails, keep
// reading from the stderr, before checking the stdout.
continue;
}
} else if err_revents.contains(PollFlags::POLLHUP) {
return Err(Error::new(
ErrorKind::BrokenPipe,
"WAL redo process closed its stderr unexpectedly",
));
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
// r.lsn >> 32, r.lsn & 0xffff_ffff);
}
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
// If we have more data to write and 'stdin' is writeable, do write.
if nwrite < writebuf.len() {
let in_revents = pollfds[2].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += self.stdin.write(&writebuf[nwrite..])?;
} else if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
return Err(Error::new(
ErrorKind::BrokenPipe,
"WAL redo process closed its stdin unexpectedly",
));
}
}
// Send GetPage command to get the result back
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
timeout(TIMEOUT, stdin.flush()).await??;
//debug!("sent GetPage for {}", tag.blknum);
Ok::<(), Error>(())
};
// If we have some data in stdout, read it to the result buffer.
let out_revents = pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += self.stdout.read(&mut resultbuf[nresult..])?;
} else if out_revents.contains(PollFlags::POLLHUP) {
return Err(Error::new(
ErrorKind::BrokenPipe,
"WAL redo process closed its stdout unexpectedly",
));
}
}
// Read back new page image
let f_stdout = async {
let mut buf = [0u8; 8192];
Ok(Bytes::from(resultbuf))
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
//debug!("got response for {}", tag.blknum);
Ok::<[u8; 8192], Error>(buf)
};
let res = tokio::try_join!(f_stdout, f_stdin)?;
let buf = res.0;
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
}
}
@@ -681,42 +611,62 @@ impl PostgresRedoProcess {
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
// explanation of the protocol.
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
let len = 4 + 1 + 4 * 4;
let mut buf = Vec::with_capacity(1 + len);
buf.put_u8(b'B');
buf.put_u32(len as u32);
tag.ser_into(buf)
tag.ser_into(&mut buf)
.expect("serialize BufferTag should always succeed");
debug_assert!(buf.len() == 1 + len);
buf
}
fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
assert!(base_img.len() == 8192);
let len = 4 + 1 + 4 * 4 + base_img.len();
let mut buf = Vec::with_capacity(1 + len);
buf.put_u8(b'P');
buf.put_u32(len as u32);
tag.ser_into(buf)
tag.ser_into(&mut buf)
.expect("serialize BufferTag should always succeed");
buf.put(base_img);
debug_assert!(buf.len() == 1 + len);
buf
}
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
let len = 4 + 8 + rec.len();
let mut buf: Vec<u8> = Vec::with_capacity(1 + len);
buf.put_u8(b'A');
buf.put_u32(len as u32);
buf.put_u64(endlsn.0);
buf.put(rec);
debug_assert!(buf.len() == 1 + len);
buf
}
fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
let len = 4 + 1 + 4 * 4;
let mut buf = Vec::with_capacity(1 + len);
buf.put_u8(b'G');
buf.put_u32(len as u32);
tag.ser_into(buf)
tag.ser_into(&mut buf)
.expect("serialize BufferTag should always succeed");
debug_assert!(buf.len() == 1 + len);
buf
}

View File

@@ -2,7 +2,7 @@
//! with the "START_REPLICATION" message.
use crate::send_wal::SendWalHandler;
use crate::timeline::{Timeline, TimelineTools};
use crate::timeline::{ReplicaState, Timeline, TimelineTools};
use anyhow::{anyhow, Context, Result};
use bytes::Bytes;
use log::*;
@@ -20,7 +20,7 @@ use std::{str, thread};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{BeMessage, FeMessage, XLogDataBody};
use zenith_utils::pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody};
use zenith_utils::sock_split::ReadStream;
pub const END_REPLICATION_MARKER: Lsn = Lsn::MAX;
@@ -32,7 +32,7 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
type FullTransactionId = u64;
/// Hot standby feedback received from replica
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct HotStandbyFeedback {
pub ts: TimestampTz,
pub xmin: FullTransactionId,
@@ -49,6 +49,16 @@ impl HotStandbyFeedback {
}
}
/// Standby status update
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StandbyReply {
pub write_lsn: Lsn, // disk consistent lSN
pub flush_lsn: Lsn, // LSN committedby quorum
pub apply_lsn: Lsn, // not used
pub reply_ts: TimestampTz,
pub reply_requested: bool,
}
/// A network connection that's speaking the replication protocol.
pub struct ReplicationConn {
/// This is an `Option` because we will spawn a background thread that will
@@ -56,16 +66,15 @@ pub struct ReplicationConn {
stream_in: Option<ReadStream>,
}
// TODO: move this to crate::timeline when there's more users
// TODO: design a proper Timeline mock api
trait HsFeedbackSubscriber {
fn add_hs_feedback(&self, _feedback: HotStandbyFeedback) {}
/// Scope guard to unregister replication connection from timeline
struct ReplicationConnGuard {
replica: usize, // replica internal ID assigned by timeline
timeline: Arc<Timeline>,
}
impl HsFeedbackSubscriber for Arc<Timeline> {
#[inline(always)]
fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
Timeline::add_hs_feedback(self, feedback);
impl Drop for ReplicationConnGuard {
fn drop(&mut self) {
self.timeline.update_replica_state(self.replica, None);
}
}
@@ -79,26 +88,33 @@ impl ReplicationConn {
/// Handle incoming messages from the network.
/// This is spawned into the background by `handle_start_replication`.
fn background_thread(
mut stream_in: impl Read,
subscriber: impl HsFeedbackSubscriber,
) -> Result<()> {
fn background_thread(mut stream_in: impl Read, timeline: Arc<Timeline>) -> Result<()> {
let mut state = ReplicaState::new();
let replica = timeline.add_replica(state);
let _guard = ReplicationConnGuard {
replica,
timeline: timeline.clone(),
};
// Wait for replica's feedback.
while let Some(msg) = FeMessage::read(&mut stream_in)? {
match &msg {
FeMessage::CopyData(m) => {
// There's two possible data messages that the client is supposed to send here:
// `HotStandbyFeedback` and `StandbyStatusUpdate`. We only handle hot standby
// feedback.
// `HotStandbyFeedback` and `StandbyStatusUpdate`.
match m.first().cloned() {
Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
// Note: deserializing is on m[1..] because we skip the tag byte.
let feedback = HotStandbyFeedback::des(&m[1..])
state.hs_feedback = HotStandbyFeedback::des(&m[1..])
.context("failed to deserialize HotStandbyFeedback")?;
subscriber.add_hs_feedback(feedback);
timeline.update_replica_state(replica, Some(state));
}
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
let reply = StandbyReply::des(&m[1..])
.context("failed to deserialize StandbyReply")?;
state.disk_consistent_lsn = reply.write_lsn;
timeline.update_replica_state(replica, Some(state));
}
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => (),
_ => warn!("unexpected message {:?}", msg),
}
}
@@ -187,7 +203,7 @@ impl ReplicationConn {
// switch to copy
pgb.write_message(&BeMessage::CopyBothResponse)?;
let mut end_pos: Lsn;
let mut end_pos = Lsn(0);
let mut wal_file: Option<File> = None;
loop {
@@ -202,7 +218,18 @@ impl ReplicationConn {
} else {
/* normal mode */
let timeline = swh.timeline.get();
end_pos = timeline.wait_for_lsn(start_pos);
if let Some(lsn) = timeline.wait_for_lsn(start_pos) {
end_pos = lsn
} else {
// timeout expired: request pageserver status
pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
sent_ptr: end_pos.0,
timestamp: get_current_timestamp(),
request_reply: true,
}))
.context("Failed to send KeepAlive message")?;
continue;
}
}
if end_pos == END_REPLICATION_MARKER {
break;
@@ -257,18 +284,3 @@ impl ReplicationConn {
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
// A no-op impl for tests
impl HsFeedbackSubscriber for () {}
#[test]
fn test_replication_conn_background_thread_eof() {
// Test that background_thread recognizes EOF
let stream: &[u8] = &[];
ReplicationConn::background_thread(stream, ()).unwrap();
}
}

View File

@@ -191,6 +191,8 @@ pub struct AppendResponse {
// We report back our awareness about which WAL is committed, as this is
// a criterion for walproposer --sync mode exit
pub commit_lsn: Lsn,
// Min disk consistent lsn of pageservers (portion of WAL applied and written to the disk by pageservers)
pub disk_consistent_lsn: Lsn,
pub hs_feedback: HotStandbyFeedback,
}
@@ -458,6 +460,7 @@ where
epoch: self.s.acceptor_state.epoch,
commit_lsn: Lsn(0),
flush_lsn: Lsn(0),
disk_consistent_lsn: Lsn(0),
hs_feedback: HotStandbyFeedback::empty(),
};
return Ok(AcceptorProposerMessage::AppendResponse(resp));
@@ -567,6 +570,7 @@ where
epoch: self.s.acceptor_state.epoch,
flush_lsn: self.flush_lsn,
commit_lsn: self.s.commit_lsn,
disk_consistent_lsn: Lsn(0),
// will be filled by caller code to avoid bothering safekeeper
hs_feedback: HotStandbyFeedback::empty(),
};

View File

@@ -11,9 +11,9 @@ use std::collections::HashMap;
use std::fs::{self, File, OpenOptions};
use std::io::{Seek, SeekFrom, Write};
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;
use zenith_utils::bin_ser::LeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
@@ -25,6 +25,35 @@ use crate::WalAcceptorConf;
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
const CONTROL_FILE_NAME: &str = "safekeeper.control";
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
/// Replica status: host standby feedback + disk consistent lsn
#[derive(Debug, Clone, Copy)]
pub struct ReplicaState {
/// combined disk_consistent_lsn of pageservers
pub disk_consistent_lsn: Lsn,
/// combined hot standby feedback from all replicas
pub hs_feedback: HotStandbyFeedback,
}
impl Default for ReplicaState {
fn default() -> Self {
Self::new()
}
}
impl ReplicaState {
pub fn new() -> ReplicaState {
ReplicaState {
disk_consistent_lsn: Lsn(u64::MAX),
hs_feedback: HotStandbyFeedback {
ts: 0,
xmin: u64::MAX,
catalog_xmin: u64::MAX,
},
}
}
}
/// Shared state associated with database instance (tenant)
struct SharedState {
@@ -33,8 +62,8 @@ struct SharedState {
/// For receiving-sending wal cooperation
/// quorum commit LSN we've notified walsenders about
notified_commit_lsn: Lsn,
/// combined hot standby feedback from all replicas
hs_feedback: HotStandbyFeedback,
/// State of replicas
replicas: Vec<Option<ReplicaState>>,
}
// A named boolean.
@@ -45,6 +74,31 @@ pub enum CreateControlFile {
}
impl SharedState {
/// Get combined stateof all alive replicas
pub fn get_replicas_state(&self) -> ReplicaState {
let mut acc = ReplicaState::new();
for state in self.replicas.iter().flatten() {
acc.hs_feedback.ts = max(acc.hs_feedback.ts, state.hs_feedback.ts);
acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, state.hs_feedback.xmin);
acc.hs_feedback.catalog_xmin =
min(acc.hs_feedback.catalog_xmin, state.hs_feedback.catalog_xmin);
acc.disk_consistent_lsn = Lsn::min(acc.disk_consistent_lsn, state.disk_consistent_lsn);
}
acc
}
/// Assign new replica ID. We choose first empty cell in the replicas vector
/// or extend the vector if there are not free items.
pub fn add_replica(&mut self, state: ReplicaState) -> usize {
if let Some(pos) = self.replicas.iter().position(|r| r.is_none()) {
self.replicas[pos] = Some(state);
return pos;
}
let pos = self.replicas.len();
self.replicas.push(Some(state));
pos
}
/// Restore SharedState from control file. Locks the control file along the
/// way to prevent running more than one instance of safekeeper on the same
/// data dir.
@@ -74,21 +128,10 @@ impl SharedState {
Ok(Self {
notified_commit_lsn: Lsn(0),
sk: SafeKeeper::new(Lsn(flush_lsn), tli, storage, state),
hs_feedback: HotStandbyFeedback {
ts: 0,
xmin: u64::MAX,
catalog_xmin: u64::MAX,
},
replicas: Vec::new(),
})
}
/// Accumulate hot standby feedbacks from replicas
pub fn add_hs_feedback(&mut self, feedback: HotStandbyFeedback) {
self.hs_feedback.xmin = min(self.hs_feedback.xmin, feedback.xmin);
self.hs_feedback.catalog_xmin = min(self.hs_feedback.catalog_xmin, feedback.catalog_xmin);
self.hs_feedback.ts = max(self.hs_feedback.ts, feedback.ts);
}
/// Fetch and lock control file (prevent running more than one instance of safekeeper)
/// If create=false and file doesn't exist, bails out.
fn load_control_file(
@@ -178,20 +221,27 @@ impl Timeline {
}
}
/// Wait for an LSN to be committed.
/// Timed wait for an LSN to be committed.
///
/// Returns the last committed LSN, which will be at least
/// as high as the LSN waited for.
/// as high as the LSN waited for, or None if timeout expired.
///
pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
pub fn wait_for_lsn(&self, lsn: Lsn) -> Option<Lsn> {
let mut shared_state = self.mutex.lock().unwrap();
loop {
let commit_lsn = shared_state.notified_commit_lsn;
// This must be `>`, not `>=`.
if commit_lsn > lsn {
return commit_lsn;
return Some(commit_lsn);
}
shared_state = self.cond.wait(shared_state).unwrap();
let result = self
.cond
.wait_timeout(shared_state, POLL_STATE_TIMEOUT)
.unwrap();
if result.1.timed_out() {
return None;
}
shared_state = result.0
}
}
@@ -219,9 +269,11 @@ impl Timeline {
// commit_lsn if we are catching up safekeeper.
commit_lsn = shared_state.sk.commit_lsn;
// if this is AppendResponse, fill in proper hot standby feedback
// if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
if let AcceptorProposerMessage::AppendResponse(ref mut resp) = rmsg {
resp.hs_feedback = shared_state.hs_feedback.clone();
let state = shared_state.get_replicas_state();
resp.hs_feedback = state.hs_feedback;
resp.disk_consistent_lsn = state.disk_consistent_lsn;
}
}
// Ping wal sender that new data might be available.
@@ -233,15 +285,14 @@ impl Timeline {
self.mutex.lock().unwrap().sk.s.clone()
}
// Accumulate hot standby feedbacks from replicas
pub fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
pub fn add_replica(&self, state: ReplicaState) -> usize {
let mut shared_state = self.mutex.lock().unwrap();
shared_state.add_hs_feedback(feedback);
shared_state.add_replica(state)
}
pub fn get_hs_feedback(&self) -> HotStandbyFeedback {
let shared_state = self.mutex.lock().unwrap();
shared_state.hs_feedback.clone()
pub fn update_replica_state(&self, id: usize, state: Option<ReplicaState>) {
let mut shared_state = self.mutex.lock().unwrap();
shared_state.replicas[id] = state;
}
pub fn get_end_of_wal(&self) -> (Lsn, u32) {

View File

@@ -27,7 +27,6 @@ workspace_hack = { path = "../workspace_hack" }
rand = "0.8.3"
jsonwebtoken = "7"
hex = { version = "0.4.3", features = ["serde"] }
nix = "0.23.0"
rustls = "0.19.1"
rustls-split = "0.2.1"

View File

@@ -40,7 +40,3 @@ pub mod logging;
// Misc
pub mod accum;
// Utility for putting a raw file descriptor into non-blocking mode
pub mod nonblock;

View File

@@ -1,17 +0,0 @@
use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
use std::os::unix::io::RawFd;
/// Put a file descriptor into non-blocking mode
pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
let bits = fcntl(fd, F_GETFL)?;
// Safety: If F_GETFL returns some unknown bits, they should be valid
// for passing back to F_SETFL, too. If we left them out, the F_SETFL
// would effectively clear them, which is not what we want.
let mut flags = unsafe { OFlag::from_bits_unchecked(bits) };
flags |= OFlag::O_NONBLOCK;
fcntl(fd, F_SETFL(flags))?;
Ok(())
}

View File

@@ -358,6 +358,7 @@ pub enum BeMessage<'a> {
RowDescription(&'a [RowDescriptor<'a>]),
XLogData(XLogDataBody<'a>),
NoticeResponse(String),
KeepAlive(WalSndKeepAlive),
}
// One row desciption in RowDescription packet.
@@ -409,6 +410,13 @@ pub struct XLogDataBody<'a> {
pub data: &'a [u8],
}
#[derive(Debug)]
pub struct WalSndKeepAlive {
pub sent_ptr: u64,
pub timestamp: i64,
pub request_reply: bool,
}
pub static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(&[Some(b"hello world")]);
// single text column
@@ -721,6 +729,18 @@ impl<'a> BeMessage<'a> {
})
.unwrap();
}
BeMessage::KeepAlive(req) => {
buf.put_u8(b'd');
write_body(buf, |buf| {
buf.put_u8(b'k');
buf.put_u64(req.sent_ptr);
buf.put_i64(req.timestamp);
buf.put_u8(if req.request_reply { 1u8 } else { 0u8 });
Ok::<_, io::Error>(())
})
.unwrap();
}
}
Ok(())
}

View File

@@ -87,10 +87,6 @@ impl<K: Ord, V> VecMap<K, V> {
Ok(None)
}
pub fn last(&self) -> Option<&(K, V)> {
self.0.last()
}
/// Split the map into two.
///
/// The left map contains everything before `cutoff` (exclusive).