From 07fb30747ac28e74d22ad6670997577a33b24e73 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 8 Apr 2021 19:39:30 +0300 Subject: [PATCH 01/21] Store pageserver data in RocksDB --- Cargo.lock | 122 +++++++++ pageserver/Cargo.toml | 1 + pageserver/src/page_cache.rs | 488 ++++++++++++++++++++------------- pageserver/src/page_service.rs | 10 +- pageserver/src/restore_s3.rs | 10 +- pageserver/src/walreceiver.rs | 12 +- pageserver/src/walredo.rs | 25 +- 7 files changed, 444 insertions(+), 224 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0ac61eb60d..fb694ec2f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -241,6 +241,25 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "bindgen" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd4865004a46a0aafb2a0a5eb19d3c9fc46ee5f063a6cfc605c69ac9ecf5263d" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", +] + [[package]] name = "bitflags" version = "1.2.1" @@ -322,6 +341,18 @@ name = "cc" version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cexpr" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27" +dependencies = [ + "nom", +] [[package]] name = "cfg-if" @@ -348,6 +379,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "clang-sys" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "853eda514c284c2287f4bf20ae614f8781f40a81d32ecda6e91449304dfe077c" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "2.33.3" @@ -729,6 +771,12 @@ dependencies = [ "wasi 0.10.0+wasi-snapshot-preview1", ] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "gloo-timers" version = "0.2.1" @@ -920,6 +968,15 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" +[[package]] +name = "jobserver" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c71313ebb9439f74b00d9d2dcec36440beaf57a6aa0623068441dd7cd81a7f2" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.50" @@ -944,12 +1001,39 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56d855069fafbb9b344c0f962150cd2c1187975cb1c22c1522c240d8c4986714" +[[package]] +name = "libloading" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "6f84d96438c15fcd6c3f244c8fce01d1e2b9c6b5623e9c711dc9286d8fc92d6a" +dependencies = [ + "cfg-if 1.0.0", + "winapi", +] + +[[package]] +name = "librocksdb-sys" +version = "6.17.3" +source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git#0b700fe70da8ee30483fde79f44df549f8fe11ec" +dependencies = [ + "bindgen", + "cc", + "glob", + "libc", +] + [[package]] name = "lock_api" version = "0.4.3" @@ -1065,6 +1149,16 @@ dependencies = [ "socket2 0.4.0", ] +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "memchr", + "version_check", +] + [[package]] name = "ntapi" version = "0.3.6" @@ -1184,6 +1278,7 @@ dependencies = [ "postgres-protocol", "rand 0.8.3", "regex", + "rocksdb", "rust-s3", "slog", "slog-async", @@ -1228,6 +1323,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "percent-encoding" version = "2.1.0" @@ -1566,6 +1667,15 @@ dependencies = [ "winreg", ] +[[package]] +name = "rocksdb" +version = "0.15.0" +source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git#0b700fe70da8ee30483fde79f44df549f8fe11ec" +dependencies = [ + "libc", + "librocksdb-sys", +] + [[package]] name = "rust-argon2" version = "0.8.3" @@ -1619,6 +1729,12 @@ dependencies = [ "url", ] +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.2.3" @@ -1759,6 +1875,12 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "shlex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" + [[package]] name = "signal-hook-registry" version = "1.3.0" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index ab8b78dd2d..8d629deabc 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -32,5 +32,6 @@ tokio-stream = { version = "0.1.4" } tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" } postgres-protocol = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" } postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" } +rocksdb = { git = "https://github.com/rust-rocksdb/rust-rocksdb.git" } anyhow = "1.0" crc32c = "0.6.0" diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 7c77ca5926..ea36de3af3 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -6,8 +6,7 @@ // per-entry mutex. 
//
-use core::ops::Bound::Included;
-use std::collections::{BTreeMap, HashMap};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
 use std::error::Error;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
@@ -16,10 +15,10 @@
 use std::thread;
 use std::time::Duration;
 use std::{convert::TryInto, ops::AddAssign};
 // use tokio::sync::RwLock;
-use bytes::Bytes;
 use lazy_static::lazy_static;
 use log::*;
-use rand::Rng;
+use rocksdb::*;
+use std::collections::HashMap;
 
 use crate::{walredo, PageServerConf};
 
@@ -32,6 +31,9 @@ static TIMEOUT: Duration = Duration::from_secs(60);
 pub struct PageCache {
     shared: Mutex<PageCacheShared>,
 
+    // RocksDB handle
+    db: DB,
+
     // Channel for communicating with the WAL redo process here.
     pub walredo_sender: Sender<Arc<CacheEntry>>,
     pub walredo_receiver: Receiver<Arc<CacheEntry>>,
@@ -80,9 +82,6 @@ impl AddAssign for PageCacheStats {
 // Shared data structure, holding page cache and related auxiliary information
 //
 struct PageCacheShared {
-    // The actual page cache
-    pagecache: BTreeMap<CacheKey, Arc<CacheEntry>>,
-
    // Relation n_blocks cache
    //
    // This hashtable should be updated together with the pagecache. Now it is
@@ -117,7 +116,7 @@ pub fn get_pagecache(conf: PageServerConf, sys_id: u64) -> Arc<PageCache> {
     let mut pcaches = PAGECACHES.lock().unwrap();
 
     if !pcaches.contains_key(&sys_id) {
-        pcaches.insert(sys_id, Arc::new(init_page_cache()));
+        pcaches.insert(sys_id, Arc::new(init_page_cache(&conf, sys_id)));
 
         // Initialize the WAL redo thread
         //
@@ -135,13 +134,22 @@ pub fn get_pagecache(conf: PageServerConf, sys_id: u64) -> Arc<PageCache> {
     pcaches.get(&sys_id).unwrap().clone()
 }
 
-fn init_page_cache() -> PageCache {
+fn open_rocksdb(conf: &PageServerConf, sys_id: u64) -> DB {
+    let path = conf.data_dir.join(sys_id.to_string());
+    let mut opts = Options::default();
+    opts.create_if_missing(true);
+    opts.set_use_fsync(true);
+    opts.set_compression_type(DBCompressionType::Lz4);
+    DB::open(&opts, &path).unwrap()
+}
+
+fn init_page_cache(conf: &PageServerConf, sys_id: u64) -> PageCache {
     // Initialize the channel between the page cache and the WAL applicator
     let (s, r) = unbounded();
 
     PageCache {
+        db: open_rocksdb(&conf, sys_id),
         shared: Mutex::new(PageCacheShared {
-            pagecache: BTreeMap::new(),
             relsize_cache: HashMap::new(),
             first_valid_lsn: 0,
             last_valid_lsn: 0,
@@ -182,6 +190,19 @@ pub struct CacheKey {
     pub lsn: u64,
 }
 
+impl CacheKey {
+    pub fn pack(&self, buf: &mut BytesMut) {
+        self.tag.pack(buf);
+        buf.put_u64(self.lsn);
+    }
+    pub fn unpack(buf: &mut BytesMut) -> CacheKey {
+        CacheKey {
+            tag: BufferTag::unpack(buf),
+            lsn: buf.get_u64(),
+        }
+    }
+}
+
 pub struct CacheEntry {
     pub key: CacheKey,
 
@@ -201,21 +222,47 @@ pub struct CacheEntryContent {
     pub apply_pending: bool,
 }
 
-impl CacheEntry {
-    fn new(key: CacheKey) -> CacheEntry {
-        CacheEntry {
-            key: key,
-            content: Mutex::new(CacheEntryContent {
-                page_image: None,
+impl CacheEntryContent {
+    pub fn pack(&self, buf: &mut BytesMut) {
+        if let Some(image) = &self.page_image {
+            buf.put_u8(1);
+            buf.put_u16(image.len() as u16);
+            buf.put_slice(&image[..]);
+        } else if let Some(rec) = &self.wal_record {
+            buf.put_u8(0);
+            rec.pack(buf);
+        }
+    }
+    pub fn unpack(buf: &mut BytesMut) -> CacheEntryContent {
+        if buf.get_u8() == 1 {
+            let mut dst = vec![0u8; buf.get_u16() as usize];
+            buf.copy_to_slice(&mut dst);
+            CacheEntryContent {
+                page_image: Some(Bytes::from(dst)),
                 wal_record: None,
                 apply_pending: false,
-            }),
+            }
+        } else {
+            CacheEntryContent {
+                page_image: None,
+                wal_record: Some(WALRecord::unpack(buf)),
+                apply_pending: false,
+            }
+        }
+    }
+}
+
+impl CacheEntry {
+    fn new(key: CacheKey, content: CacheEntryContent) -> CacheEntry {
+        CacheEntry {
+            key,
+            content: Mutex::new(content),
             walredo_condvar: Condvar::new(),
         }
     }
 }
 
-#[derive(Eq, PartialEq, Hash, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
 pub struct RelTag {
     pub spcnode: u32,
     pub dbnode: u32,
@@ -223,15 +270,42 @@ pub struct RelTag {
     pub forknum: u8,
 }
 
-#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
+impl RelTag {
+    pub fn pack(&self, buf: &mut BytesMut) {
+        buf.put_u32(self.spcnode);
+        buf.put_u32(self.dbnode);
+        buf.put_u32(self.relnode);
+        buf.put_u32(self.forknum as u32);
+    }
+    pub fn unpack(buf: &mut BytesMut) -> RelTag {
+        RelTag {
+            spcnode: buf.get_u32(),
+            dbnode: buf.get_u32(),
+            relnode: buf.get_u32(),
+            forknum: buf.get_u32() as u8,
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
 pub struct BufferTag {
-    pub spcnode: u32,
-    pub dbnode: u32,
-    pub relnode: u32,
-    pub forknum: u8,
+    pub rel: RelTag,
     pub blknum: u32,
 }
 
+impl BufferTag {
+    pub fn pack(&self, buf: &mut BytesMut) {
+        self.rel.pack(buf);
+        buf.put_u32(self.blknum);
+    }
+    pub fn unpack(buf: &mut BytesMut) -> BufferTag {
+        BufferTag {
+            rel: RelTag::unpack(buf),
+            blknum: buf.get_u32(),
+        }
+    }
+}
+
 #[derive(Clone)]
 pub struct WALRecord {
     pub lsn: u64, // LSN at the *end* of the record
@@ -239,6 +313,26 @@ pub struct WALRecord {
     pub rec: Bytes,
 }
 
+impl WALRecord {
+    pub fn pack(&self, buf: &mut BytesMut) {
+        buf.put_u64(self.lsn);
+        buf.put_u8(self.will_init as u8);
+        buf.put_u16(self.rec.len() as u16);
+        buf.put_slice(&self.rec[..]);
+    }
+    pub fn unpack(buf: &mut BytesMut) -> WALRecord {
+        let lsn = buf.get_u64();
+        let will_init = buf.get_u8() != 0;
+        let mut dst = vec![0u8; buf.get_u16() as usize];
+        buf.copy_to_slice(&mut dst);
+        WALRecord {
+            lsn,
+            will_init,
+            rec: Bytes::from(dst),
+        }
+    }
+}
+
 // Public interface functions
 
 impl PageCache {
@@ -252,10 +346,9 @@ impl PageCache {
         // Look up cache entry. If it's a page image, return that. If it's a WAL record,
         // ask the WAL redo service to reconstruct the page image from the WAL records.
 
-        let minkey = CacheKey { tag: tag, lsn: 0 };
-        let maxkey = CacheKey { tag: tag, lsn: lsn };
+        let minkey = CacheKey { tag, lsn: 0 };
+        let maxkey = CacheKey { tag, lsn };
 
-        let entry_rc: Arc<CacheEntry>;
         {
             let mut shared = self.shared.lock().unwrap();
             let mut waited = false;
@@ -275,6 +368,10 @@ impl PageCache {
                 shared = wait_result.0;
 
                 if wait_result.1.timed_out() {
+                    error!(
+                        "Timed out while waiting for WAL record at LSN {} to arrive",
+                        lsn
+                    );
                     return Err(format!(
                         "Timed out while waiting for WAL record at LSN {} to arrive",
                         lsn
@@ -286,68 +383,66 @@ impl PageCache {
             }
 
             if lsn < shared.first_valid_lsn {
+                error!(
+                    "LSN {} has already been removed",
+                    lsn
+                );
                 return Err(format!("LSN {} has already been removed", lsn))?;
             }
-
-            let pagecache = &shared.pagecache;
-
-            let mut entries = pagecache.range((Included(&minkey), Included(&maxkey)));
-
-            let entry_opt = entries.next_back();
-
-            if entry_opt.is_none() {
-                static ZERO_PAGE: [u8; 8192] = [0 as u8; 8192];
-                return Ok(Bytes::from_static(&ZERO_PAGE));
-                /* return Err("could not find page image")?; */
-            }
-            let (_key, entry) = entry_opt.unwrap();
-            entry_rc = entry.clone();
-
-            // Now that we have a reference to the cache entry, drop the lock on the map.
-            // It's important to do this before waiting on the condition variable below,
-            // and better to do it as soon as possible to maximize concurrency.
         }
+        let mut buf = BytesMut::new();
+        minkey.pack(&mut buf);
 
-        // Lock the cache entry and dig the page image out of it.
+        let mut readopts = ReadOptions::default();
+        readopts.set_iterate_lower_bound(buf.to_vec());
+
+        buf.clear();
+        maxkey.pack(&mut buf);
+        let mut iter = self
+            .db
+            .iterator_opt(IteratorMode::From(&buf[..], Direction::Reverse), readopts);
+        let entry_opt = iter.next();
+
+        if entry_opt.is_none() {
+            static ZERO_PAGE: [u8; 8192] = [0 as u8; 8192];
+            return Ok(Bytes::from_static(&ZERO_PAGE));
+            /* return Err("could not find page image")?; */
+        }
+        let (k, v) = entry_opt.unwrap();
+        buf.clear();
+        buf.extend_from_slice(&v);
+        let content = CacheEntryContent::unpack(&mut buf);
         let page_img: Bytes;
-        {
+        if let Some(img) = &content.page_image {
+            page_img = img.clone();
+        } else if content.wal_record.is_some() {
+            buf.clear();
+            buf.extend_from_slice(&k);
+            let entry_rc = Arc::new(CacheEntry::new(CacheKey::unpack(&mut buf), content));
             let mut entry_content = entry_rc.content.lock().unwrap();
+            entry_content.apply_pending = true;
 
-            if let Some(img) = &entry_content.page_image {
-                assert!(!entry_content.apply_pending);
-                page_img = img.clone();
-            } else if entry_content.wal_record.is_some() {
-                //
-                // If this page needs to be reconstructed by applying some WAL,
-                // send a request to the WAL redo thread.
-                //
-                if !entry_content.apply_pending {
-                    assert!(!entry_content.apply_pending);
-                    entry_content.apply_pending = true;
+            let s = &self.walredo_sender;
+            s.send(entry_rc.clone())?;
 
-                    let s = &self.walredo_sender;
-                    s.send(entry_rc.clone())?;
-                }
-
-                while entry_content.apply_pending {
-                    entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap();
-                }
-
-                // We should now have a page image. If we don't, it means that WAL redo
-                // failed to reconstruct it. WAL redo should've logged that error already.
-                page_img = match &entry_content.page_image {
-                    Some(p) => p.clone(),
-                    None => {
-                        error!(
-                            "could not apply WAL to reconstruct page image for GetPage@LSN request"
-                        );
-                        return Err("could not apply WAL to reconstruct page image".into());
-                    }
-                };
-            } else {
-                // No base image, and no WAL record. Huh?
-                return Err(format!("no page image or WAL record for requested page"))?;
+            while entry_content.apply_pending {
+                entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap();
             }
+
+            // We should now have a page image. If we don't, it means that WAL redo
+            // failed to reconstruct it. WAL redo should've logged that error already.
+            page_img = match &entry_content.page_image {
+                Some(p) => p.clone(),
+                None => {
+                    error!("could not apply WAL to reconstruct page image for GetPage@LSN request");
+                    return Err("could not apply WAL to reconstruct page image".into());
+                }
+            };
+            self.put_page_image(tag, lsn, page_img.clone());
+        } else {
+            // No base image, and no WAL record. Huh?
+            return Err(format!("no page image or WAL record for requested page"))?;
         }
 
        // FIXME: assumes little-endian. Only used for the debugging log though
@@ -357,10 +452,10 @@ impl PageCache {
             "Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}",
             page_lsn_hi,
             page_lsn_lo,
-            tag.spcnode,
-            tag.dbnode,
-            tag.relnode,
-            tag.forknum,
+            tag.rel.spcnode,
+            tag.rel.dbnode,
+            tag.rel.relnode,
+            tag.rel.forknum,
             tag.blknum
         );
 
@@ -375,38 +470,41 @@ impl PageCache {
     // over it.
     //
     pub fn collect_records_for_apply(&self, entry: &CacheEntry) -> (Option<Bytes>, Vec<WALRecord>) {
-        // Scan the BTreeMap backwards, starting from the given entry.
-        let shared = self.shared.lock().unwrap();
-        let pagecache = &shared.pagecache;
-
         let minkey = CacheKey {
-            tag: entry.key.tag,
+            tag: BufferTag {
+                rel: entry.key.tag.rel,
+                blknum: 0,
+            },
             lsn: 0,
         };
-        let maxkey = CacheKey {
-            tag: entry.key.tag,
-            lsn: entry.key.lsn,
-        };
 
-        let entries = pagecache.range((Included(&minkey), Included(&maxkey)));
+        let mut buf = BytesMut::new();
+        minkey.pack(&mut buf);
 
-        // the last entry in the range should be the CacheEntry we were given
-        //let _last_entry = entries.next_back();
-        //assert!(last_entry == entry);
+        let mut readopts = ReadOptions::default();
+        readopts.set_iterate_lower_bound(buf.to_vec());
+
+        buf.clear();
+        entry.key.pack(&mut buf);
+        let iter = self
+            .db
+            .iterator_opt(IteratorMode::From(&buf[..], Direction::Reverse), readopts);
 
         let mut base_img: Option<Bytes> = None;
         let mut records: Vec<WALRecord> = Vec::new();
 
         // Scan backwards, collecting the WAL records, until we hit an
         // old page image.
-        for (_key, e) in entries.rev() {
-            let e = e.content.lock().unwrap();
-
-            if let Some(img) = &e.page_image {
+        for (_k, v) in iter {
+            buf.clear();
+            buf.extend_from_slice(&v);
+            let content = CacheEntryContent::unpack(&mut buf);
+            if let Some(img) = &content.page_image {
                 // We have a base image. No need to dig deeper into the list of
                 // records
                 base_img = Some(img.clone());
                 break;
-            } else if let Some(rec) = &e.wal_record {
+            } else if let Some(rec) = &content.wal_record {
                 records.push(rec.clone());
 
                 // If this WAL record initializes the page, no need to dig deeper.
@@ -422,40 +520,40 @@ impl PageCache {
         return (base_img, records);
     }
 
+    fn update_rel_size(&self, tag: &BufferTag) {
+        let mut shared = self.shared.lock().unwrap();
+        let rel_entry = shared
+            .relsize_cache
+            .entry(tag.rel)
+            .or_insert(tag.blknum + 1);
+        if tag.blknum >= *rel_entry {
+            *rel_entry = tag.blknum + 1;
+        }
+    }
+
     //
     // Adds a WAL record to the page cache
     //
     pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
-        let key = CacheKey {
-            tag: tag,
-            lsn: rec.lsn,
+        let key = CacheKey { tag, lsn: rec.lsn };
+
+        let content = CacheEntryContent {
+            page_image: None,
+            wal_record: Some(rec),
+            apply_pending: false,
         };
-        let entry = CacheEntry::new(key.clone());
-        entry.content.lock().unwrap().wal_record = Some(rec);
+        self.update_rel_size(&tag);
 
-        let mut shared = self.shared.lock().unwrap();
-
-        let rel_tag = RelTag {
-            spcnode: tag.spcnode,
-            dbnode: tag.dbnode,
-            relnode: tag.relnode,
-            forknum: tag.forknum,
-        };
-        let rel_entry = shared.relsize_cache.entry(rel_tag).or_insert(0);
-        if tag.blknum >= *rel_entry {
-            *rel_entry = tag.blknum + 1;
-        }
+        let mut key_buf = BytesMut::new();
+        key.pack(&mut key_buf);
+        let mut val_buf = BytesMut::new();
+        content.pack(&mut val_buf);
 
         trace!("put_wal_record lsn: {}", key.lsn);
+        let _res = self.db.put(&key_buf[..], &val_buf[..]);
 
-        let oldentry = shared.pagecache.insert(key, Arc::new(entry));
         self.num_entries.fetch_add(1, Ordering::Relaxed);
-
-        if !oldentry.is_none() {
-            error!("overwriting WAL record in page cache");
-        }
-
         self.num_wal_records.fetch_add(1, Ordering::Relaxed);
     }
 
@@ -463,21 +561,23 @@ impl PageCache {
     // Memorize a full image of a page version
     //
     pub fn put_page_image(&self, tag: BufferTag, lsn: u64, img: Bytes) {
-        let key = CacheKey { tag: tag, lsn: lsn };
+        let key = CacheKey { tag, lsn };
+        let content = CacheEntryContent {
+            page_image: Some(img),
+            wal_record: None,
+            apply_pending: false,
+        };
 
-        let entry = CacheEntry::new(key.clone());
-        entry.content.lock().unwrap().page_image = Some(img);
+        let mut key_buf = BytesMut::new();
+        key.pack(&mut key_buf);
+        let mut val_buf = BytesMut::new();
+        content.pack(&mut val_buf);
 
-        let mut shared = self.shared.lock().unwrap();
-        let pagecache = &mut shared.pagecache;
-
-        let oldentry = pagecache.insert(key, Arc::new(entry));
-        self.num_entries.fetch_add(1, Ordering::Relaxed);
-        assert!(oldentry.is_none());
+        trace!("put_page_image lsn: {}", key.lsn);
+        let _res = self.db.put(&key_buf[..], &val_buf[..]);
 
         //debug!("inserted page image for {}/{}/{}_{} blk {} at {}",
         //  tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn);
+        self.num_entries.fetch_add(1, Ordering::Relaxed);
         self.num_page_images.fetch_add(1, Ordering::Relaxed);
     }
 
@@ -486,12 +586,15 @@ impl PageCache {
         let mut shared = self.shared.lock().unwrap();
 
         // Can't move backwards.
-        assert!(lsn >= shared.last_valid_lsn);
+        //assert!(lsn >= shared.last_valid_lsn);
+        if lsn > shared.last_valid_lsn {
+            shared.last_valid_lsn = lsn;
+            self.valid_lsn_condvar.notify_all();
 
-        shared.last_valid_lsn = lsn;
-        self.valid_lsn_condvar.notify_all();
-
-        self.last_valid_lsn.store(lsn, Ordering::Relaxed);
+            self.last_valid_lsn.store(lsn, Ordering::Relaxed);
+        } else {
+            trace!("lsn={}, shared.last_valid_lsn={}", lsn, shared.last_valid_lsn);
+        }
     }
 
     //
@@ -509,7 +612,7 @@ impl PageCache {
         self.valid_lsn_condvar.notify_all();
 
         self.last_valid_lsn.store(lsn, Ordering::Relaxed);
-        self.last_valid_lsn.store(lsn, Ordering::Relaxed);
+        self.last_record_lsn.store(lsn, Ordering::Relaxed);
     }
 
     //
@@ -549,54 +652,6 @@ impl PageCache {
         return shared.last_record_lsn;
     }
 
-    //
-    // Simple test function for the WAL redo code:
-    //
-    // 1. Pick a page from the page cache at random.
-    // 2. Request that page with GetPage@LSN, using Max LSN (i.e. get the latest page version)
-    //
-    //
-    pub fn _test_get_page_at_lsn(&self) {
-        // for quick testing of the get_page_at_lsn() funcion.
-        //
-        // Get a random page from the page cache. Apply all its WAL, by requesting
-        // that page at the highest lsn.
-
-        let mut tag: Option<BufferTag> = None;
-
-        {
-            let shared = self.shared.lock().unwrap();
-            let pagecache = &shared.pagecache;
-
-            if pagecache.is_empty() {
-                info!("page cache is empty");
-                return;
-            }
-
-            // Find nth entry in the map, where n is picked at random
-            let n = rand::thread_rng().gen_range(0..pagecache.len());
-            let mut i = 0;
-            for (key, _e) in pagecache.iter() {
-                if i == n {
-                    tag = Some(key.tag);
-                    break;
-                }
-                i += 1;
-            }
-        }
-
-        info!("testing GetPage@LSN for block {}", tag.unwrap().blknum);
-        match self.get_page_at_lsn(tag.unwrap(), 0xffff_ffff_ffff_eeee) {
-            Ok(_img) => {
-                // This prints out the whole page image.
-                //println!("{:X?}", img);
-            }
-            Err(error) => {
-                error!("GetPage@LSN failed: {}", error);
-            }
-        }
-    }
-
     // FIXME: Shouldn't relation size also be tracked with an LSN?
     // If a replica is lagging behind, it needs to get the size as it was on
    // the replica's current replay LSN.
@@ -613,14 +668,63 @@ impl PageCache { pub fn relsize_get(&self, rel: &RelTag) -> u32 { let mut shared = self.shared.lock().unwrap(); - let entry = shared.relsize_cache.entry(*rel).or_insert(0); - *entry + if let Some(relsize) = shared.relsize_cache.get(rel) { + return *relsize; + } + let key = CacheKey { + tag: BufferTag { + rel: *rel, + blknum: u32::MAX, + }, + lsn: u64::MAX, + }; + let mut buf = BytesMut::new(); + key.pack(&mut buf); + let mut iter = self + .db + .iterator(IteratorMode::From(&buf[..], Direction::Reverse)); + if let Some((k, _v)) = iter.next() { + buf.clear(); + buf.extend_from_slice(&k); + let tag = BufferTag::unpack(&mut buf); + if tag.rel == *rel { + let relsize = tag.blknum + 1; + shared.relsize_cache.insert(*rel, relsize); + return relsize; + } + } + return 0; } pub fn relsize_exist(&self, rel: &RelTag) -> bool { - let shared = self.shared.lock().unwrap(); + let mut shared = self.shared.lock().unwrap(); let relsize_cache = &shared.relsize_cache; - relsize_cache.contains_key(rel) + if relsize_cache.contains_key(rel) { + return true; + } + + let key = CacheKey { + tag: BufferTag { + rel: *rel, + blknum: 0, + }, + lsn: 0, + }; + let mut buf = BytesMut::new(); + key.pack(&mut buf); + let mut iter = self + .db + .iterator(IteratorMode::From(&buf[..], Direction::Forward)); + if let Some((k, _v)) = iter.next() { + buf.clear(); + buf.extend_from_slice(&k); + let tag = BufferTag::unpack(&mut buf); + if tag.rel == *rel { + shared.relsize_cache.insert(*rel, tag.blknum + 1); + return true; + } + } + return false; } pub fn get_stats(&self) -> PageCacheStats { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 06760c6f68..0130cbd2f2 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -566,10 +566,12 @@ impl Connection { } Some(FeMessage::ZenithReadRequest(req)) => { let buf_tag = page_cache::BufferTag { - spcnode: req.spcnode, - dbnode: req.dbnode, - relnode: req.relnode, - forknum: req.forknum, + rel: page_cache::RelTag { + spcnode: req.spcnode, + dbnode: req.dbnode, + relnode: req.relnode, + forknum: req.forknum, + }, blknum: req.blkno, }; diff --git a/pageserver/src/restore_s3.rs b/pageserver/src/restore_s3.rs index 08ba3e7fa3..0884f17453 100644 --- a/pageserver/src/restore_s3.rs +++ b/pageserver/src/restore_s3.rs @@ -309,10 +309,12 @@ async fn slurp_base_file( while bytes.remaining() >= 8192 { let tag = page_cache::BufferTag { - spcnode: parsed.spcnode, - dbnode: parsed.dbnode, - relnode: parsed.relnode, - forknum: parsed.forknum as u8, + rel: page_cache::RelTag { + spcnode: parsed.spcnode, + dbnode: parsed.dbnode, + relnode: parsed.relnode, + forknum: parsed.forknum as u8, + }, blknum: blknum, }; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 9f382b2efb..f41c9274b2 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -12,7 +12,7 @@ use tokio::time::{sleep, Duration}; use tokio_stream::StreamExt; use crate::page_cache; -use crate::page_cache::BufferTag; +use crate::page_cache::{BufferTag, RelTag}; use crate::waldecoder::WalStreamDecoder; use crate::PageServerConf; @@ -141,10 +141,12 @@ async fn walreceiver_main( // so having multiple copies of it doesn't cost that much) for blk in decoded.blocks.iter() { let tag = BufferTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum as u8, + rel: RelTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + 
forknum: blk.forknum as u8,
+            },
             blknum: blk.blkno,
         };
 
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index a06c87d584..90d2ed470e 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -65,6 +65,7 @@ pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) {
         let _guard = runtime.enter();
         process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
     }
+    info!("WAL redo postgres started");
 
     // Pretty arbitrarily, reuse the same Postgres process for 100 requests.
     // After that, kill it and start a new one. This is mostly to avoid
@@ -76,11 +77,11 @@ pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) {
         let result = handle_apply_request(&pcache, &process, &runtime, request);
         if result.is_err() {
             // On error, kill the process.
+            error!("Kill wal redo process on error");
             break;
         }
     }
 
-    info!("killing WAL redo postgres process");
     let _ = runtime.block_on(process.stdin.get_mut().shutdown());
     let mut child = process.child;
     drop(process.stdin);
@@ -99,6 +100,7 @@ fn handle_apply_request(
     let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
 
     let mut entry = entry_rc.content.lock().unwrap();
+    assert!(entry.apply_pending);
     entry.apply_pending = false;
 
     let nrecords = records.len();
@@ -122,9 +124,6 @@ fn handle_apply_request(
         result = Err(e);
     } else {
         entry.page_image = Some(apply_result.unwrap());
-        pcache
-            .num_page_images
-            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
         result = Ok(());
     }
 
@@ -296,11 +295,7 @@ fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
     buf.put_u8('B' as u8);
     buf.put_u32(len as u32);
-    buf.put_u32(tag.spcnode);
-    buf.put_u32(tag.dbnode);
-    buf.put_u32(tag.relnode);
-    buf.put_u32(tag.forknum as u32);
-    buf.put_u32(tag.blknum);
+    tag.pack(&mut buf);
 
     assert!(buf.len() == 1 + len);
 
@@ -315,11 +310,7 @@ fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
     buf.put_u8('P' as u8);
     buf.put_u32(len as u32);
-    buf.put_u32(tag.spcnode);
-    buf.put_u32(tag.dbnode);
-    buf.put_u32(tag.relnode);
-    buf.put_u32(tag.forknum as u32);
-    buf.put_u32(tag.blknum);
+    tag.pack(&mut buf);
     buf.put(base_img);
 
     assert!(buf.len() == 1 + len);
@@ -347,7 +338,7 @@ fn build_get_page_msg(tag: BufferTag) -> Bytes {
     buf.put_u8('G' as u8);
     buf.put_u32(len as u32);
-    buf.put_u32(tag.spcnode);
-    buf.put_u32(tag.dbnode);
-    buf.put_u32(tag.relnode);
-    buf.put_u32(tag.forknum as u32);
-    buf.put_u32(tag.blknum);
+    tag.pack(&mut buf);
 
     assert!(buf.len() == 1 + len);
 
From 542dffa4a68aec026ae47b462683327300c5c355 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Thu, 8 Apr 2021 20:33:51 +0300
Subject: [PATCH 02/21] Set LD_LIBRARY_PATH for tests

---
 integration_tests/tests/control_plane/mod.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/integration_tests/tests/control_plane/mod.rs b/integration_tests/tests/control_plane/mod.rs
index eab3f345af..5ace192ac0 100644
--- a/integration_tests/tests/control_plane/mod.rs
+++ b/integration_tests/tests/control_plane/mod.rs
@@ -178,6 +178,7 @@ impl PageServerNode {
             .arg("--skip-recovery")
             .env_clear()
             .env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to postgres-wal-redo binary
+            .env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
             .status()
             .expect("failed to start pageserver");
 
From a60633607407eeb163bd2523488a012ffcecde8f Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Fri, 9 Apr 2021 20:31:34 +0300
Subject: [PATCH 03/21] Fix bug in WALRecord serializer

---
 integration_tests/tests/control_plane/mod.rs | 1 +
 pageserver/src/bin/pageserver.rs | 11 +++++++---
pageserver/src/page_cache.rs | 6 +++--- pageserver/src/page_service.rs | 1 + pageserver/src/walredo.rs | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/integration_tests/tests/control_plane/mod.rs b/integration_tests/tests/control_plane/mod.rs index 5ace192ac0..965aa614f6 100644 --- a/integration_tests/tests/control_plane/mod.rs +++ b/integration_tests/tests/control_plane/mod.rs @@ -338,6 +338,7 @@ impl ComputeControlPlane<'_> { shared_buffers = 1MB\n\ max_connections = 100\n\ wal_level = replica\n\ + wal_sender_timeout = 0\n\ listen_addresses = '{address}'\n\ port = {port}\n\ ", diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9fdf405145..b1ff34afc4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -7,7 +7,7 @@ use std::fs; use std::io; use std::path::PathBuf; use std::thread; -use std::{fs::File, fs::OpenOptions, str::FromStr}; +use std::{fs::OpenOptions, str::FromStr}; use clap::{App, Arg}; use daemonize::Daemonize; @@ -132,12 +132,12 @@ fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> { let stdout = OpenOptions::new() .create(true) .append(true) - .open(conf.data_dir.join("pageserver.log")) + .open(conf.data_dir.join("pageserver-stdout.log")) .unwrap(); let stderr = OpenOptions::new() .create(true) .append(true) - .open(conf.data_dir.join("pageserver.log")) + .open(conf.data_dir.join("pageserver-stderr.log")) .unwrap(); let daemonize = Daemonize::new() @@ -222,7 +222,10 @@ fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard { tui::init_logging() } else if conf.daemonize { let log = conf.data_dir.join("pageserver.log"); - let log_file = File::create(log).unwrap_or_else(|_| panic!("Could not create log file")); + let log_file = OpenOptions::new() + .create(true) + .append(true) + .open(log).unwrap_or_else(|_| panic!("Could not create log file")); let decorator = slog_term::PlainSyncDecorator::new(log_file); let drain = slog_term::CompactFormat::new(decorator).build(); let drain = slog::Filter::new(drain, |record: &slog::Record| { diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index a4c108cd59..bc8744d30c 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -317,13 +317,13 @@ impl WALRecord { pub fn pack(&self, buf: &mut BytesMut) { buf.put_u64(self.lsn); buf.put_u8(self.will_init as u8); - buf.put_u16(self.rec.len() as u16); + buf.put_u32(self.rec.len() as u32); buf.put_slice(&self.rec[..]); } pub fn unpack(buf: &mut BytesMut) -> WALRecord { let lsn = buf.get_u64(); let will_init = buf.get_u8() != 0; - let mut dst = vec![0u8; buf.get_u16() as usize]; + let mut dst = vec![0u8; buf.get_u32() as usize]; buf.copy_to_slice(&mut dst); WALRecord { lsn, @@ -439,7 +439,7 @@ impl PageCache { self.put_page_image(tag, lsn, page_img.clone()); } else { // No base image, and no WAL record. Huh? - return Err(format!("no page image or WAL record for requested page"))?; + panic!("no page image or WAL record for requested page"); } // FIXME: assumes little-endian. 
Only used for the debugging log though

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 0130cbd2f2..b4d3fddfc0 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -234,6 +234,7 @@ pub fn thread_main(conf: PageServerConf) {
         loop {
             let (socket, peer_addr) = listener.accept().await.unwrap();
             debug!("accepted connection from {}", peer_addr);
+            socket.set_nodelay(true).unwrap();
 
             let mut conn_handler = Connection::new(conf.clone(), socket);
             task::spawn(async move {
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 227cc6924f..2a6cdb99b9 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -197,7 +197,7 @@ impl WalRedoProcess {
                 if res.unwrap() == 0 {
                     break;
                 }
-                debug!("wal-redo-postgres: {}", line.trim());
+                error!("wal-redo-postgres: {}", line.trim());
                 line.clear();
             }
             Ok::<(), Error>(())

From d9bc2109bb1ae4d973ddd86487a03cad5bc652af Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Wed, 14 Apr 2021 14:33:55 +0300
Subject: [PATCH 04/21] Reduce size of shared buffers for wal-redo-postgres

---
 integration_tests/tests/control_plane/mod.rs | 3 ++-
 pageserver/src/page_cache.rs | 2 +-
 pageserver/src/page_service.rs | 6 +++---
 pageserver/src/walredo.rs | 11 ++++++++---
 vendor/postgres | 2 +-
 5 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/integration_tests/tests/control_plane/mod.rs b/integration_tests/tests/control_plane/mod.rs
index 965aa614f6..00ba0c52f1 100644
--- a/integration_tests/tests/control_plane/mod.rs
+++ b/integration_tests/tests/control_plane/mod.rs
@@ -635,6 +635,7 @@ pub fn regress_check(pg: &PostgresNode) {
     let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress");
     fs::create_dir_all(regress_run_path.clone()).unwrap();
+    fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
     std::env::set_current_dir(regress_run_path).unwrap();
 
     let regress_build_path =
@@ -650,7 +651,7 @@ pub fn regress_check(pg: &PostgresNode) {
             format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
             format!(
                 "--schedule={}",
-                regress_src_path.join("parallel_schedule").to_str().unwrap()
+                regress_src_path.join("serial_schedule").to_str().unwrap()
             )
             .as_str(),
             format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index bc8744d30c..8c3a9eecf7 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -138,7 +138,7 @@ fn open_rocksdb(conf: &PageServerConf, sys_id: u64) -> DB {
     let path = conf.data_dir.join(sys_id.to_string());
     let mut opts = Options::default();
     opts.create_if_missing(true);
-    opts.set_use_fsync(true);
+    //opts.set_use_fsync(true);
     opts.set_compression_type(DBCompressionType::Lz4);
     DB::open(&opts, &path).unwrap()
 }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index b4d3fddfc0..b5267688a1 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -508,11 +508,11 @@ impl Connection {
 
         loop {
             let message = self.read_message().await?;
-
+/*
             if let Some(m) = &message {
-                info!("query({}): {:?}", sysid, m);
+                trace!("query({}): {:?}", sysid, m);
             };
-
+*/
             if message.is_none() {
                 // connection was closed
                 return Ok(());
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 2a6cdb99b9..9fb676d109 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -18,6 +18,8 @@ use log::*;
 use std::assert;
 use std::cell::RefCell;
 use std::fs;
+use std::fs::OpenOptions;
+use std::io::prelude::*;
 use std::io::Error;
 use std::sync::Arc;
 use std::time::Duration;
@@ -71,7 +73,7 @@ pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) {
     // After that, kill it and start a new one. This is mostly to avoid
     // using up all shared buffers in Postgres's shared buffer cache; we don't
     // want to write any pages to disk in the WAL redo process.
-    for _i in 1..100 {
+    for _i in 1..100000 {
         let request = walredo_channel_receiver.recv().unwrap();
 
         let result = handle_apply_request(&pcache, &process, &runtime, request);
@@ -162,8 +164,11 @@ impl WalRedoProcess {
             .expect("failed to execute initdb");
 
         if !initdb.status.success() {
            panic!("initdb failed: {}\nstderr:\n{}",
                   std::str::from_utf8(&initdb.stdout).unwrap(),
                   std::str::from_utf8(&initdb.stderr).unwrap());
-        }
-
+        } else {
+            // Limit shared cache for wal-redo-postgres
+            let mut config = OpenOptions::new().append(true).open(datadir.join("postgresql.conf"))?;
+            config.write(b"shared_buffers=128kB\n")?;
+        }
         // Start postgres itself
         let mut child = Command::new("postgres")
             .arg("--wal-redo")
diff --git a/vendor/postgres b/vendor/postgres
index b1f5a5ec14..90d4144e38 160000
--- a/vendor/postgres
+++ b/vendor/postgres
@@ -1 +1 @@
-Subproject commit b1f5a5ec145d5d9614eec4824074edae1162e5fa
+Subproject commit 90d4144e386302bae43eae9f332bad42dcb1c631

From 24b925d528df65656193e4c63c51909af1b0117e Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Thu, 15 Apr 2021 15:50:47 +0300
Subject: [PATCH 05/21] Support truncate WAL record

---
 integration_tests/tests/control_plane/mod.rs | 2 +
 pageserver/src/bin/pageserver.rs | 5 +-
 pageserver/src/page_cache.rs | 71 ++++++++++++++++----
 pageserver/src/page_service.rs | 24 +++----
 pageserver/src/waldecoder.rs | 67 +++++++++++++++---
 pageserver/src/walreceiver.rs | 26 ++++++-
 pageserver/src/walredo.rs | 24 ++++---
 vendor/postgres | 2 +-
 8 files changed, 169 insertions(+), 52 deletions(-)

diff --git a/integration_tests/tests/control_plane/mod.rs b/integration_tests/tests/control_plane/mod.rs
index 00ba0c52f1..844c9839cc 100644
--- a/integration_tests/tests/control_plane/mod.rs
+++ b/integration_tests/tests/control_plane/mod.rs
@@ -338,6 +338,7 @@ impl ComputeControlPlane<'_> {
          shared_buffers = 1MB\n\
          max_connections = 100\n\
          wal_level = replica\n\
+         max_parallel_workers = 0\n\
          wal_sender_timeout = 0\n\
          listen_addresses = '{address}'\n\
          port = {port}\n\
          ",
@@ -396,6 +397,7 @@ impl ComputeControlPlane<'_> {
          shared_buffers = 1MB\n\
          max_connections = 100\n\
          wal_level = replica\n\
+         max_parallel_workers = 0\n\
          listen_addresses = '{address}'\n\
          port = {port}\n\
          computenode_mode = true\n\
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index b1ff34afc4..bb920d46de 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -222,10 +222,11 @@ fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard {
         tui::init_logging()
     } else if conf.daemonize {
         let log = conf.data_dir.join("pageserver.log");
-        let log_file = OpenOptions::new()
+        let log_file = OpenOptions::new()
             .create(true)
             .append(true)
-            .open(log).unwrap_or_else(|_| panic!("Could not create log file"));
+            .open(log)
+            .unwrap_or_else(|_| panic!("Could not create log file"));
         let decorator = slog_term::PlainSyncDecorator::new(log_file);
         let drain = slog_term::CompactFormat::new(decorator).build();
         let drain = slog::Filter::new(drain, |record: &slog::Record| {
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index 8c3a9eecf7..5bd24e4e22 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -368,13 +368,14 @@ impl PageCache {
                 shared = wait_result.0;
 
                 if wait_result.1.timed_out() {
-                    error!(
-                        "Timed out while waiting for WAL record at LSN {} to arrive",
+                    error!(
+                        "Timed out while waiting for WAL record at LSN {} to arrive",
                         lsn
-                    );
+                    );
                     return Err(format!(
                         "Timed out while waiting for WAL record at LSN {:X}/{:X} to arrive",
-                        lsn >> 32, lsn & 0xffff_ffff
+                        lsn >> 32,
+                        lsn & 0xffff_ffff
                     ))?;
                 }
             }
@@ -383,8 +384,11 @@ impl PageCache {
             }
 
             if lsn < shared.first_valid_lsn {
-                return Err(format!("LSN {:X}/{:X} has already been removed",
-                    lsn >> 32, lsn & 0xffff_ffff))?;
+                return Err(format!(
+                    "LSN {:X}/{:X} has already been removed",
+                    lsn >> 32,
+                    lsn & 0xffff_ffff
+                ))?;
             }
         }
         let mut buf = BytesMut::new();
@@ -554,6 +558,41 @@ impl PageCache {
         self.num_wal_records.fetch_add(1, Ordering::Relaxed);
     }
 
+    //
+    // Adds a relation-wide WAL record (like truncate) to the page cache,
+    // associating it with all pages starting at the specified block number
+    //
+    pub fn put_rel_wal_record(&self, tag: BufferTag, rec: WALRecord) {
+        let mut key = CacheKey { tag, lsn: rec.lsn };
+        let old_rel_size = self.relsize_get(&tag.rel);
+        let content = CacheEntryContent {
+            page_image: None,
+            wal_record: Some(rec),
+            apply_pending: false,
+        };
+        // set new relation size
+        self.shared
+            .lock()
+            .unwrap()
+            .relsize_cache
+            .insert(tag.rel, tag.blknum);
+
+        let mut key_buf = BytesMut::new();
+        let mut val_buf = BytesMut::new();
+        content.pack(&mut val_buf);
+
+        for blknum in tag.blknum..old_rel_size {
+            key_buf.clear();
+            key.tag.blknum = blknum;
+            key.pack(&mut key_buf);
+            trace!("put_rel_wal_record lsn: {}", key.lsn);
+            let _res = self.db.put(&key_buf[..], &val_buf[..]);
+        }
+        let n = (old_rel_size - tag.blknum) as u64;
+        self.num_entries.fetch_add(n, Ordering::Relaxed);
+        self.num_wal_records.fetch_add(n, Ordering::Relaxed);
+    }
+
     //
     // Memorize a full image of a page version
     //
@@ -584,14 +623,18 @@ impl PageCache {
 
         // Can't move backwards.
         //assert!(lsn >= shared.last_valid_lsn);
-        if lsn > shared.last_valid_lsn {
-            shared.last_valid_lsn = lsn;
-            self.valid_lsn_condvar.notify_all();
+        if lsn > shared.last_valid_lsn {
+            shared.last_valid_lsn = lsn;
+            self.valid_lsn_condvar.notify_all();
 
-            self.last_valid_lsn.store(lsn, Ordering::Relaxed);
-        } else {
-            trace!("lsn={}, shared.last_valid_lsn={}", lsn, shared.last_valid_lsn);
-        }
+            self.last_valid_lsn.store(lsn, Ordering::Relaxed);
+        } else {
+            trace!(
+                "lsn={}, shared.last_valid_lsn={}",
+                lsn,
+                shared.last_valid_lsn
+            );
+        }
     }
 
     //
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index b5267688a1..b23c65e44a 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -234,7 +234,7 @@ pub fn thread_main(conf: PageServerConf) {
     loop {
         let (socket, peer_addr) = listener.accept().await.unwrap();
         debug!("accepted connection from {}", peer_addr);
-        socket.set_nodelay(true).unwrap();
+        socket.set_nodelay(true).unwrap();
 
         let mut conn_handler = Connection::new(conf.clone(), socket);
         task::spawn(async move {
@@ -318,7 +318,7 @@ impl Connection {
             .await?;
 
         self.stream.write_i16(1).await?;
-        self.stream.write_buf(&mut b).await?;
+        self.stream.write_all(&mut b).await?;
         self.stream.write_i32(0).await?; /* table oid */
         self.stream.write_i16(0).await?; /* attnum */
         self.stream.write_i32(25).await?; /* TEXTOID */
@@ -337,7 +337,7 @@ impl Connection {
 
         self.stream.write_i16(1).await?;
         self.stream.write_i32(b.len() as i32).await?;
-        self.stream.write_buf(&mut b).await?;
+        self.stream.write_all(&mut b).await?;
     }
 
     BeMessage::ControlFile => {
@@ -349,7 +349,7 @@ impl Connection {
 
         self.stream.write_i16(1).await?;
         self.stream.write_i32(b.len() as i32).await?;
-        self.stream.write_buf(&mut b).await?;
+        self.stream.write_all(&mut b).await?;
     }
 
     BeMessage::CommandComplete => {
@@ -357,7 +357,7 @@ impl Connection {
 
         self.stream.write_u8(b'C').await?;
         self.stream.write_i32(4 + b.len() as i32).await?;
-        self.stream.write_buf(&mut b).await?;
+        self.stream.write_all(&mut b).await?;
     }
 
     BeMessage::ZenithStatusResponse(resp) => {
@@ -384,7 +384,7 @@ impl Connection {
         self.stream.write_u8(102).await?; /* tag from pagestore_client.h */
         self.stream.write_u8(resp.ok as u8).await?;
         self.stream.write_u32(resp.n_blocks).await?;
-        self.stream.write_buf(&mut resp.page.clone()).await?;
+        self.stream.write_all(&mut resp.page.clone()).await?;
     }
 }
 
@@ -405,7 +405,7 @@ impl Connection {
     match m.kind {
         StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
             let mut b = Bytes::from("N");
-            self.stream.write_buf(&mut b).await?;
+            self.stream.write_all(&mut b).await?;
             self.stream.flush().await?;
         }
         StartupRequestCode::Normal => {
@@ -508,11 +508,11 @@ impl Connection {
 
         loop {
             let message = self.read_message().await?;
-/*
+        /*
             if let Some(m) = &message {
                 trace!("query({}): {:?}", sysid, m);
             };
-*/
+        */
             if message.is_none() {
                 // connection was closed
                 return Ok(());
diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs
index 1f1a5dfc99..43ce634970 100644
--- a/pageserver/src/waldecoder.rs
+++ b/pageserver/src/waldecoder.rs
@@ -287,15 +287,22 @@ pub struct DecodedBkpBlock {
 const SizeOfXLogRecord: u32 = 24;
 
 pub struct DecodedWALRecord {
-    pub lsn: u64, // LSN at the *end* of the record
+    pub lsn: u64, // LSN at the *end* of the record
+    pub xl_info: u8,
+    pub xl_rmid: u8,
     pub record: Bytes, // raw XLogRecord
     pub blocks: Vec<DecodedBkpBlock>,
 }
 
 // From pg_control.h and rmgrlist.h
-const 
XLOG_SWITCH: u8 = 0x40; -const RM_XLOG_ID: u8 = 0; +pub const XLOG_SWITCH: u8 = 0x40; +pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; +pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; + +pub const RM_XLOG_ID: u8 = 0; +pub const RM_XACT_ID: u8 = 1; +pub const RM_SMGR_ID: u8 = 2; // Is this record an XLOG_SWITCH record? They need some special processing, // so we need to check for that before the rest of the parsing. @@ -316,25 +323,61 @@ fn is_xlog_switch_record(rec: &Bytes) -> bool { return xl_info == XLOG_SWITCH && xl_rmid == RM_XLOG_ID; } +pub type Oid = u32; +pub type BlockNumber = u32; + +pub const MAIN_FORKNUM: u8 = 0; +pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; + +#[repr(C)] +#[derive(Debug)] +pub struct RelFileNode { + pub spcnode: Oid, /* tablespace */ + pub dbnode: Oid, /* database */ + pub relnode: Oid, /* relation */ +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlSmgrTruncate { + pub blkno: BlockNumber, + pub rnode: RelFileNode, + pub flags: u32, +} + +pub fn decode_truncate_record(decoded: &DecodedWALRecord) -> XlSmgrTruncate { + let mut buf = decoded.record.clone(); + buf.advance(SizeOfXLogRecord as usize); + XlSmgrTruncate { + blkno: buf.get_u32_le(), + rnode: RelFileNode { + spcnode: buf.get_u32_le(), /* tablespace */ + dbnode: buf.get_u32_le(), /* database */ + relnode: buf.get_u32_le(), /* relation */ + }, + flags: buf.get_u32_le(), + } +} + // // Routines to decode a WAL record and figure out which blocks are modified // -pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { +pub fn decode_wal_record(lsn: u64, record: Bytes) -> DecodedWALRecord { trace!( "decoding record with LSN {:08X}/{:08X} ({} bytes)", lsn >> 32, lsn & 0xffff_ffff, - rec.remaining() + record.remaining() ); - let mut buf = rec.clone(); + let mut buf = record.clone(); // FIXME: assume little-endian here let xl_tot_len = buf.get_u32_le(); let _xl_xid = buf.get_u32_le(); let _xl_prev = buf.get_u64_le(); - let _xl_info = buf.get_u8(); - let _xl_rmid = buf.get_u8(); + let xl_info = buf.get_u8(); + let xl_rmid = buf.get_u8(); buf.advance(2); // 2 bytes of padding let _xl_crc = buf.get_u32_le(); @@ -582,8 +625,10 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord { // Since we don't care about the data payloads here, we're done. 
return DecodedWALRecord { - lsn: lsn, - record: rec, - blocks: blocks, + lsn, + xl_info, + xl_rmid, + record, + blocks, }; } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index f41c9274b2..c8a5fa612c 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -13,7 +13,7 @@ use tokio_stream::StreamExt; use crate::page_cache; use crate::page_cache::{BufferTag, RelTag}; -use crate::waldecoder::WalStreamDecoder; +use crate::waldecoder::*; use crate::PageServerConf; use postgres_protocol::message::backend::ReplicationMessage; @@ -158,7 +158,29 @@ async fn walreceiver_main( pcache.put_wal_record(tag, rec); } - + // include truncate wal record in all pages + if decoded.xl_rmid == RM_SMGR_ID + && (decoded.xl_info & XLR_RMGR_INFO_MASK) == XLOG_SMGR_TRUNCATE + { + let truncate = decode_truncate_record(&decoded); + if (truncate.flags & SMGR_TRUNCATE_HEAP) != 0 { + let tag = BufferTag { + rel: RelTag { + spcnode: truncate.rnode.spcnode, + dbnode: truncate.rnode.dbnode, + relnode: truncate.rnode.relnode, + forknum: MAIN_FORKNUM, + }, + blknum: truncate.blkno, + }; + let rec = page_cache::WALRecord { + lsn: lsn, + will_init: false, + rec: recdata.clone(), + }; + pcache.put_rel_wal_record(tag, rec); + } + } // Now that this record has been handled, let the page cache know that // it is up-to-date to this LSN pcache.advance_last_valid_lsn(lsn); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9fb676d109..5a1489757d 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -67,7 +67,7 @@ pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) { let _guard = runtime.enter(); process = WalRedoProcess::launch(&datadir, &runtime).unwrap(); } - info!("WAL redo postgres started"); + info!("WAL redo postgres started"); // Pretty arbitrarily, reuse the same Postgres process for 100 requests. // After that, kill it and start a new one. This is mostly to avoid @@ -79,7 +79,7 @@ pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) { let result = handle_apply_request(&pcache, &process, &runtime, request); if result.is_err() { // On error, kill the process. 
-            error!("Kill wal redo process on error");
+        error!("Kill wal redo process on error");
             break;
         }
     }
 
@@ -102,7 +102,7 @@ fn handle_apply_request(
     let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
 
     let mut entry = entry_rc.content.lock().unwrap();
-    assert!(entry.apply_pending);
+    assert!(entry.apply_pending);
     entry.apply_pending = false;
 
     let nrecords = records.len();
@@ -161,14 +161,18 @@ impl WalRedoProcess {
         .expect("failed to execute initdb");
 
         if !initdb.status.success() {
-            panic!("initdb failed: {}\nstderr:\n{}",
-                std::str::from_utf8(&initdb.stdout).unwrap(),
-                std::str::from_utf8(&initdb.stderr).unwrap());
+            panic!(
+                "initdb failed: {}\nstderr:\n{}",
+                std::str::from_utf8(&initdb.stdout).unwrap(),
+                std::str::from_utf8(&initdb.stderr).unwrap()
+            );
         } else {
-            // Limit shared cache for wal-redo-postgres
-            let mut config = OpenOptions::new().append(true).open(datadir.join("postgresql.conf"))?;
-            config.write(b"shared_buffers=128kB\n")?;
-        }
+            // Limit shared cache for wal-redo-postgres
+            let mut config = OpenOptions::new()
+                .append(true)
+                .open(datadir.join("postgresql.conf"))?;
+            config.write(b"shared_buffers=128kB\n")?;
+        }
         // Start postgres itself
         let mut child = Command::new("postgres")
             .arg("--wal-redo")
diff --git a/vendor/postgres b/vendor/postgres
index 90d4144e38..2d0b8458eb 160000
--- a/vendor/postgres
+++ b/vendor/postgres
@@ -1 +1 @@
-Subproject commit 90d4144e386302bae43eae9f332bad42dcb1c631
+Subproject commit 2d0b8458eb53fc45547eb889ec61bee0a9f078dc

From b67df00bffe04edb7e37e175fed14553733b9516 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Thu, 15 Apr 2021 17:09:30 +0300
Subject: [PATCH 06/21] Fix bug in decoding of truncate record

---
 pageserver/src/waldecoder.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs
index 43ce634970..828ad7b59f 100644
--- a/pageserver/src/waldecoder.rs
+++ b/pageserver/src/waldecoder.rs
@@ -347,7 +347,7 @@ pub struct XlSmgrTruncate {
 
 pub fn decode_truncate_record(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
     let mut buf = decoded.record.clone();
-    buf.advance(SizeOfXLogRecord as usize);
+    buf.advance((SizeOfXLogRecord+2) as usize);
     XlSmgrTruncate {
         blkno: buf.get_u32_le(),
         rnode: RelFileNode {

From 8b70ea4d79f6b4b7a7577ccfe9bfbb131e3d691c Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Fri, 16 Apr 2021 22:54:11 +0300
Subject: [PATCH 07/21] Undo debug settings for compute node

---
 integration_tests/tests/control_plane/mod.rs | 4 +---
 vendor/postgres | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/integration_tests/tests/control_plane/mod.rs b/integration_tests/tests/control_plane/mod.rs
index 844c9839cc..cf0dadf2f3 100644
--- a/integration_tests/tests/control_plane/mod.rs
+++ b/integration_tests/tests/control_plane/mod.rs
@@ -338,7 +338,6 @@ impl ComputeControlPlane<'_> {
          shared_buffers = 1MB\n\
          max_connections = 100\n\
          wal_level = replica\n\
-         max_parallel_workers = 0\n\
          wal_sender_timeout = 0\n\
          listen_addresses = '{address}'\n\
          port = {port}\n\
@@ -397,7 +396,6 @@ impl ComputeControlPlane<'_> {
          shared_buffers = 1MB\n\
          max_connections = 100\n\
          wal_level = replica\n\
-         max_parallel_workers = 0\n\
          listen_addresses = '{address}'\n\
          port = {port}\n\
          computenode_mode = true\n\
@@ -653,7 +651,7 @@ pub fn regress_check(pg: &PostgresNode) {
             format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
             format!(
                 "--schedule={}",
-                regress_src_path.join("serial_schedule").to_str().unwrap()
+
regress_src_path.join("parallel_schedule").to_str().unwrap() ) .as_str(), format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(), diff --git a/vendor/postgres b/vendor/postgres index 2d0b8458eb..5717bc00cb 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 2d0b8458eb53fc45547eb889ec61bee0a9f078dc +Subproject commit 5717bc00cbb95bc07f6f436fd747b74ac61179b6 From 33ee5b6ba017d3aabe0e540e07e3bb1eaefa7e39 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 17 Apr 2021 15:54:57 +0300 Subject: [PATCH 08/21] Skip truncate records when calculating relation size --- pageserver/src/page_cache.rs | 47 +++++++++++++++++++++++++---------- pageserver/src/waldecoder.rs | 2 +- pageserver/src/walreceiver.rs | 2 ++ vendor/postgres | 2 +- 4 files changed, 38 insertions(+), 15 deletions(-) diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 5bd24e4e22..3e463e5a09 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -310,6 +310,7 @@ impl BufferTag { pub struct WALRecord { pub lsn: u64, // LSN at the *end* of the record pub will_init: bool, + pub truncate: bool, pub rec: Bytes, } @@ -317,17 +318,20 @@ impl WALRecord { pub fn pack(&self, buf: &mut BytesMut) { buf.put_u64(self.lsn); buf.put_u8(self.will_init as u8); + buf.put_u8(self.truncate as u8); buf.put_u32(self.rec.len() as u32); buf.put_slice(&self.rec[..]); } pub fn unpack(buf: &mut BytesMut) -> WALRecord { let lsn = buf.get_u64(); let will_init = buf.get_u8() != 0; + let truncate = buf.get_u8() != 0; let mut dst = vec![0u8; buf.get_u32() as usize]; buf.copy_to_slice(&mut dst); WALRecord { lsn, will_init, + truncate, rec: Bytes::from(dst), } } @@ -576,7 +580,7 @@ impl PageCache { .unwrap() .relsize_cache .insert(tag.rel, tag.blknum); - + info!("Truncate relation {:?}", tag); let mut key_buf = BytesMut::new(); let mut val_buf = BytesMut::new(); content.pack(&mut val_buf); @@ -711,7 +715,7 @@ impl PageCache { if let Some(relsize) = shared.relsize_cache.get(rel) { return *relsize; } - let key = CacheKey { + let mut key = CacheKey { tag: BufferTag { rel: *rel, blknum: u32::MAX, @@ -719,19 +723,36 @@ impl PageCache { lsn: u64::MAX, }; let mut buf = BytesMut::new(); - key.pack(&mut buf); - let mut iter = self - .db - .iterator(IteratorMode::From(&buf[..], Direction::Reverse)); - if let Some((k, _v)) = iter.next() { + + loop { buf.clear(); - buf.extend_from_slice(&k); - let tag = BufferTag::unpack(&mut buf); - if tag.rel == *rel { - let relsize = tag.blknum + 1; - shared.relsize_cache.insert(*rel, relsize); - return relsize; + key.pack(&mut buf); + let mut iter = self + .db + .iterator(IteratorMode::From(&buf[..], Direction::Reverse)); + if let Some((k, v)) = iter.next() { + buf.clear(); + buf.extend_from_slice(&k); + let tag = BufferTag::unpack(&mut buf); + if tag.rel == *rel { + buf.clear(); + buf.extend_from_slice(&v); + let content = CacheEntryContent::unpack(&mut buf); + if let Some(rec) = &content.wal_record { + if rec.truncate { + if tag.blknum > 0 { + key.tag.blknum = tag.blknum - 1; + continue; + } + break; + } + } + let relsize = tag.blknum + 1; + shared.relsize_cache.insert(*rel, relsize); + return relsize; + } } + break; } return 0; } diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index 828ad7b59f..fa79403563 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -347,7 +347,7 @@ pub struct XlSmgrTruncate { pub fn decode_truncate_record(decoded: &DecodedWALRecord) -> XlSmgrTruncate { let mut 
buf = decoded.record.clone(); - buf.advance((SizeOfXLogRecord+2) as usize); + buf.advance((SizeOfXLogRecord + 2) as usize); XlSmgrTruncate { blkno: buf.get_u32_le(), rnode: RelFileNode { diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index c8a5fa612c..129a80dd32 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -153,6 +153,7 @@ async fn walreceiver_main( let rec = page_cache::WALRecord { lsn: lsn, will_init: blk.will_init || blk.apply_image, + truncate: false, rec: recdata.clone(), }; @@ -176,6 +177,7 @@ async fn walreceiver_main( let rec = page_cache::WALRecord { lsn: lsn, will_init: false, + truncate: true, rec: recdata.clone(), }; pcache.put_rel_wal_record(tag, rec); diff --git a/vendor/postgres b/vendor/postgres index 5717bc00cb..32cc0a1c3a 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 5717bc00cbb95bc07f6f436fd747b74ac61179b6 +Subproject commit 32cc0a1c3a5406c13403b170548f7cd9f0b053bf From 1e65848551ac9daea0f479a675f98460d82aaf31 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 18 Apr 2021 13:13:13 +0300 Subject: [PATCH 09/21] Add pgbench test --- integration_tests/tests/control_plane/mod.rs | 102 ++++++++++++------- integration_tests/tests/test_pageserver.rs | 16 ++- pageserver/src/page_cache.rs | 2 +- pageserver/src/walredo.rs | 1 + vendor/postgres | 2 +- 5 files changed, 82 insertions(+), 41 deletions(-) diff --git a/integration_tests/tests/control_plane/mod.rs b/integration_tests/tests/control_plane/mod.rs index cf0dadf2f3..7ad9bbd320 100644 --- a/integration_tests/tests/control_plane/mod.rs +++ b/integration_tests/tests/control_plane/mod.rs @@ -336,6 +336,7 @@ impl ComputeControlPlane<'_> { max_replication_slots = 10\n\ hot_standby = on\n\ shared_buffers = 1MB\n\ + fsync = off\n\ max_connections = 100\n\ wal_level = replica\n\ wal_sender_timeout = 0\n\ @@ -394,6 +395,7 @@ impl ComputeControlPlane<'_> { max_replication_slots = 10\n\ hot_standby = on\n\ shared_buffers = 1MB\n\ + fsync = off\n\ max_connections = 100\n\ wal_level = replica\n\ listen_addresses = '{address}'\n\ @@ -615,9 +617,68 @@ impl PostgresNode { } } - // TODO - pub fn pg_bench() {} - pub fn pg_regress() {} + pub fn pg_regress(&self) { + self.safe_psql("postgres", "CREATE DATABASE regression"); + + let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress"); + fs::create_dir_all(regress_run_path.clone()).unwrap(); + fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap(); + std::env::set_current_dir(regress_run_path).unwrap(); + + let regress_build_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress"); + let regress_src_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress"); + + let _regress_check = Command::new(regress_build_path.join("pg_regress")) + .args(&[ + "--bindir=''", + "--use-existing", + format!("--bindir={}", PG_BIN_DIR.to_str().unwrap()).as_str(), + format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(), + format!( + "--schedule={}", + regress_src_path.join("parallel_schedule").to_str().unwrap() + ) + .as_str(), + format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(), + ]) + .env_clear() + .env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap()) + .env("PGPORT", self.port.to_string()) + .env("PGUSER", self.whoami()) + .env("PGHOST", self.ip.to_string()) + .status() + .expect("pg_regress failed"); + } + + pub fn pg_bench(&self, clients: u32, seconds: u32) 
{ + let port = self.port.to_string(); + let clients = clients.to_string(); + let seconds = seconds.to_string(); + let _pg_bench_init = Command::new(PG_BIN_DIR.join("pgbench")) + .args(&["-i", "-p", port.as_str(), "postgres"]) + .env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap()) + .status() + .expect("pgbench -i"); + let _pg_bench_run = Command::new(PG_BIN_DIR.join("pgbench")) + .args(&[ + "-p", + port.as_str(), + "-T", + seconds.as_str(), + "-P", + "1", + "-c", + clients.as_str(), + "-M", + "prepared", + "postgres", + ]) + .env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap()) + .status() + .expect("pgbench run"); + } } impl Drop for PostgresNode { @@ -629,38 +690,3 @@ impl Drop for PostgresNode { // fs::remove_dir_all(self.pgdata.clone()).unwrap(); } } - -pub fn regress_check(pg: &PostgresNode) { - pg.safe_psql("postgres", "CREATE DATABASE regression"); - - let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress"); - fs::create_dir_all(regress_run_path.clone()).unwrap(); - fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap(); - std::env::set_current_dir(regress_run_path).unwrap(); - - let regress_build_path = - Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress"); - let regress_src_path = - Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress"); - - let _regress_check = Command::new(regress_build_path.join("pg_regress")) - .args(&[ - "--bindir=''", - "--use-existing", - format!("--bindir={}", PG_BIN_DIR.to_str().unwrap()).as_str(), - format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(), - format!( - "--schedule={}", - regress_src_path.join("parallel_schedule").to_str().unwrap() - ) - .as_str(), - format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(), - ]) - .env_clear() - .env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap()) - .env("PGPORT", pg.port.to_string()) - .env("PGUSER", pg.whoami()) - .env("PGHOST", pg.ip.to_string()) - .status() - .expect("pg_regress failed"); -} diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index 8adacb3c54..93fa1db861 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -58,7 +58,21 @@ fn test_regress() { let node = compute_cplane.new_node(); node.start(&storage_cplane); - control_plane::regress_check(&node); + node.pg_regress(); +} + +// Runs pg_bench on a compute node +#[test] +fn pgbench() { + // Start pageserver that reads WAL directly from that postgres + let storage_cplane = StorageControlPlane::one_page_server(); + let mut compute_cplane = ComputeControlPlane::local(&storage_cplane); + + // start postgres + let node = compute_cplane.new_node(); + node.start(&storage_cplane); + + node.pg_bench(10, 100); } // Run two postgres instances on one pageserver diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 3e463e5a09..536678d194 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -138,7 +138,7 @@ fn open_rocksdb(conf: &PageServerConf, sys_id: u64) -> DB { let path = conf.data_dir.join(sys_id.to_string()); let mut opts = Options::default(); opts.create_if_missing(true); - //opts.set_use_fsync(true); + opts.set_use_fsync(true); opts.set_compression_type(DBCompressionType::Lz4); DB::open(&opts, &path).unwrap() } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 5a1489757d..713efd9045 100644 --- a/pageserver/src/walredo.rs +++ 
b/pageserver/src/walredo.rs @@ -172,6 +172,7 @@ impl WalRedoProcess { .append(true) .open(datadir.join("postgresql.conf"))?; config.write(b"shared_buffers=128kB\n")?; + config.write(b"fsync=off\n")?; } // Start postgres itself let mut child = Command::new("postgres") diff --git a/vendor/postgres b/vendor/postgres index 32cc0a1c3a..167196910d 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 32cc0a1c3a5406c13403b170548f7cd9f0b053bf +Subproject commit 167196910d6f41466c82793bcf14bfe442468776 From 95160dee6d2a4a5dd8271629a8d191ce5db4a881 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 19 Apr 2021 17:00:30 +0300 Subject: [PATCH 10/21] Merge with main branch --- control_plane/src/compute.rs | 68 +++++++++++++++++++++- control_plane/src/storage.rs | 40 +------------ integration_tests/tests/test_pageserver.rs | 12 ++-- pageserver/Cargo.toml | 7 ++- pageserver/src/bin/pageserver.rs | 6 +- pageserver/src/page_cache.rs | 30 +--------- pageserver/src/restore_datadir.rs | 10 ++-- pageserver/src/walreceiver.rs | 2 +- 8 files changed, 88 insertions(+), 87 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index b39d901be7..bfe38ef528 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use std::time::Duration; use std::{collections::BTreeMap, path::PathBuf}; use std::{io::Write, net::SocketAddr}; +use std::path::Path; use lazy_static::lazy_static; use postgres::{Client, NoTls}; @@ -246,7 +247,9 @@ impl PostgresNode { max_replication_slots = 10\n\ hot_standby = on\n\ shared_buffers = 1MB\n\ + fsync = off\n\ max_connections = 100\n\ + wal_sender_timeout = 0\n\ wal_level = replica\n\ listen_addresses = '{address}'\n\ port = {port}\n", @@ -415,8 +418,69 @@ impl PostgresNode { } } - // TODO - pub fn pg_bench() {} + + pub fn pg_regress(&self) { + self.safe_psql("postgres", "CREATE DATABASE regression"); + + let regress_run_path = self.env.data_dir.join("regress"); + fs::create_dir_all(regress_run_path.clone()).unwrap(); + fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap(); + std::env::set_current_dir(regress_run_path).unwrap(); + + let regress_build_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress"); + let regress_src_path = + Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress"); + + let _regress_check = Command::new(regress_build_path.join("pg_regress")) + .args(&[ + "--bindir=''", + "--use-existing", + format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(), + format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(), + format!( + "--schedule={}", + regress_src_path.join("parallel_schedule").to_str().unwrap() + ) + .as_str(), + format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(), + ]) + .env_clear() + .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) + .env("PGPORT", self.address.port().to_string()) + .env("PGUSER", self.whoami()) + .env("PGHOST", self.address.ip().to_string()) + .status() + .expect("pg_regress failed"); + } + + pub fn pg_bench(&self, clients: u32, seconds: u32) { + let port = self.address.port().to_string(); + let clients = clients.to_string(); + let seconds = seconds.to_string(); + let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench")) + .args(&["-i", "-p", port.as_str(), "postgres"]) + .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) + .status() + .expect("pgbench 
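The wal-redo instance's postgresql.conf is tuned by plain append (shared_buffers=128kB earlier in the series, fsync=off as of [PATCH 09/21] above). A standalone sketch of the pattern; tune_redo_instance and the path are illustrative names, and the sketch deviates in one place: std's Write::write may perform a short write and its Ok(n) result is discarded in the patch, so write_all is used here instead:

    use std::fs::OpenOptions;
    use std::io::Write;
    use std::path::Path;

    fn tune_redo_instance(datadir: &Path) -> std::io::Result<()> {
        let mut config = OpenOptions::new()
            .append(true)
            .open(datadir.join("postgresql.conf"))?;
        config.write_all(b"shared_buffers=128kB\n")?; // keep the redo instance tiny
        config.write_all(b"fsync=off\n")?; // redo output is disposable, skip fsync
        Ok(())
    }

    fn main() -> std::io::Result<()> {
        tune_redo_instance(Path::new("/tmp/wal-redo-datadir")) // illustrative path
    }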
-i"); + let _pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench")) + .args(&[ + "-p", + port.as_str(), + "-T", + seconds.as_str(), + "-P", + "1", + "-c", + clients.as_str(), + "-M", + "prepared", + "postgres", + ]) + .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) + .status() + .expect("pgbench run"); + } } impl Drop for PostgresNode { diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index eba2966849..3175998f9e 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -12,7 +12,6 @@ use std::time::Duration; use postgres::{Client, NoTls}; -use crate::compute::PostgresNode; use crate::local_env::{self, LocalEnv}; type Result = std::result::Result>; @@ -104,6 +103,9 @@ impl TestStorageControlPlane { } pub fn stop(&self) { + for wa in self.wal_acceptors.iter() { + let _unused = wa.stop(); + } self.test_done.store(true, Ordering::Relaxed); } @@ -350,42 +352,6 @@ impl Drop for WalProposerNode { } } -/////////////////////////////////////////////////////////////////////////////// - -pub fn regress_check(pg: &PostgresNode) { - pg.safe_psql("postgres", "CREATE DATABASE regression"); - - let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress"); - fs::create_dir_all(regress_run_path.clone()).unwrap(); - std::env::set_current_dir(regress_run_path).unwrap(); - - let regress_build_path = - Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress"); - let regress_src_path = - Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress"); - - let _regress_check = Command::new(regress_build_path.join("pg_regress")) - .args(&[ - "--bindir=''", - "--use-existing", - format!("--bindir={}", pg.env.pg_bin_dir().to_str().unwrap()).as_str(), - format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(), - format!( - "--schedule={}", - regress_src_path.join("parallel_schedule").to_str().unwrap() - ) - .as_str(), - format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(), - ]) - .env_clear() - .env("LD_LIBRARY_PATH", pg.env.pg_lib_dir().to_str().unwrap()) - .env("PGHOST", pg.address.ip().to_string()) - .env("PGPORT", pg.address.port().to_string()) - .env("PGUSER", pg.whoami()) - .status() - .expect("pg_regress failed"); -} - /// Read a PID file /// /// This should contain an unsigned integer, but we return it as a String diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index a7e389455e..bfb9b71d0f 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -60,7 +60,6 @@ fn test_regress() { let node = compute_cplane.new_test_node(); node.start().unwrap(); -<<<<<<< HEAD node.pg_regress(); } @@ -68,17 +67,14 @@ fn test_regress() { #[test] fn pgbench() { // Start pageserver that reads WAL directly from that postgres - let storage_cplane = StorageControlPlane::one_page_server(); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane); + let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); + let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); // start postgres - let node = compute_cplane.new_node(); - node.start(&storage_cplane); + let node = compute_cplane.new_test_node(); + node.start().unwrap(); node.pg_bench(10, 100); -======= - control_plane::storage::regress_check(&node); ->>>>>>> main } // Run two postgres instances on one pageserver diff --git a/pageserver/Cargo.toml 
b/pageserver/Cargo.toml index 17a5f48d18..d5b3481073 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -29,9 +29,10 @@ daemonize = "0.4.1" rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] } tokio = { version = "1.3.0", features = ["full"] } tokio-stream = { version = "0.1.4" } -tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" } -postgres-protocol = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" } -postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" } +tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } +postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } +postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } +postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } rocksdb = { git = "https://github.com/rust-rocksdb/rust-rocksdb.git" } anyhow = "1.0" crc32c = "0.6.0" diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9e46e3d0a9..2ba51e83a2 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -224,11 +224,11 @@ fn init_logging(conf: &PageServerConf) -> Result CacheEntry { - CacheEntry { - key, - content: Mutex::new(CacheEntryContent { - page_image: None, - wal_record: None, - apply_pending: false, - } - } - } -} - impl CacheEntry { fn new(key: CacheKey, content: CacheEntryContent) -> CacheEntry { CacheEntry { @@ -404,18 +388,6 @@ impl PageCache { lsn & 0xffff_ffff ); } - - let pagecache = &shared.pagecache; - - let mut entries = pagecache.range((Included(&minkey), Included(&maxkey))); - - let entry_opt = entries.next_back(); - - if entry_opt.is_none() { - static ZERO_PAGE: [u8; 8192] = [0u8; 8192]; - return Ok(Bytes::from_static(&ZERO_PAGE)); - /* return Err("could not find page image")?; */ - } } let mut buf = BytesMut::new(); minkey.pack(&mut buf); diff --git a/pageserver/src/restore_datadir.rs b/pageserver/src/restore_datadir.rs index 985f5e3905..3b28d64585 100644 --- a/pageserver/src/restore_datadir.rs +++ b/pageserver/src/restore_datadir.rs @@ -324,10 +324,12 @@ async fn slurp_base_file( while bytes.remaining() >= 8192 { let tag = page_cache::BufferTag { - spcnode: parsed.spcnode, - dbnode: parsed.dbnode, - relnode: parsed.relnode, - forknum: parsed.forknum as u8, + rel: page_cache::RelTag { + spcnode: parsed.spcnode, + dbnode: parsed.dbnode, + relnode: parsed.relnode, + forknum: parsed.forknum as u8, + }, blknum: blknum, }; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index c11a00fc78..692d7f466d 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -8,7 +8,7 @@ use crate::page_cache; use crate::page_cache::{BufferTag, RelTag}; -use crate::waldecoder::{decode_wal_record, WalStreamDecoder}; +use crate::waldecoder::*; use crate::PageServerConf; use anyhow::Error; use log::*; From 936cad17e49a515f3196d8acaa64ba1a1ef98437 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 20 Apr 2021 18:28:35 +0300 Subject: [PATCH 11/21] LSN-aware smgrnblock/smgrexists implementations --- integration_tests/tests/test_pageserver.rs | 1 - pageserver/src/page_cache.rs | 159 
++++++++------------- pageserver/src/page_service.rs | 62 +------- pageserver/src/restore_datadir.rs | 8 -- vendor/postgres | 2 +- 5 files changed, 63 insertions(+), 169 deletions(-) diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index bfb9b71d0f..8af066ae90 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -50,7 +50,6 @@ fn test_redo_cases() { // Runs pg_regress on a compute node #[test] -#[ignore] fn test_regress() { // Start pageserver that reads WAL directly from that postgres let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 46474921a7..336eba1f55 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -79,13 +79,6 @@ impl AddAssign for PageCacheStats { // Shared data structure, holding page cache and related auxiliary information // struct PageCacheShared { - // Relation n_blocks cache - // - // This hashtable should be updated together with the pagecache. Now it is - // accessed unreasonably often through the smgr_nblocks(). It is better to just - // cache it in postgres smgr and ask only on restart. - relsize_cache: HashMap, - // What page versions do we hold in the cache? If we get GetPage with // LSN < first_valid_lsn, that's an error because we (no longer) hold that // page version. If we get a request > last_valid_lsn, we need to wait until @@ -148,7 +141,6 @@ fn init_page_cache(conf: &PageServerConf, sys_id: u64) -> PageCache { PageCache { db: open_rocksdb(&conf, sys_id), shared: Mutex::new(PageCacheShared { - relsize_cache: HashMap::new(), first_valid_lsn: 0, last_valid_lsn: 0, last_record_lsn: 0, @@ -338,6 +330,46 @@ impl WALRecord { // Public interface functions impl PageCache { + fn wait_lsn(&self, lsn: u64) -> anyhow::Result<()> { + let mut shared = self.shared.lock().unwrap(); + let mut waited = false; + + while lsn > shared.last_valid_lsn { + // TODO: Wait for the WAL receiver to catch up + waited = true; + trace!( + "not caught up yet: {}, requested {}", + shared.last_valid_lsn, + lsn + ); + let wait_result = self + .valid_lsn_condvar + .wait_timeout(shared, TIMEOUT) + .unwrap(); + + shared = wait_result.0; + if wait_result.1.timed_out() { + bail!( + "Timed out while waiting for WAL record at LSN {:X}/{:X} to arrive", + lsn >> 32, + lsn & 0xffff_ffff + ); + } + } + if waited { + trace!("caught up now, continuing"); + } + + if lsn < shared.first_valid_lsn { + bail!( + "LSN {:X}/{:X} has already been removed", + lsn >> 32, + lsn & 0xffff_ffff + ); + } + Ok(()) + } + // // GetPage@LSN // @@ -346,49 +378,12 @@ impl PageCache { pub fn get_page_at_lsn(&self, tag: BufferTag, lsn: u64) -> anyhow::Result { self.num_getpage_requests.fetch_add(1, Ordering::Relaxed); + self.wait_lsn(lsn)?; + // Look up cache entry. If it's a page image, return that. If it's a WAL record, // ask the WAL redo service to reconstruct the page image from the WAL records. 
let minkey = CacheKey { tag, lsn: 0 }; let maxkey = CacheKey { tag, lsn }; - - { - let mut shared = self.shared.lock().unwrap(); - let mut waited = false; - - while lsn > shared.last_valid_lsn { - // TODO: Wait for the WAL receiver to catch up - waited = true; - trace!( - "not caught up yet: {}, requested {}", - shared.last_valid_lsn, - lsn - ); - let wait_result = self - .valid_lsn_condvar - .wait_timeout(shared, TIMEOUT) - .unwrap(); - - shared = wait_result.0; - if wait_result.1.timed_out() { - bail!( - "Timed out while waiting for WAL record at LSN {:X}/{:X} to arrive", - lsn >> 32, - lsn & 0xffff_ffff - ); - } - } - if waited { - trace!("caught up now, continuing"); - } - - if lsn < shared.first_valid_lsn { - bail!( - "LSN {:X}/{:X} has already been removed", - lsn >> 32, - lsn & 0xffff_ffff - ); - } - } let mut buf = BytesMut::new(); minkey.pack(&mut buf); @@ -521,17 +516,6 @@ impl PageCache { return (base_img, records); } - fn update_rel_size(&self, tag: &BufferTag) { - let mut shared = self.shared.lock().unwrap(); - let rel_entry = shared - .relsize_cache - .entry(tag.rel) - .or_insert(tag.blknum + 1); - if tag.blknum >= *rel_entry { - *rel_entry = tag.blknum + 1; - } - } - // // Adds a WAL record to the page cache // @@ -544,8 +528,6 @@ impl PageCache { apply_pending: false, }; - self.update_rel_size(&tag); - let mut key_buf = BytesMut::new(); key.pack(&mut key_buf); let mut val_buf = BytesMut::new(); @@ -564,19 +546,14 @@ impl PageCache { // pub fn put_rel_wal_record(&self, tag: BufferTag, rec: WALRecord) { let mut key = CacheKey { tag, lsn: rec.lsn }; - let old_rel_size = self.relsize_get(&tag.rel); + let old_rel_size = self.relsize_get(&tag.rel, u64::MAX).unwrap(); let content = CacheEntryContent { page_image: None, wal_record: Some(rec), apply_pending: false, }; // set new relation size - self.shared - .lock() - .unwrap() - .relsize_cache - .insert(tag.rel, tag.blknum); - info!("Truncate relation {:?}", tag); + trace!("Truncate relation {:?}", tag); let mut key_buf = BytesMut::new(); let mut val_buf = BytesMut::new(); content.pack(&mut val_buf); @@ -692,31 +669,17 @@ impl PageCache { return shared.last_record_lsn; } - // FIXME: Shouldn't relation size also be tracked with an LSN? - // If a replica is lagging behind, it needs to get the size as it was on - // the replica's current replay LSN. 
- pub fn relsize_inc(&self, rel: &RelTag, to: Option) { - let mut shared = self.shared.lock().unwrap(); - let entry = shared.relsize_cache.entry(*rel).or_insert(0); + pub fn relsize_get(&self, rel: &RelTag, lsn: u64) -> anyhow::Result { + if lsn != u64::MAX { + self.wait_lsn(lsn)?; + } - if let Some(to) = to { - if to >= *entry { - *entry = to + 1; - } - } - } - - pub fn relsize_get(&self, rel: &RelTag) -> u32 { - let mut shared = self.shared.lock().unwrap(); - if let Some(relsize) = shared.relsize_cache.get(rel) { - return *relsize; - } let mut key = CacheKey { tag: BufferTag { rel: *rel, blknum: u32::MAX, }, - lsn: u64::MAX, + lsn, }; let mut buf = BytesMut::new(); @@ -744,44 +707,38 @@ impl PageCache { } } let relsize = tag.blknum + 1; - shared.relsize_cache.insert(*rel, relsize); - return relsize; + return Ok(relsize); } } break; } - return 0; + return Ok(0); } - pub fn relsize_exist(&self, rel: &RelTag) -> bool { - let mut shared = self.shared.lock().unwrap(); - let relsize_cache = &shared.relsize_cache; - if relsize_cache.contains_key(rel) { - return true; - } + pub fn relsize_exist(&self, rel: &RelTag, lsn: u64) -> anyhow::Result { + self.wait_lsn(lsn)?; let key = CacheKey { tag: BufferTag { rel: *rel, - blknum: 0, + blknum: u32::MAX, }, - lsn: 0, + lsn, }; let mut buf = BytesMut::new(); key.pack(&mut buf); let mut iter = self .db - .iterator(IteratorMode::From(&buf[..], Direction::Forward)); + .iterator(IteratorMode::From(&buf[..], Direction::Reverse)); if let Some((k, _v)) = iter.next() { buf.clear(); buf.extend_from_slice(&k); let tag = BufferTag::unpack(&mut buf); if tag.rel == *rel { - shared.relsize_cache.insert(*rel, tag.blknum + 1); - return true; + return Ok(true); } } - return false; + return Ok(false); } pub fn get_stats(&self) -> PageCacheStats { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 84e155c940..2335cc3bce 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -36,12 +36,8 @@ enum FeMessage { // All that messages are actually CopyData from libpq point of view. // ZenithExistsRequest(ZenithRequest), - ZenithTruncRequest(ZenithRequest), - ZenithUnlinkRequest(ZenithRequest), ZenithNblocksRequest(ZenithRequest), ZenithReadRequest(ZenithRequest), - ZenithCreateRequest(ZenithRequest), - ZenithExtendRequest(ZenithRequest), } #[derive(Debug)] @@ -193,12 +189,8 @@ impl FeMessage { // serialization. match smgr_tag { 0 => Ok(Some(FeMessage::ZenithExistsRequest(zreq))), - 1 => Ok(Some(FeMessage::ZenithTruncRequest(zreq))), - 2 => Ok(Some(FeMessage::ZenithUnlinkRequest(zreq))), - 3 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))), - 4 => Ok(Some(FeMessage::ZenithReadRequest(zreq))), - 5 => Ok(Some(FeMessage::ZenithCreateRequest(zreq))), - 6 => Ok(Some(FeMessage::ZenithExtendRequest(zreq))), + 1 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))), + 2 => Ok(Some(FeMessage::ZenithReadRequest(zreq))), _ => Err(io::Error::new( io::ErrorKind::InvalidInput, format!("unknown smgr message tag: {},'{:?}'", smgr_tag, buf), @@ -527,7 +519,7 @@ impl Connection { forknum: req.forknum, }; - let exist = pcache.relsize_exist(&tag); + let exist = pcache.relsize_exist(&tag, req.lsn).unwrap_or(false); self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { ok: exist, @@ -535,20 +527,6 @@ impl Connection { })) .await? } - Some(FeMessage::ZenithTruncRequest(_)) => { - self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { - ok: true, - n_blocks: 0, - })) - .await? 
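Both relsize_get() and relsize_exist() now answer "as of LSN x" by seeking backwards from a synthetic maximum key. That only works because CacheKey packs its fields big-endian, so RocksDB's lexicographic byte order coincides with (page, LSN) order. A reduced sketch of that invariant; pack_key collapses the four-field RelTag into a single u32 for brevity:

    use bytes::{BufMut, BytesMut};

    fn pack_key(rel: u32, blknum: u32, lsn: u64) -> Vec<u8> {
        let mut buf = BytesMut::new();
        buf.put_u32(rel); // big-endian: byte order equals numeric order
        buf.put_u32(blknum);
        buf.put_u64(lsn);
        buf.to_vec()
    }

    fn main() {
        // same page, increasing LSN => increasing key
        assert!(pack_key(1, 7, 100) < pack_key(1, 7, 200));
        // the page fields dominate the LSN
        assert!(pack_key(1, 7, u64::MAX) < pack_key(1, 8, 0));
    }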
- } - Some(FeMessage::ZenithUnlinkRequest(_)) => { - self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { - ok: true, - n_blocks: 0, - })) - .await? - } Some(FeMessage::ZenithNblocksRequest(req)) => { let tag = page_cache::RelTag { spcnode: req.spcnode, @@ -557,7 +535,7 @@ impl Connection { forknum: req.forknum, }; - let n_blocks = pcache.relsize_get(&tag); + let n_blocks = pcache.relsize_get(&tag, req.lsn).unwrap_or(0); self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse { ok: true, @@ -595,38 +573,6 @@ impl Connection { self.write_message(&msg).await? } - Some(FeMessage::ZenithCreateRequest(req)) => { - let tag = page_cache::RelTag { - spcnode: req.spcnode, - dbnode: req.dbnode, - relnode: req.relnode, - forknum: req.forknum, - }; - - pcache.relsize_inc(&tag, None); - - self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { - ok: true, - n_blocks: 0, - })) - .await? - } - Some(FeMessage::ZenithExtendRequest(req)) => { - let tag = page_cache::RelTag { - spcnode: req.spcnode, - dbnode: req.dbnode, - relnode: req.relnode, - forknum: req.forknum, - }; - - pcache.relsize_inc(&tag, Some(req.blkno)); - - self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { - ok: true, - n_blocks: 0, - })) - .await? - } _ => {} } } diff --git a/pageserver/src/restore_datadir.rs b/pageserver/src/restore_datadir.rs index 3b28d64585..b16276e7ab 100644 --- a/pageserver/src/restore_datadir.rs +++ b/pageserver/src/restore_datadir.rs @@ -315,13 +315,6 @@ async fn slurp_base_file( let pcache = page_cache::get_pagecache(conf, sys_id); - let reltag = page_cache::RelTag { - spcnode: parsed.spcnode, - dbnode: parsed.dbnode, - relnode: parsed.relnode, - forknum: parsed.forknum as u8, - }; - while bytes.remaining() >= 8192 { let tag = page_cache::BufferTag { rel: page_cache::RelTag { @@ -335,7 +328,6 @@ async fn slurp_base_file( pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192)); - pcache.relsize_inc(&reltag, Some(blknum)); blknum += 1; } } diff --git a/vendor/postgres b/vendor/postgres index 167196910d..9f9aa9c300 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 167196910d6f41466c82793bcf14bfe442468776 +Subproject commit 9f9aa9c300c9bbac296e2c126b3f96701d4e683d From 8604bb8750bfe848283a29de2d0cf2354bef7e85 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 20 Apr 2021 18:46:31 +0300 Subject: [PATCH 12/21] Increase timeout for running github tests --- .github/workflows/testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 450b93d85a..c5a601b7ce 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -4,7 +4,7 @@ on: [push] jobs: regression-check: - timeout-minutes: 10 + timeout-minutes: 30 name: run regression test suite runs-on: ubuntu-latest From d8fa2ec3676faccaa129511b8b2f250b65d4d770 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 21 Apr 2021 16:10:05 +0300 Subject: [PATCH 13/21] Merge with main branch --- control_plane/src/compute.rs | 5 ++- control_plane/src/local_env.rs | 9 +--- control_plane/src/storage.rs | 1 - integration_tests/tests/test_pageserver.rs | 9 ++-- pageserver/src/bin/pageserver.rs | 16 ++----- pageserver/src/lib.rs | 9 ++++ pageserver/src/page_cache.rs | 8 ++-- pageserver/src/page_service.rs | 49 +--------------------- pageserver/src/restore_local_repo.rs | 32 +++++++------- pageserver/src/walredo.rs | 3 +- postgres_ffi/Cargo.toml 
| 2 +- vendor/postgres | 2 +- 12 files changed, 47 insertions(+), 98 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index c336792b83..a3f14f858d 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -19,6 +19,7 @@ use postgres::{Client, NoTls}; use crate::local_env::LocalEnv; use crate::storage::{PageServerNode, WalProposerNode}; use pageserver::ZTimelineId; +use pageserver::zenith_repo_dir; // // ComputeControlPlane @@ -449,8 +450,8 @@ impl PostgresNode { pub fn pg_regress(&self) { self.safe_psql("postgres", "CREATE DATABASE regression"); - - let regress_run_path = self.env.data_dir.join("regress"); + let data_dir = zenith_repo_dir(); + let regress_run_path = data_dir.join("regress"); fs::create_dir_all(regress_run_path.clone()).unwrap(); fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap(); std::env::set_current_dir(regress_run_path).unwrap(); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index adf5d6164c..db71721e21 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -16,6 +16,7 @@ use anyhow::Result; use serde_derive::{Deserialize, Serialize}; use pageserver::ZTimelineId; +use pageserver::zenith_repo_dir; use walkeeper::xlog_utils; // @@ -52,14 +53,6 @@ impl LocalEnv { } } -fn zenith_repo_dir() -> PathBuf { - // Find repository path - match std::env::var_os("ZENITH_REPO_DIR") { - Some(val) => PathBuf::from(val.to_str().unwrap()), - None => ".zenith".into(), - } -} - // // Initialize a new Zenith repository // diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 914cbbf578..3674307fd3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -13,7 +13,6 @@ use std::time::Duration; use postgres::{Client, NoTls}; -use crate::compute::PostgresNode; use crate::local_env::LocalEnv; use pageserver::ZTimelineId; diff --git a/integration_tests/tests/test_pageserver.rs b/integration_tests/tests/test_pageserver.rs index 67df31ef65..a50040d358 100644 --- a/integration_tests/tests/test_pageserver.rs +++ b/integration_tests/tests/test_pageserver.rs @@ -69,12 +69,15 @@ fn test_regress() { // Runs pg_bench on a compute node #[test] fn pgbench() { + let local_env = local_env::test_env("pgbench"); + // Start pageserver that reads WAL directly from that postgres - let storage_cplane = TestStorageControlPlane::one_page_server(String::new()); - let mut compute_cplane = ComputeControlPlane::local(&storage_cplane.pageserver); + let storage_cplane = TestStorageControlPlane::one_page_server(&local_env); + let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver); // start postgres - let node = compute_cplane.new_test_node(); + let maintli = storage_cplane.get_branch_timeline("main"); + let node = compute_cplane.new_test_node(maintli); node.start().unwrap(); node.pg_bench(10, 100); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 12db5180af..340894d55b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -4,12 +4,11 @@ use log::*; use std::fs; -use std::fs::{File, OpenOptions}; +use std::fs::OpenOptions; use std::io; use std::path::PathBuf; use std::process::exit; use std::thread; -use std::fs::OpenOptions; use anyhow::{Context, Result}; use clap::{App, Arg}; @@ -18,18 +17,11 @@ use daemonize::Daemonize; use slog::Drain; use pageserver::page_service; +use pageserver::zenith_repo_dir; use pageserver::tui; //use 
pageserver::walreceiver; use pageserver::PageServerConf; -fn zenith_repo_dir() -> String { - // Find repository path - match std::env::var_os("ZENITH_REPO_DIR") { - Some(val) => String::from(val.to_str().unwrap()), - None => ".zenith".into(), - } -} - fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") @@ -140,7 +132,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> { // does this for us. let repodir = zenith_repo_dir(); std::env::set_current_dir(&repodir)?; - info!("Changed current directory to repository in {}", &repodir); + info!("Changed current directory to repository in {:?}", &repodir); } let mut threads = Vec::new(); @@ -186,7 +178,7 @@ fn init_logging(conf: &PageServerConf) -> Result PathBuf { + // Find repository path + match std::env::var_os("ZENITH_REPO_DIR") { + Some(val) => PathBuf::from(val.to_str().unwrap()), + None => ".zenith".into(), + } +} diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 9ed0a422c5..79ea7f072b 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -8,7 +8,7 @@ use crate::restore_local_repo::restore_timeline; use crate::ZTimelineId; -use crate::{walredo, PageServerConf}; +use crate::{walredo, PageServerConf, zenith_repo_dir}; use anyhow::bail; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crossbeam_channel::unbounded; @@ -150,8 +150,8 @@ pub fn get_or_restore_pagecache( } } -fn open_rocksdb(conf: &PageServerConf, timelineid: u64) -> DB { - let path = conf.data_dir.join(timelineid.to_string()); +fn open_rocksdb(_conf: &PageServerConf, timelineid: ZTimelineId) -> DB { + let path = zenith_repo_dir().join(timelineid.to_string()); let mut opts = Options::default(); opts.create_if_missing(true); opts.set_use_fsync(true); @@ -159,7 +159,7 @@ fn open_rocksdb(conf: &PageServerConf, timelineid: u64) -> DB { DB::open(&opts, &path).unwrap() } -fn init_page_cache(conf: &PageServerConf, timelineid: u64) -> PageCache { +fn init_page_cache(conf: &PageServerConf, timelineid: ZTimelineId) -> PageCache { // Initialize the channel between the page cache and the WAL applicator let (s, r) = unbounded(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1ab7ee4eb4..239b89e306 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -428,12 +428,8 @@ pub fn thread_main(conf: &PageServerConf) { loop { let (socket, peer_addr) = listener.accept().await.unwrap(); debug!("accepted connection from {}", peer_addr); -<<<<<<< HEAD socket.set_nodelay(true).unwrap(); - let mut conn_handler = Connection::new(conf.clone(), socket); -======= let mut conn_handler = Connection::new(conf.clone(), socket, &runtime_ref); ->>>>>>> main task::spawn(async move { if let Err(err) = conn_handler.run().await { @@ -788,19 +784,11 @@ impl Connection { loop { let message = self.read_message().await?; -<<<<<<< HEAD - /* - if let Some(m) = &message { - trace!("query({}): {:?}", sysid, m); - }; - */ -======= if let Some(m) = &message { - info!("query({:?}): {:?}", timelineid, m); + trace!("query({:?}): {:?}", timelineid, m); }; ->>>>>>> main if message.is_none() { // connection was closed return Ok(()); @@ -869,41 +857,6 @@ impl Connection { self.write_message(&msg).await? 
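zenith_repo_dir() now lives in the pageserver crate root so the binary and control_plane share one definition. A usage sketch; note that PathBuf::from also accepts the OsString directly, which would avoid the to_str().unwrap() panic on non-UTF-8 paths (a possible follow-up, not what the patch does):

    use std::path::PathBuf;

    fn zenith_repo_dir() -> PathBuf {
        match std::env::var_os("ZENITH_REPO_DIR") {
            Some(val) => PathBuf::from(val), // no UTF-8 round trip needed
            None => ".zenith".into(),
        }
    }

    fn main() {
        std::env::set_var("ZENITH_REPO_DIR", "/tmp/zenith-repo");
        assert_eq!(zenith_repo_dir(), PathBuf::from("/tmp/zenith-repo"));
    }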
} -<<<<<<< HEAD -======= - Some(FeMessage::ZenithCreateRequest(req)) => { - let tag = page_cache::RelTag { - spcnode: req.spcnode, - dbnode: req.dbnode, - relnode: req.relnode, - forknum: req.forknum, - }; - - pcache.relsize_inc(&tag, 0); - - self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { - ok: true, - n_blocks: 0, - })) - .await? - } - Some(FeMessage::ZenithExtendRequest(req)) => { - let tag = page_cache::RelTag { - spcnode: req.spcnode, - dbnode: req.dbnode, - relnode: req.relnode, - forknum: req.forknum, - }; - - pcache.relsize_inc(&tag, req.blkno + 1); - - self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse { - ok: true, - n_blocks: 0, - })) - .await? - } ->>>>>>> main _ => {} } } diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index 262479a556..4308fd66a9 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -27,6 +27,7 @@ use anyhow::Result; use bytes::Bytes; use crate::page_cache; +use crate::page_cache::RelTag; use crate::page_cache::BufferTag; use crate::page_cache::PageCache; use crate::waldecoder::WalStreamDecoder; @@ -202,11 +203,13 @@ fn restore_relfile( let r = file.read_exact(&mut buf); match r { Ok(_) => { - let tag = page_cache::BufferTag { - spcnode: spcoid, - dbnode: dboid, - relnode: relnode, - forknum: forknum as u8, + let tag = BufferTag { + rel: RelTag { + spcnode: spcoid, + dbnode: dboid, + relnode: relnode, + forknum: forknum as u8, + }, blknum: blknum, }; pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf)); @@ -233,14 +236,6 @@ fn restore_relfile( blknum += 1; } - let tag = page_cache::RelTag { - spcnode: spcoid, - dbnode: dboid, - relnode: relnode, - forknum: forknum as u8, - }; - pcache.relsize_inc(&tag, blknum); - Ok(()) } @@ -308,16 +303,19 @@ fn restore_wal( // so having multiple copies of it doesn't cost that much) for blk in decoded.blocks.iter() { let tag = BufferTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum as u8, + rel: RelTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum as u8, + }, blknum: blk.blkno, }; let rec = page_cache::WALRecord { lsn: lsn, will_init: blk.will_init || blk.apply_image, + truncate: false, rec: recdata.clone(), }; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 400c8c59da..f20b7935c2 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,6 +21,7 @@ use std::fs; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::Error; +use std::path::PathBuf; use std::process::Stdio; use std::sync::Arc; use std::time::Duration; @@ -171,7 +172,7 @@ impl WalRedoProcess { // Limit shared cache for wal-redo-postres let mut config = OpenOptions::new() .append(true) - .open(datadir.join("postgresql.conf"))?; + .open(PathBuf::from(&datadir).join("postgresql.conf"))?; config.write(b"shared_buffers=128kB\n")?; config.write(b"fsync=off\n")?; } diff --git a/postgres_ffi/Cargo.toml b/postgres_ffi/Cargo.toml index 77cc5cf028..9ca97154c3 100644 --- a/postgres_ffi/Cargo.toml +++ b/postgres_ffi/Cargo.toml @@ -16,4 +16,4 @@ crc32c = "0.6.0" hex = "0.4.3" [build-dependencies] -bindgen = "0.53.1" +bindgen = "0.57" diff --git a/vendor/postgres b/vendor/postgres index b898ad7e3b..9f9aa9c300 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit b898ad7e3b9acce72b64bf064257e392f979a659 +Subproject commit 
9f9aa9c300c9bbac296e2c126b3f96701d4e683d From c981f4ad667689f6157c515f9f39f7f4f4e86ff5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 21 Apr 2021 19:04:30 +0300 Subject: [PATCH 14/21] Implement garbage collection of unused versions --- Cargo.lock | 70 +++++++++++++++ pageserver/Cargo.toml | 1 + pageserver/src/bin/pageserver.rs | 21 +++++ pageserver/src/lib.rs | 3 + pageserver/src/page_cache.rs | 147 +++++++++++++++++++++++++------ pageserver/src/page_service.rs | 3 +- 6 files changed, 219 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a16bd155c9..c63d0746a0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1222,6 +1222,41 @@ dependencies = [ "winapi", ] +[[package]] +name = "num" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8536030f9fea7127f841b45bb6243b27255787fb4eb83958aa1ef9d2fdc0c36" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6b19411a9719e753aff12e5187b74d60d3dc449ec3f4dc21e3989c3f554bc95" +dependencies = [ + "autocfg", + "num-traits", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -1232,6 +1267,29 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c000134b5dbf44adc5cb772486d335293351644b801551abe8f75c84cfa4aef" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.14" @@ -1330,6 +1388,7 @@ dependencies = [ "hex", "lazy_static", "log", + "parse_duration", "postgres", "postgres-protocol", "postgres-types", @@ -1384,6 +1443,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "parse_duration" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7037e5e93e0172a5a96874380bf73bc6ecef022e26fa25f2be26864d6b3ba95d" +dependencies = [ + "lazy_static", + "num", + "regex", +] + [[package]] name = "peeking_take_while" version = "0.1.2" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 41e0a548fb..fc2c8618b6 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -41,5 +41,6 @@ walkdir = "2" thiserror = "1.0" hex = "0.4.3" tar = "0.4.33" +parse_duration = "*" postgres_ffi = { path = "../postgres_ffi" } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 340894d55b..0a966a81a6 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -9,6 +9,8 @@ use std::io; use std::path::PathBuf; use std::process::exit; use std::thread; +use std::time::Duration; +use parse_duration::parse; use anyhow::{Context, Result}; use clap::{App, Arg}; @@ -22,6 +24,8 @@ use pageserver::tui; //use pageserver::walreceiver; use pageserver::PageServerConf; +const 
DEFAULT_GC_HORIZON : u64 = 64*1024*1024; + fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") .about("Materializes WAL stream to pages and serves them to the postgres") @@ -46,11 +50,20 @@ fn main() -> Result<()> { .takes_value(false) .help("Run in the background"), ) + .arg( + Arg::with_name("gc_horizon") + .short("g") + .long("gc_horizon") + .takes_value(true) + .help("Garbage colletor horizon"), + ) .get_matches(); let mut conf = PageServerConf { daemonize: false, interactive: false, + gc_horizon: DEFAULT_GC_HORIZON, + gc_period: Duration::from_secs(10), listen_addr: "127.0.0.1:5430".parse().unwrap(), }; @@ -71,6 +84,14 @@ fn main() -> Result<()> { conf.listen_addr = addr.parse()?; } + if let Some(horizon) = arg_matches.value_of("gc_horizon") { + conf.gc_horizon = horizon.parse()?; + } + + if let Some(period) = arg_matches.value_of("gc_period") { + conf.gc_period = parse(period)?; + } + start_pageserver(&conf) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3b9eea17f5..f6992cd8e1 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -2,6 +2,7 @@ use std::fmt; use std::net::SocketAddr; use std::str::FromStr; use std::path::PathBuf; +use std::time::Duration; pub mod basebackup; pub mod page_cache; @@ -20,6 +21,8 @@ pub struct PageServerConf { pub daemonize: bool, pub interactive: bool, pub listen_addr: SocketAddr, + pub gc_horizon: u64, + pub gc_period: Duration, } // Zenith Timeline ID is a 32-byte random ID. diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 79ea7f072b..04bd65a319 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -15,6 +15,7 @@ use crossbeam_channel::unbounded; use crossbeam_channel::{Receiver, Sender}; use lazy_static::lazy_static; use log::*; +use std::cmp::min; use std::collections::HashMap; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; @@ -124,7 +125,7 @@ pub fn get_or_restore_pagecache( match pcaches.get(&timelineid) { Some(pcache) => Ok(pcache.clone()), None => { - let pcache = init_page_cache(&conf, timelineid); + let pcache = init_page_cache(conf, timelineid); restore_timeline(conf, &pcache, timelineid)?; @@ -145,11 +146,25 @@ pub fn get_or_restore_pagecache( }) .unwrap(); + let conf_copy = conf.clone(); + let _gc_thread = thread::Builder::new() + .name("Garbage collection thread".into()) + .spawn(move || { + gc_thread_main(&conf_copy, timelineid); + }) + .unwrap(); + return Ok(result); } } } +fn gc_thread_main(conf: &PageServerConf, timelineid: ZTimelineId) { + info!("Garbage collection thread started {}", timelineid); + let pcache = get_pagecache(conf, timelineid).unwrap(); + pcache.do_gc(conf).unwrap(); +} + fn open_rocksdb(_conf: &PageServerConf, timelineid: ZTimelineId) -> DB { let path = zenith_repo_dir().join(timelineid.to_string()); let mut opts = Options::default(); @@ -355,6 +370,110 @@ impl WALRecord { // Public interface functions impl PageCache { + fn do_gc(&self, conf: &PageServerConf) -> anyhow::Result { + let mut minbuf = BytesMut::new(); + let mut maxbuf = BytesMut::new(); + let cf = self.db.cf_handle(DEFAULT_COLUMN_FAMILY_NAME).unwrap(); + loop { + thread::sleep(conf.gc_period); + let last_lsn = self.get_last_valid_lsn(); + if last_lsn > conf.gc_horizon { + let horizon = last_lsn - conf.gc_horizon; + let mut maxkey = CacheKey { + tag: BufferTag { + rel: RelTag { + spcnode: u32::MAX, + dbnode: u32::MAX, + relnode: u32::MAX, + forknum: u8::MAX, + }, + blknum: u32::MAX, + }, + lsn: u64::MAX + }; + loop { + 
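The gc_period flag added in [PATCH 14/21] above goes through the parse_duration crate, per the use parse_duration::parse import and the conf.gc_period = parse(period)? wiring. Inputs along these lines should be accepted, though the exact grammar is the crate's and is not verified here:

    use parse_duration::parse;
    use std::time::Duration;

    fn main() {
        let short: Duration = parse("10s").unwrap(); // 10 seconds
        let long: Duration = parse("5 minutes").unwrap(); // 300 seconds
        println!("{:?} {:?}", short, long);
    }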
maxbuf.clear(); + maxkey.pack(&mut maxbuf); + let mut iter = self.db.iterator(IteratorMode::From(&maxbuf[..], Direction::Reverse)); + if let Some((k,v)) = iter.next() { + minbuf.clear(); + minbuf.extend_from_slice(&v); + let content = CacheEntryContent::unpack(&mut minbuf); + minbuf.clear(); + minbuf.extend_from_slice(&k); + let key = CacheKey::unpack(&mut minbuf); + + // Construct boundaries for old records cleanup + maxkey.tag = key.tag; + let last_lsn = key.lsn; + maxkey.lsn = min(horizon, last_lsn); // do not remove last version + + let mut minkey = maxkey.clone(); + minkey.lsn = 0; + + // reconstruct most recent page version + if content.wal_record.is_some() { + // force reconstruction of most recent page version + self.reconstruct_page(key, content)?; + } + + maxbuf.clear(); + maxkey.pack(&mut maxbuf); + + if last_lsn > horizon { + // locate most recent record before horizon + let mut iter = self.db.iterator(IteratorMode::From(&maxbuf[..], Direction::Reverse)); + if let Some((k,v)) = iter.next() { + minbuf.clear(); + minbuf.extend_from_slice(&v); + let content = CacheEntryContent::unpack(&mut minbuf); + if content.wal_record.is_some() { + minbuf.clear(); + minbuf.extend_from_slice(&k); + let key = CacheKey::unpack(&mut minbuf); + self.reconstruct_page(key, content)?; + } + } + } + // remove records prior to horizon + minbuf.clear(); + minkey.pack(&mut minbuf); + self.db.delete_range_cf(cf, &minbuf[..], &maxbuf[..])?; + + maxkey = minkey; + } + } + } + } + } + + fn reconstruct_page(&self, key: CacheKey, content: CacheEntryContent) -> anyhow::Result { + let entry_rc = Arc::new(CacheEntry::new(key.clone(), content)); + + let mut entry_content = entry_rc.content.lock().unwrap(); + entry_content.apply_pending = true; + + let s = &self.walredo_sender; + s.send(entry_rc.clone())?; + + while entry_content.apply_pending { + entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap(); + } + // We should now have a page image. If we don't, it means that WAL redo + // failed to reconstruct it. WAL redo should've logged that error already. + let page_img = match &entry_content.page_image { + Some(p) => p.clone(), + None => { + error!( + "could not apply WAL to reconstruct page image for GetPage@LSN request" + ); + bail!("could not apply WAL to reconstruct page image"); + } + }; + self.put_page_image(key.tag, key.lsn, page_img.clone()); + Ok(page_img) + } + fn wait_lsn(&self, lsn: u64) -> anyhow::Result<()> { let mut shared = self.shared.lock().unwrap(); let mut waited = false; @@ -437,30 +556,8 @@ impl PageCache { } else if content.wal_record.is_some() { buf.clear(); buf.extend_from_slice(&k); - let entry_rc = Arc::new(CacheEntry::new(CacheKey::unpack(&mut buf), content)); - - let mut entry_content = entry_rc.content.lock().unwrap(); - entry_content.apply_pending = true; - - let s = &self.walredo_sender; - s.send(entry_rc.clone())?; - - while entry_content.apply_pending { - entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap(); - } - - // We should now have a page image. If we don't, it means that WAL redo - // failed to reconstruct it. WAL redo should've logged that error already. 
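The retention rule do_gc() implements above can be stated compactly: every page version newer than last_lsn - gc_horizon survives; the newest version at or below that horizon also survives (the "do not remove last version" comment), after being materialized as a full page image so the dropped WAL is no longer needed; everything older is deleted. A toy model of just the retention decision, with versions_to_delete as an illustrative name:

    // Given all version LSNs of one page, return the LSNs GC may delete.
    fn versions_to_delete(mut lsns: Vec<u64>, last_lsn: u64, gc_horizon: u64) -> Vec<u64> {
        if last_lsn <= gc_horizon {
            return Vec::new(); // no version is old enough yet
        }
        let horizon = last_lsn - gc_horizon;
        lsns.sort_unstable();
        // the newest version at or below the horizon must survive as boundary
        let keep_from = lsns.iter().rposition(|&l| l <= horizon).unwrap_or(0);
        lsns.drain(..keep_from).collect()
    }

    fn main() {
        // horizon = 200 - 100 = 100: versions 10 and 40 are dead,
        // 90 is the surviving boundary version, 120 is inside the horizon
        assert_eq!(versions_to_delete(vec![10, 40, 90, 120], 200, 100), vec![10, 40]);
    }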
- page_img = match &entry_content.page_image { - Some(p) => p.clone(), - None => { - error!( - "could not apply WAL to reconstruct page image for GetPage@LSN request" - ); - bail!("could not apply WAL to reconstruct page image"); - } - }; - self.put_page_image(tag, lsn, page_img.clone()); + let key = CacheKey::unpack(&mut buf); + page_img = self.reconstruct_page(key, content)?; } else { // No base image, and no WAL record. Huh? bail!("no page image or WAL record for requested page"); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 239b89e306..7ce285164c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -24,6 +24,7 @@ use tokio::runtime; use tokio::runtime::Runtime; use tokio::sync::mpsc; use tokio::task; +use std::time::Duration; use crate::basebackup; use crate::page_cache; @@ -936,7 +937,7 @@ impl Connection { // FIXME: I'm getting an error from the tokio copyout driver without this. // I think it happens when the CommandComplete, CloseComplete and ReadyForQuery // are sent in the same TCP packet as the CopyDone. I don't understand why. - thread::sleep(std::time::Duration::from_secs(1)); + thread::sleep(Duration::from_secs(1)); Ok(()) } From 785502c92c628befc2b6af44a590f05fb80a96a6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 21 Apr 2021 19:52:28 +0300 Subject: [PATCH 15/21] New version of postgres --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 9f9aa9c300..b3af0d345c 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 9f9aa9c300c9bbac296e2c126b3f96701d4e683d +Subproject commit b3af0d345cc78ef0805063df5569a0389a39dbf2 From 2dbbb8c59b05313991453ccddc4258f5c7e5ebbe Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Apr 2021 10:12:22 +0300 Subject: [PATCH 16/21] Address issues from Eric's review --- control_plane/src/compute.rs | 5 ++--- control_plane/src/storage.rs | 2 +- pageserver/src/bin/pageserver.rs | 9 ++------ pageserver/src/page_cache.rs | 35 ++++++++++++++++---------------- 4 files changed, 23 insertions(+), 28 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 9807756232..1279d941e7 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -17,8 +17,7 @@ use postgres::{Client, NoTls}; use crate::local_env::LocalEnv; use crate::storage::{PageServerNode, WalProposerNode}; -use pageserver::zenith_repo_dir; -use pageserver::ZTimelineId; +use pageserver::{zenith_repo_dir, ZTimelineId}; // // ComputeControlPlane @@ -450,7 +449,7 @@ impl PostgresNode { self.safe_psql("postgres", "CREATE DATABASE regression"); let data_dir = zenith_repo_dir(); let regress_run_path = data_dir.join("regress"); - fs::create_dir_all(regress_run_path.clone()).unwrap(); + fs::create_dir_all(®ress_run_path).unwrap(); fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap(); std::env::set_current_dir(regress_run_path).unwrap(); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 28198a3008..1a85de68f9 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -111,7 +111,7 @@ impl TestStorageControlPlane { pub fn stop(&self) { for wa in self.wal_acceptors.iter() { - let _unused = wa.stop(); + let _ = wa.stop(); } self.test_done.store(true, Ordering::Relaxed); } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 8801e5de14..062fc4eb21 100644 --- 
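reconstruct_page(), factored out here so GC and GetPage@LSN share one code path, hands an entry to the WAL-redo thread over a channel and parks on a per-entry condvar until apply_pending clears. A self-contained model of that handshake, using std::sync::mpsc where the pageserver uses crossbeam (Entry and Content loosely mirror CacheEntry and CacheEntryContent):

    use std::sync::{mpsc, Arc, Condvar, Mutex};
    use std::thread;

    struct Entry {
        content: Mutex<Content>,
        condvar: Condvar,
    }

    struct Content {
        apply_pending: bool,
        page_image: Option<Vec<u8>>,
    }

    fn main() {
        let (tx, rx) = mpsc::channel::<Arc<Entry>>();

        // Stand-in for the WAL-redo thread: apply records, publish the image.
        thread::spawn(move || {
            for entry in rx {
                let mut c = entry.content.lock().unwrap();
                c.page_image = Some(vec![0u8; 8192]); // "redo" result
                c.apply_pending = false;
                entry.condvar.notify_all();
            }
        });

        let entry = Arc::new(Entry {
            content: Mutex::new(Content { apply_pending: true, page_image: None }),
            condvar: Condvar::new(),
        });
        tx.send(Arc::clone(&entry)).unwrap();

        // Caller side: wait until the redo thread clears apply_pending.
        let mut c = entry.content.lock().unwrap();
        while c.apply_pending {
            c = entry.condvar.wait(c).unwrap();
        }
        assert!(c.page_image.is_some());
    }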
a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -4,8 +4,7 @@ use log::*; use parse_duration::parse; -use std::fs; -use std::fs::OpenOptions; +use std::fs::{self, OpenOptions}; use std::io; use std::path::PathBuf; use std::process::exit; @@ -18,11 +17,7 @@ use daemonize::Daemonize; use slog::Drain; -use pageserver::page_service; -use pageserver::tui; -use pageserver::zenith_repo_dir; -//use pageserver::walreceiver; -use pageserver::PageServerConf; +use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf}; const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index c4723315f5..0777230ac3 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -15,7 +15,7 @@ use crossbeam_channel::unbounded; use crossbeam_channel::{Receiver, Sender}; use lazy_static::lazy_static; use log::*; -use rocksdb::*; +use rocksdb; use std::cmp::min; use std::collections::HashMap; use std::sync::atomic::Ordering; @@ -33,7 +33,7 @@ pub struct PageCache { shared: Mutex, // RocksDB handle - db: DB, + db: rocksdb::DB, // Channel for communicating with the WAL redo process here. pub walredo_sender: Sender>, @@ -168,13 +168,13 @@ fn gc_thread_main(conf: &PageServerConf, timelineid: ZTimelineId) { pcache.do_gc(conf).unwrap(); } -fn open_rocksdb(_conf: &PageServerConf, timelineid: ZTimelineId) -> DB { +fn open_rocksdb(_conf: &PageServerConf, timelineid: ZTimelineId) -> rocksdb::DB { let path = zenith_repo_dir().join(timelineid.to_string()); - let mut opts = Options::default(); + let mut opts = rocksdb::Options::default(); opts.create_if_missing(true); opts.set_use_fsync(true); - opts.set_compression_type(DBCompressionType::Lz4); - DB::open(&opts, &path).unwrap() + opts.set_compression_type(rocksdb::DBCompressionType::Lz4); + rocksdb::DB::open(&opts, &path).unwrap() } fn init_page_cache(conf: &PageServerConf, timelineid: ZTimelineId) -> PageCache { @@ -309,7 +309,7 @@ impl RelTag { buf.put_u32(self.spcnode); buf.put_u32(self.dbnode); buf.put_u32(self.relnode); - buf.put_u32(self.forknum as u32); + buf.put_u32(self.forknum as u32); // encode forknum as u32 to provide compatibility with wal_redo_postgres } pub fn unpack(buf: &mut BytesMut) -> RelTag { RelTag { @@ -381,10 +381,11 @@ impl WALRecord { // Public interface functions impl PageCache { + fn do_gc(&self, conf: &PageServerConf) -> anyhow::Result { let mut minbuf = BytesMut::new(); let mut maxbuf = BytesMut::new(); - let cf = self.db.cf_handle(DEFAULT_COLUMN_FAMILY_NAME).unwrap(); + let cf = self.db.cf_handle(rocksdb::DEFAULT_COLUMN_FAMILY_NAME).unwrap(); loop { thread::sleep(conf.gc_period); let last_lsn = self.get_last_valid_lsn(); @@ -407,7 +408,7 @@ impl PageCache { maxkey.pack(&mut maxbuf); let mut iter = self .db - .iterator(IteratorMode::From(&maxbuf[..], Direction::Reverse)); + .iterator(rocksdb::IteratorMode::From(&maxbuf[..], rocksdb::Direction::Reverse)); if let Some((k, v)) = iter.next() { minbuf.clear(); minbuf.extend_from_slice(&v); @@ -437,7 +438,7 @@ impl PageCache { // locate most recent record before horizon let mut iter = self .db - .iterator(IteratorMode::From(&maxbuf[..], Direction::Reverse)); + .iterator(rocksdb::IteratorMode::From(&maxbuf[..], rocksdb::Direction::Reverse)); if let Some((k, v)) = iter.next() { minbuf.clear(); minbuf.extend_from_slice(&v); @@ -538,18 +539,18 @@ impl PageCache { let mut buf = BytesMut::new(); minkey.pack(&mut buf); - let mut readopts = ReadOptions::default(); + let mut readopts = 
rocksdb::ReadOptions::default(); readopts.set_iterate_lower_bound(buf.to_vec()); buf.clear(); maxkey.pack(&mut buf); let mut iter = self .db - .iterator_opt(IteratorMode::From(&buf[..], Direction::Reverse), readopts); + .iterator_opt(rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse), readopts); let entry_opt = iter.next(); if entry_opt.is_none() { - static ZERO_PAGE: [u8; 8192] = [0 as u8; 8192]; + static ZERO_PAGE: [u8; 8192] = [0u8; 8192]; return Ok(Bytes::from_static(&ZERO_PAGE)); /* return Err("could not find page image")?; */ } @@ -606,14 +607,14 @@ impl PageCache { let mut buf = BytesMut::new(); minkey.pack(&mut buf); - let mut readopts = ReadOptions::default(); + let mut readopts = rocksdb::ReadOptions::default(); readopts.set_iterate_lower_bound(buf.to_vec()); buf.clear(); entry.key.pack(&mut buf); let iter = self .db - .iterator_opt(IteratorMode::From(&buf[..], Direction::Reverse), readopts); + .iterator_opt(rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse), readopts); let mut base_img: Option = None; let mut records: Vec = Vec::new(); @@ -826,7 +827,7 @@ impl PageCache { key.pack(&mut buf); let mut iter = self .db - .iterator(IteratorMode::From(&buf[..], Direction::Reverse)); + .iterator(rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse)); if let Some((k, v)) = iter.next() { buf.clear(); buf.extend_from_slice(&k); @@ -867,7 +868,7 @@ impl PageCache { key.pack(&mut buf); let mut iter = self .db - .iterator(IteratorMode::From(&buf[..], Direction::Reverse)); + .iterator(rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse)); if let Some((k, _v)) = iter.next() { buf.clear(); buf.extend_from_slice(&k); From da9508716dfca3fdacb524239a81156b89366686 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Apr 2021 10:37:52 +0300 Subject: [PATCH 17/21] Address issues from Eric's review --- Cargo.lock | 4 ++-- pageserver/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5bac69326..1f0cb04e53 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1090,7 +1090,7 @@ dependencies = [ [[package]] name = "librocksdb-sys" version = "6.17.3" -source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git#7dd6258b07861b9332f827b416e50e5aee69aea1" +source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git?rev=7dd6258b07861b9332f827b416e50e5aee69aea1#7dd6258b07861b9332f827b416e50e5aee69aea1" dependencies = [ "bindgen", "cc", @@ -1814,7 +1814,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.16.0" -source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git#7dd6258b07861b9332f827b416e50e5aee69aea1" +source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git?rev=7dd6258b07861b9332f827b416e50e5aee69aea1#7dd6258b07861b9332f827b416e50e5aee69aea1" dependencies = [ "libc", "librocksdb-sys", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4fb2c789eb..52fd0a9d5a 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -32,7 +32,7 @@ tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } -rocksdb = { git = 
"https://github.com/rust-rocksdb/rust-rocksdb.git" } +rocksdb = { git = "https://github.com/rust-rocksdb/rust-rocksdb.git", rev="7dd6258b07861b9332f827b416e50e5aee69aea1" } anyhow = "1.0" crc32c = "0.6.0" walkdir = "2" From ed30f2096c02f65046e2b017371cf075be80b2a7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Apr 2021 11:30:27 +0300 Subject: [PATCH 18/21] Disable GC by default --- Cargo.lock | 6 ++- pageserver/Cargo.toml | 2 +- pageserver/src/bin/pageserver.rs | 11 ++++-- pageserver/src/page_cache.rs | 66 ++++++++++++++++++-------------- pageserver/src/walredo.rs | 2 +- vendor/postgres | 2 +- 6 files changed, 52 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1f0cb04e53..d1fe1c187a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1090,7 +1090,8 @@ dependencies = [ [[package]] name = "librocksdb-sys" version = "6.17.3" -source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git?rev=7dd6258b07861b9332f827b416e50e5aee69aea1#7dd6258b07861b9332f827b416e50e5aee69aea1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5da125e1c0f22c7cae785982115523a0738728498547f415c9054cb17c7e89f9" dependencies = [ "bindgen", "cc", @@ -1814,7 +1815,8 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.16.0" -source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git?rev=7dd6258b07861b9332f827b416e50e5aee69aea1#7dd6258b07861b9332f827b416e50e5aee69aea1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c749134fda8bfc90d0de643d59bfc841dcb3ac8a1062e12b6754bd60235c48b3" dependencies = [ "libc", "librocksdb-sys", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 52fd0a9d5a..a198f6403a 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -32,7 +32,7 @@ tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" } -rocksdb = { git = "https://github.com/rust-rocksdb/rust-rocksdb.git", rev="7dd6258b07861b9332f827b416e50e5aee69aea1" } +rocksdb = "0.16.0" anyhow = "1.0" crc32c = "0.6.0" walkdir = "2" diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 062fc4eb21..ecab571bc0 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -19,7 +19,7 @@ use slog::Drain; use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf}; -const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; +const DEFAULT_GC_HORIZON: u64 = 0; //64 * 1024 * 1024; fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") @@ -47,10 +47,15 @@ fn main() -> Result<()> { ) .arg( Arg::with_name("gc_horizon") - .short("g") .long("gc_horizon") .takes_value(true) - .help("Garbage colletor horizon"), + .help("Distance from current LSN to perform all wal records cleanup"), + ) + .arg( + Arg::with_name("gc_period") + .long("gc_period") + .takes_value(true) + .help("Interval between garbage collector iterations"), ) .get_matches(); diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 0777230ac3..21f794913d 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -148,15 +148,15 @@ pub fn get_or_restore_pagecache( 
walredo::wal_redo_main(&conf_copy, timelineid); }) .unwrap(); - - let conf_copy = conf.clone(); - let _gc_thread = thread::Builder::new() - .name("Garbage collection thread".into()) - .spawn(move || { - gc_thread_main(&conf_copy, timelineid); - }) - .unwrap(); - + if conf.gc_horizon != 0 { + let conf_copy = conf.clone(); + let _gc_thread = thread::Builder::new() + .name("Garbage collection thread".into()) + .spawn(move || { + gc_thread_main(&conf_copy, timelineid); + }) + .unwrap(); + } Ok(result) } } @@ -381,11 +381,13 @@ impl WALRecord { // Public interface functions impl PageCache { - fn do_gc(&self, conf: &PageServerConf) -> anyhow::Result { let mut minbuf = BytesMut::new(); let mut maxbuf = BytesMut::new(); - let cf = self.db.cf_handle(rocksdb::DEFAULT_COLUMN_FAMILY_NAME).unwrap(); + let cf = self + .db + .cf_handle(rocksdb::DEFAULT_COLUMN_FAMILY_NAME) + .unwrap(); loop { thread::sleep(conf.gc_period); let last_lsn = self.get_last_valid_lsn(); @@ -406,9 +408,10 @@ impl PageCache { loop { maxbuf.clear(); maxkey.pack(&mut maxbuf); - let mut iter = self - .db - .iterator(rocksdb::IteratorMode::From(&maxbuf[..], rocksdb::Direction::Reverse)); + let mut iter = self.db.iterator(rocksdb::IteratorMode::From( + &maxbuf[..], + rocksdb::Direction::Reverse, + )); if let Some((k, v)) = iter.next() { minbuf.clear(); minbuf.extend_from_slice(&v); @@ -436,9 +439,10 @@ impl PageCache { if last_lsn > horizon { // locate most recent record before horizon - let mut iter = self - .db - .iterator(rocksdb::IteratorMode::From(&maxbuf[..], rocksdb::Direction::Reverse)); + let mut iter = self.db.iterator(rocksdb::IteratorMode::From( + &maxbuf[..], + rocksdb::Direction::Reverse, + )); if let Some((k, v)) = iter.next() { minbuf.clear(); minbuf.extend_from_slice(&v); @@ -544,9 +548,10 @@ impl PageCache { buf.clear(); maxkey.pack(&mut buf); - let mut iter = self - .db - .iterator_opt(rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse), readopts); + let mut iter = self.db.iterator_opt( + rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse), + readopts, + ); let entry_opt = iter.next(); if entry_opt.is_none() { @@ -612,9 +617,10 @@ impl PageCache { buf.clear(); entry.key.pack(&mut buf); - let iter = self - .db - .iterator_opt(rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse), readopts); + let iter = self.db.iterator_opt( + rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse), + readopts, + ); let mut base_img: Option = None; let mut records: Vec = Vec::new(); @@ -825,9 +831,10 @@ impl PageCache { loop { buf.clear(); key.pack(&mut buf); - let mut iter = self - .db - .iterator(rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse)); + let mut iter = self.db.iterator(rocksdb::IteratorMode::From( + &buf[..], + rocksdb::Direction::Reverse, + )); if let Some((k, v)) = iter.next() { buf.clear(); buf.extend_from_slice(&k); @@ -866,9 +873,10 @@ impl PageCache { }; let mut buf = BytesMut::new(); key.pack(&mut buf); - let mut iter = self - .db - .iterator(rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse)); + let mut iter = self.db.iterator(rocksdb::IteratorMode::From( + &buf[..], + rocksdb::Direction::Reverse, + )); if let Some((k, _v)) = iter.next() { buf.clear(); buf.extend_from_slice(&k); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index fafbb376d8..d2e7f25042 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -81,11 +81,11 @@ pub fn wal_redo_main(conf: &PageServerConf, timelineid: 
ZTimelineId) { let result = handle_apply_request(&pcache, &process, &runtime, request); if result.is_err() { // On error, kill the process. - error!("Kill wal redo process on error"); break; } } + info!("killing WAL redo postgres process"); let _ = runtime.block_on(process.stdin.get_mut().shutdown()); let mut child = process.child; drop(process.stdin); diff --git a/vendor/postgres b/vendor/postgres index b3af0d345c..eb757400f8 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit b3af0d345cc78ef0805063df5569a0389a39dbf2 +Subproject commit eb757400f8c13980f020dbeaa650805881f1cebd From c5a8c31b8a46d7f2515b5f046a5e2abf1b5501a1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Apr 2021 11:46:20 +0300 Subject: [PATCH 19/21] Update comments --- pageserver/src/bin/pageserver.rs | 3 ++- pageserver/src/walredo.rs | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ecab571bc0..6fd53544de 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -20,6 +20,7 @@ use slog::Drain; use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf}; const DEFAULT_GC_HORIZON: u64 = 0; //64 * 1024 * 1024; +const DEFAULT_GC_PERIOD_SEC: u32 = 1; fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") @@ -63,7 +64,7 @@ fn main() -> Result<()> { daemonize: false, interactive: false, gc_horizon: DEFAULT_GC_HORIZON, - gc_period: Duration::from_secs(10), + gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC), listen_addr: "127.0.0.1:5430".parse().unwrap(), }; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d2e7f25042..0fc9ae51e7 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -80,11 +80,17 @@ pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) { let result = handle_apply_request(&pcache, &process, &runtime, request); if result.is_err() { - // On error, kill the process. + // Something went wrong with handling the request. It's not clear + // if the request was faulty, and the next request would succeed + // again, or if the 'postgres' process went haywire. To be safe, + // kill the 'postgres' process so that we will start from a clean + // slate, with a new process, for the next request. break; } } + // Time to kill the 'postgres' process. A new one will be launched on next + // iteration of the loop. 
info!("killing WAL redo postgres process"); let _ = runtime.block_on(process.stdin.get_mut().shutdown()); let mut child = process.child; From 2ca8fbb6ff47f43a7c231dcea58523e7aeef04ce Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Apr 2021 12:01:25 +0300 Subject: [PATCH 20/21] Fix DEFAULT_GC_PERIOD_SEC type --- control_plane/src/local_env.rs | 5 ++++- pageserver/src/bin/pageserver.rs | 2 +- vendor/postgres | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ac6eb0bfb5..f3da897eda 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -138,7 +138,10 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> { .arg("--no-instructions") .env_clear() .env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap()) + .env( + "DYLD_LIBRARY_PATH", + local_env.pg_lib_dir().to_str().unwrap(), + ) .stdout(Stdio::null()) .status() .with_context(|| "failed to execute initdb")?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 6fd53544de..23e0d5e57b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -20,7 +20,7 @@ use slog::Drain; use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf}; const DEFAULT_GC_HORIZON: u64 = 0; //64 * 1024 * 1024; -const DEFAULT_GC_PERIOD_SEC: u32 = 1; +const DEFAULT_GC_PERIOD_SEC: u64 = 1; fn main() -> Result<()> { let arg_matches = App::new("Zenith page server") diff --git a/vendor/postgres b/vendor/postgres index 77624689b7..fc5ffb678d 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 77624689b7c93a3285186900a4dc51965d6761b6 +Subproject commit fc5ffb678d4f7d9384b1a0ce6dff9295769a63f8 From 75baf670f5558283ad3bd038b1b1180a706f17fb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 22 Apr 2021 12:42:11 +0300 Subject: [PATCH 21/21] Remove extra trace --- pageserver/src/page_service.rs | 2 +- vendor/postgres | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 2ba1c64de9..37790b5561 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -619,7 +619,7 @@ impl Connection { let mut unnamed_query_string = Bytes::new(); loop { let msg = self.read_message().await?; - info!("got message {:?}", msg); + trace!("got message {:?}", msg); match msg { Some(FeMessage::StartupMessage(m)) => { trace!("got message {:?}", m); diff --git a/vendor/postgres b/vendor/postgres index fc5ffb678d..daec929ec3 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit fc5ffb678d4f7d9384b1a0ce6dff9295769a63f8 +Subproject commit daec929ec3f357f1af19b33fa6862acaa2fcf34d