Mirror of https://github.com/neondatabase/neon.git

Compare commits: inmem_stor...get_page_f (16 commits)
Commits (SHA1):
600588034b
9f015bdc60
c564272142
14a0ae5456
0d9805a505
87fce3fcd5
04e1ee5ce3
ec675bbdd6
a3b70745a9
41c352be7f
8a5fcf52c0
abb114decd
e7587ceb81
6d38b9ce6a
47ef9c7ef4
d73cb49f89
@@ -37,7 +37,7 @@ jobs:
command: |
if [ ! -e tmp_install/bin/postgres ]; then
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison
fi

# Build postgres if the restore_cache didn't find a build.
.github/workflows/testing.yml (vendored, 2 changes)
@@ -35,7 +35,7 @@ jobs:
- name: Install postgres dependencies
run: |
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison

- name: Set pg revision for caching
id: pg_ver
Cargo.lock (generated, 3 changes)
@@ -1,7 +1,5 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "ahash"
version = "0.4.7"
@@ -2473,7 +2471,6 @@ dependencies = [
"bytes",
"hex-literal",
"log",
"postgres",
"serde",
"thiserror",
"workspace_hack",
@@ -8,8 +8,3 @@ members = [
"zenith_utils",
"workspace_hack",
]

[profile.release]
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
debug = true
@@ -89,6 +89,9 @@ RUN addgroup zenith && adduser -h /data -D -G zenith zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
ENV ZENITH_REPO_DIR /data/
ENV POSTGRES_DISTRIB_DIR /usr/local

EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["pageserver"]
Makefile (17 changes)
@@ -1,12 +1,3 @@
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
SECCOMP = --with-libseccomp
SECCOMP =
else
SECCOMP =
endif

#
# Top level Makefile to build Zenith and PostgreSQL
#
@@ -30,12 +21,8 @@ tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
--enable-cassert \
--enable-debug \
--enable-depend \
$(SECCOMP) \
--prefix=$(abspath tmp_install) > configure.log)
../../vendor/postgres/configure CFLAGS='-O0 $(CFLAGS)' --enable-debug --enable-cassert \
--enable-depend --prefix=$(abspath tmp_install) > configure.log)

# nicer alias for running 'configure'
postgres-configure: tmp_install/build/config.status
@@ -8,7 +8,7 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus

On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison \
libssl-dev clang
```
@@ -14,7 +14,6 @@ use std::{
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use zenith_utils::connstring::connection_host_port;

use crate::local_env::LocalEnv;
use pageserver::ZTimelineId;
@@ -290,15 +289,14 @@ impl PostgresNode {
// Connect it to the page server.

// Configure that node to take pages from pageserver
let (host, port) = connection_host_port(&self.pageserver.connection_config());
self.append_conf(
"postgresql.conf",
&format!(
"shared_preload_libraries = zenith \n\
zenith.page_server_connstring = 'host={} port={}'\n\
zenith.zenith_timeline='{}'\n",
host,
port,
self.pageserver.address().ip(),
self.pageserver.address().port(),
self.timelineid
),
)?;
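The hunk above replaces connection_host_port() with the pageserver's SocketAddr when composing postgresql.conf. A minimal standalone sketch of that snippet construction (std only; the helper name and the timeline value are illustrative, not from the repository):

```rust
use std::net::SocketAddr;

// Hypothetical helper mirroring the append_conf() call above; the config
// snippet is derived from the pageserver's SocketAddr instead of a parsed
// libpq connection string.
fn zenith_conf(pageserver: SocketAddr, timeline: &str) -> String {
    format!(
        "shared_preload_libraries = zenith \n\
         zenith.page_server_connstring = 'host={} port={}'\n\
         zenith.zenith_timeline='{}'\n",
        pageserver.ip(),
        pageserver.port(),
        timeline
    )
}

fn main() {
    let addr: SocketAddr = "127.0.0.1:64000".parse().unwrap();
    // made-up example timeline id
    print!("{}", zenith_conf(addr, "11223344556677889900aabbccddeeff"));
}
```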
@@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::net::{TcpStream};
use std::net::{SocketAddr, TcpStream};
use std::path::PathBuf;
use std::process::Command;
use std::thread;
@@ -8,11 +8,10 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Result};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::{Config, NoTls};
use postgres::{Client, NoTls};

use crate::local_env::LocalEnv;
use crate::read_pidfile;
use zenith_utils::connstring::connection_address;
use pageserver::branches::BranchInfo;

//
@@ -22,7 +21,7 @@ use pageserver::branches::BranchInfo;
//
pub struct PageServerNode {
pub kill_on_exit: bool,
pub connection_config: Option<Config>,
pub listen_address: Option<SocketAddr>,
pub env: LocalEnv,
}

@@ -30,19 +29,15 @@ impl PageServerNode {
pub fn from_env(env: &LocalEnv) -> PageServerNode {
PageServerNode {
kill_on_exit: false,
connection_config: None, // default
listen_address: None, // default
env: env.clone(),
}
}

fn default_config() -> Config {
"postgresql://no_user@localhost:64000/no_db".parse().unwrap()
}

pub fn connection_config(&self) -> Config {
match &self.connection_config {
Some(config) => config.clone(),
None => Self::default_config(),
pub fn address(&self) -> SocketAddr {
match self.listen_address {
Some(addr) => addr,
None => "127.0.0.1:64000".parse().unwrap(),
}
}

@@ -79,7 +74,7 @@ impl PageServerNode {
pub fn start(&self) -> Result<()> {
println!(
"Starting pageserver at '{}' in {}",
connection_address(&self.connection_config()),
self.address(),
self.repo_path().display()
);

@@ -121,29 +116,42 @@ impl PageServerNode {
}

// wait for pageserver stop
let address = connection_address(&self.connection_config());
for _ in 0..5 {
let stream = TcpStream::connect(&address);
let stream = TcpStream::connect(self.address());
thread::sleep(Duration::from_secs(1));
if let Err(_e) = stream {
println!("Pageserver stopped");
return Ok(());
}
println!("Stopping pageserver on {}", address);
println!("Stopping pageserver on {}", self.address());
}

bail!("Failed to stop pageserver with pid {}", pid);
}

pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
let mut client = self.connection_config().connect(NoTls).unwrap();
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
self.address().port(),
"no_db",
"no_user",
);
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();

println!("Pageserver query: '{}'", sql);
client.simple_query(sql).unwrap()
}

pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
self.connection_config().connect(NoTls)
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
self.address().port(),
"no_db",
"no_user",
);
Client::connect(connstring.as_str(), NoTls)
}

pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {
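The reworked stop path above polls the pageserver's SocketAddr directly rather than a parsed connection string. A std-only sketch of that wait loop, assuming the same five attempts and one-second sleep as the diff (the function name and error type are illustrative):

```rust
use std::net::{SocketAddr, TcpStream};
use std::thread;
use std::time::Duration;

// Poll the listen address until nothing accepts connections any more;
// a failed connect means the server is gone.
fn wait_until_stopped(addr: SocketAddr, attempts: u32) -> Result<(), String> {
    for _ in 0..attempts {
        let stream = TcpStream::connect(addr);
        thread::sleep(Duration::from_secs(1));
        if stream.is_err() {
            return Ok(());
        }
        println!("Stopping server on {}", addr);
    }
    Err(format!("server on {} did not stop", addr))
}

fn main() {
    let addr: SocketAddr = "127.0.0.1:64000".parse().unwrap();
    match wait_until_stopped(addr, 5) {
        Ok(()) => println!("stopped"),
        Err(e) => println!("{}", e),
    }
}
```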
@@ -2,10 +2,10 @@
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/timelines" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data --postgres-distrib /usr/local
pageserver --init --workdir $ZENITH_REPO_DIR
fi
echo "Staring pageserver at 0.0.0.0:6400"
pageserver -l 0.0.0.0:6400 -D /data
pageserver -l 0.0.0.0:6400 --workdir $ZENITH_REPO_DIR
else
"$@"
fi
@@ -4,8 +4,8 @@
//! TODO: this module has nothing to do with PostgreSQL pg_basebackup.
//! It could use a better name.
//!
//! Stateless Postgres compute node is launched by sending tarball which contains non-relational data (multixacts, clog, filenodemaps, twophase files)
//! and generate pg_control and dummy segment of WAL. This module is responsible for creation of such tarball from snapshot directory and
//! Stateless Postgres compute node is lauched by sending taball which contains on-relational data (multixacts, clog, filenodemaps, twophase files)
//! and generate pg_control and dummy segment of WAL. This module is responsible for creation of such tarball from snapshot directry and
//! data stored in object storage.
//!
use crate::ZTimelineId;
@@ -17,15 +17,14 @@ use std::time::SystemTime;
use tar::{Builder, Header};
use walkdir::WalkDir;

use crate::object_key::*;
use crate::repository::Timeline;
use crate::repository::{DatabaseTag, ObjectTag, Timeline};
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;

/// This is short-living object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between various functions
/// This is shorliving object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between varyouds functions
/// used for constructing tarball.
pub struct Basebackup<'a> {
ar: Builder<&'a mut dyn Write>,
@@ -56,6 +55,7 @@ impl<'a> Basebackup<'a> {
}
}

#[rustfmt::skip] // otherwise "cargo fmt" produce very strange formatting for macch arms of self.timeline.list_nonrels
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
debug!("sending tarball of snapshot in {}", self.snappath);
for entry in WalkDir::new(&self.snappath) {
@@ -85,8 +85,7 @@ impl<'a> Basebackup<'a> {
trace!("sending {}", relpath.display());
self.ar.append_path_with_name(fullpath, relpath)?;
}
} else {
// relation pages are loaded on demand and should not be included in tarball
} else { // relation pages are loaded on demand and should not be included in tarball
trace!("not sending {}", relpath.display());
}
} else {
@@ -95,25 +94,26 @@ impl<'a> Basebackup<'a> {
}

// Generate non-relational files.
// Iteration is sorted order: all objects of the same time are grouped and traversed
// in key ascending order. For example all pg_xact records precede pg_multixact records and are sorted by block number.
// It allows to easily construct SLRU segments (32 blocks).
// Iteration is sorted order: all objects of the same time are grouped and traversed
// in key ascending order. For example all pg_xact records precede pg_multixact records and are sorted by block number.
// It allows to easily construct SLRU segments (32 blocks).
for obj in self.timeline.list_nonrels(self.lsn)? {
match obj {
ObjectTag::Clog(slru) => self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
ObjectTag::MultiXactMembers(slru) => {
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?
}
ObjectTag::MultiXactOffsets(slru) => {
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?
}
ObjectTag::FileNodeMap(db) => self.add_relmap_file(&obj, &db)?,
ObjectTag::TwoPhase(prepare) => self.add_twophase_file(&obj, prepare.xid)?,
ObjectTag::Clog(slru) =>
self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
ObjectTag::MultiXactMembers(slru) =>
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?,
ObjectTag::MultiXactOffsets(slru) =>
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?,
ObjectTag::FileNodeMap(db) =>
self.add_relmap_file(&obj, &db)?,
ObjectTag::TwoPhase(prepare) =>
self.add_twophase_file(&obj, prepare.xid)?,
_ => {}
}
}
self.finish_slru_segment()?; // write last non-completed SLRU segment (if any)
self.add_pgcontrol_file()?;
self.add_pgcontrol_file()?;
self.ar.finish()?;
debug!("all tarred up!");
Ok(())
@@ -238,9 +238,8 @@ impl<'a> Basebackup<'a> {
info!("pg_control.state = {}", pg_control.state);
pg_control.state = pg_constants::DB_SHUTDOWNED;

// add zenith.signal file
self.ar
.append(&new_tar_header("zenith.signal", 0)?, &b""[..])?;
// add zenith.signal file
self.ar.append(&new_tar_header("zenith.signal", 0)?, &b""[..])?;

//send pg_control
let pg_control_bytes = pg_control.encode();
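The comment in the hunk above explains that list_nonrels() yields objects in key-ascending order so SLRU segments of 32 blocks can be assembled in one pass. A tiny sketch of that grouping (the 32-blocks-per-segment constant comes from the comment; everything else is illustrative):

```rust
// Assumption from the comment above: 32 pages per SLRU segment.
const SLRU_PAGES_PER_SEGMENT: u32 = 32;

fn segment_of(blknum: u32) -> u32 {
    blknum / SLRU_PAGES_PER_SEGMENT
}

fn main() {
    // Block numbers arrive in ascending order, as list_nonrels() guarantees,
    // so each segment can be started once its first block is seen.
    let blocks = [0u32, 1, 31, 32, 33, 64];
    let mut current: Option<u32> = None;
    for b in blocks {
        let seg = segment_of(b);
        if current != Some(seg) {
            // the real code would finish the previous segment file here
            println!("start segment {:04X}", seg);
            current = Some(seg);
        }
        println!("  block {}", b);
    }
}
```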
@@ -8,7 +8,7 @@ use std::{
env,
fs::{File, OpenOptions},
io,
net::TcpListener,
net::{SocketAddr, TcpListener},
path::{Path, PathBuf},
process::exit,
thread,
@@ -65,10 +65,11 @@ impl CfgFileParams {

/// Create a PageServerConf from these string parameters
fn try_into_config(&self) -> Result<PageServerConf> {
let listen_addr = match self.listen_addr.as_ref() {
Some(addr) => addr.clone(),
None => DEFAULT_LISTEN_ADDR.to_owned(),
};
let listen_addr: SocketAddr = self
.listen_addr
.as_deref()
.unwrap_or(DEFAULT_LISTEN_ADDR)
.parse()?;

let gc_horizon: u64 = match self.gc_horizon.as_ref() {
Some(horizon_str) => horizon_str.parse()?,
@@ -271,7 +272,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {

// Check that we can bind to address before further initialization
info!("Starting pageserver on {}", conf.listen_addr);
let pageserver_listener = TcpListener::bind(conf.listen_addr.clone())?;
let pageserver_listener = TcpListener::bind(conf.listen_addr)?;

// Initialize page cache, this will spawn walredo_thread
page_cache::init(conf);
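The try_into_config() change parses the listen address into a SocketAddr once, up front. A sketch of the same fallback-then-parse logic (std only; the default matches the 127.0.0.1:64000 used elsewhere in this diff, and the function name is illustrative):

```rust
use std::net::SocketAddr;

const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";

// Fall back to the default address, then parse once into a SocketAddr so the
// rest of the server can bind a Copy value instead of re-parsing strings.
fn parse_listen_addr(configured: Option<&str>) -> Result<SocketAddr, std::net::AddrParseError> {
    configured.unwrap_or(DEFAULT_LISTEN_ADDR).parse()
}

fn main() {
    assert!(parse_listen_addr(None).is_ok());
    assert!(parse_listen_addr(Some("0.0.0.0:6400")).is_ok());
    assert!(parse_listen_addr(Some("not-an-address")).is_err());
    println!("ok");
}
```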
@@ -100,8 +100,7 @@ pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
// and we failed to run initdb again in the same directory. This has been solved for the
// rapid init+start case now, but the general race condition remains if you restart the
// server quickly.
//let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;
let storage = crate::inmem_storage::InmemObjectStore::create(conf)?;
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;

let repo = crate::object_repository::ObjectRepository::new(
conf,
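init_repo() can flip between storage engines with a one-line change because both stores sit behind the same ObjectStore trait. An illustrative trait-object sketch of that seam (types and methods here are simplified stand-ins, not the repository's actual trait):

```rust
use std::collections::BTreeMap;

// Simplified stand-in for the ObjectStore trait: callers only see the trait
// object, so the backing engine can change with a one-line edit.
trait ObjectStore {
    fn put(&mut self, key: String, value: Vec<u8>);
    fn get(&self, key: &str) -> Option<&Vec<u8>>;
}

struct InmemStore {
    db: BTreeMap<String, Vec<u8>>,
}

impl ObjectStore for InmemStore {
    fn put(&mut self, key: String, value: Vec<u8>) {
        self.db.insert(key, value);
    }
    fn get(&self, key: &str) -> Option<&Vec<u8>> {
        self.db.get(key)
    }
}

fn create_store() -> Box<dyn ObjectStore> {
    // swap in a different implementation here without touching callers
    Box::new(InmemStore { db: BTreeMap::new() })
}

fn main() {
    let mut store = create_store();
    store.put("k".into(), b"v".to_vec());
    assert_eq!(store.get("k").map(|v| v.as_slice()), Some(&b"v"[..]));
    println!("ok");
}
```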
@@ -1,349 +0,0 @@
//!
//! An implementation of the ObjectStore interface, backed by BTreeMap
//!
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::repository::RelTag;
use crate::PageServerConf;
use crate::ZTimelineId;
use anyhow::{bail, Result};
use std::collections::{BTreeMap,HashSet};
use std::sync::RwLock;
use zenith_utils::lsn::Lsn;
use std::ops::Bound::*;
use serde::{Deserialize, Serialize};
use zenith_utils::bin_ser::BeSer;
use std::io::prelude::*;
use std::fs::File;

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize)]
pub struct StorageKey {
obj_key: ObjectKey,
lsn: Lsn,
}

impl StorageKey {
/// The first key for a given timeline
fn timeline_start(timeline: ZTimelineId) -> Self {
Self {
obj_key: ObjectKey {
timeline,
tag: ObjectTag::FirstTag,
},
lsn: Lsn(0),
}
}
}

pub struct InmemObjectStore {
conf: &'static PageServerConf,
db: RwLock<BTreeMap<StorageKey, Vec<u8>>>,
}

impl ObjectStore for InmemObjectStore {
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
let db = self.db.read().unwrap();
let val = db.get(&StorageKey {
obj_key: key.clone(),
lsn,
});
if let Some(val) = val {
Ok(val.clone())
} else {
bail!("could not find page {:?}", key);
}
}

fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
let search_key = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
let db = self.db.read().unwrap();
for pair in db.range(&search_key..) {
let key = pair.0;
return Ok(Some(key.obj_key.clone()));
}
Ok(None)
}

fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
let mut db = self.db.write().unwrap();
db.insert(
StorageKey {
obj_key: key.clone(),
lsn,
},
value.to_vec(),
);
Ok(())
}

fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
let mut db = self.db.write().unwrap();
db.remove(&StorageKey {
obj_key: key.clone(),
lsn,
});
Ok(())
}

/// Iterate through page versions of given page, starting from the given LSN.
/// The versions are walked in descending LSN order.
fn object_versions<'a>(
&'a self,
key: &ObjectKey,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
let from = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
let till = StorageKey {
obj_key: key.clone(),
lsn,
};
let db = self.db.read().unwrap();
let versions: Vec<(Lsn, Vec<u8>)> = db.range(from..=till).map(|pair|(pair.0.lsn, pair.1.clone())).collect();
Ok(Box::new(InmemObjectVersionIter::new(versions)))
}

/// Iterate through all timeline objects
fn list_objects<'a>(
&'a self,
timeline: ZTimelineId,
nonrel_only: bool,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
let curr_key = StorageKey::timeline_start(timeline);

Ok(Box::new(InmemObjectIter {
store: &self,
curr_key,
timeline,
nonrel_only,
lsn,
}))
}

/// Get a list of all distinct relations in given tablespace and database.
///
/// TODO: This implementation is very inefficient, it scans
/// through all entries in the given database. In practice, this
/// is used for CREATE DATABASE, and usually the template database is small.
/// But if it's not, this will be slow.
fn list_rels(
&self,
timelineid: ZTimelineId,
spcnode: u32,
dbnode: u32,
lsn: Lsn,
) -> Result<HashSet<RelTag>> {
// FIXME: This scans everything. Very slow

let mut rels: HashSet<RelTag> = HashSet::new();

let mut search_rel_tag = RelTag {
spcnode,
dbnode,
relnode: 0,
forknum: 0u8,
};
let db = self.db.read().unwrap();
'outer: loop {
let search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::RelationMetadata(search_rel_tag),
},
lsn: Lsn(0),
};
for pair in db.range(&search_key..) {
let key = pair.0;

if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
if spcnode != 0 && rel_tag.spcnode != spcnode
|| dbnode != 0 && rel_tag.dbnode != dbnode
{
break 'outer;
}
if key.lsn <= lsn {
// visible in this snapshot
rels.insert(rel_tag);
}
search_rel_tag = rel_tag;
// skip to next relation
// FIXME: What if relnode is u32::MAX ?
search_rel_tag.relnode += 1;
continue 'outer;
} else {
// no more relation metadata entries
break 'outer;
}
}
}

Ok(rels)
}

/// Iterate through versions of all objects in a timeline.
///
/// Returns objects in increasing key-version order.
/// Returns all versions up to and including the specified LSN.
fn objects<'a>(
&'a self,
timeline: ZTimelineId,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
let curr_key = StorageKey::timeline_start(timeline);

Ok(Box::new(InmemObjects {
store: &self,
curr_key,
timeline,
lsn,
}))
}

fn compact(&self) {
}
}

impl Drop for InmemObjectStore {
fn drop(&mut self) {
let path = self.conf.workdir.join("objstore.dmp");
let mut f = File::create(path).unwrap();
f.write(&self.db.ser().unwrap()).unwrap();
}
}

impl InmemObjectStore {
pub fn open(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
let path = conf.workdir.join("objstore.dmp");
let mut f = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
f.read_to_end(&mut buffer)?;
let db = RwLock::new(BTreeMap::des(&buffer)?);
Ok(InmemObjectStore {
conf: conf,
db
})
}

pub fn create(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
Ok(InmemObjectStore {
conf: conf,
db: RwLock::new(BTreeMap::new()),
})
}
}

///
/// Iterator for `object_versions`. Returns all page versions of a given block, in
/// reverse LSN order.
///
struct InmemObjectVersionIter {
versions: Vec<(Lsn, Vec<u8>)>,
curr: usize,
}
impl InmemObjectVersionIter {
fn new(versions: Vec<(Lsn, Vec<u8>)>) -> InmemObjectVersionIter {
let curr = versions.len();
InmemObjectVersionIter {
versions,
curr
}
}
}
impl Iterator for InmemObjectVersionIter {
type Item = (Lsn, Vec<u8>);

fn next(&mut self) -> std::option::Option<Self::Item> {
if self.curr == 0 {
None
} else {
self.curr -= 1;
Some(self.versions[self.curr].clone())
}
}
}

struct InmemObjects<'r> {
store: &'r InmemObjectStore,
curr_key: StorageKey,
timeline: ZTimelineId,
lsn: Lsn,
}

impl<'r> Iterator for InmemObjects<'r> {
// TODO consider returning Box<[u8]>
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;

fn next(&mut self) -> Option<Self::Item> {
self.next_result().transpose()
}
}

impl<'r> InmemObjects<'r> {
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
let db = self.store.db.read().unwrap();
for pair in db.range((Excluded(&self.curr_key),Unbounded)) {
let key = pair.0;
if key.obj_key.timeline != self.timeline {
return Ok(None);
}
if key.lsn > self.lsn {
// TODO can speed up by seeking iterator
continue;
}
self.curr_key = key.clone();
let value = pair.1.clone();
return Ok(Some((key.obj_key.tag, key.lsn, value)));
}
Ok(None)
}
}

///
/// Iterator for `list_objects`. Returns all objects preceeding specified LSN
///
struct InmemObjectIter<'a> {
store: &'a InmemObjectStore,
curr_key: StorageKey,
timeline: ZTimelineId,
nonrel_only: bool,
lsn: Lsn,
}

impl<'a> Iterator for InmemObjectIter<'a> {
type Item = ObjectTag;

fn next(&mut self) -> std::option::Option<Self::Item> {
let db = self.store.db.read().unwrap();
'outer: loop {
for pair in db.range((Excluded(&self.curr_key),Unbounded)) {
let key = pair.0;
if key.obj_key.timeline != self.timeline {
return None;
}
self.curr_key = key.clone();
self.curr_key.lsn = Lsn(u64::MAX); // next seek should skip all versions
if key.lsn <= self.lsn {
// visible in this snapshot
if self.nonrel_only {
match key.obj_key.tag {
ObjectTag::RelationMetadata(_) => return None,
ObjectTag::RelationBuffer(_) => return None,
_ => return Some(key.obj_key.tag),
}
} else {
return Some(key.obj_key.tag);
}
}
continue 'outer;
}
return None;
}
}
}
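The file deleted above keys a BTreeMap by (ObjectKey, Lsn) so that range scans return versions in LSN order. A condensed, std-only sketch of that idea, with a bounded range scan picking the newest version at or below the requested LSN the way object_versions() walks versions (names and the lookup policy are simplified):

```rust
use std::collections::BTreeMap;
use std::ops::Bound::Included;

type Lsn = u64;

// Each put() stores a new version under a composite (key, lsn) BTreeMap key.
struct VersionedStore {
    db: BTreeMap<(String, Lsn), Vec<u8>>,
}

impl VersionedStore {
    fn new() -> Self {
        VersionedStore { db: BTreeMap::new() }
    }

    fn put(&mut self, key: &str, lsn: Lsn, value: &[u8]) {
        self.db.insert((key.to_string(), lsn), value.to_vec());
    }

    // Latest version of `key` with version LSN <= `lsn`, if any.
    fn get(&self, key: &str, lsn: Lsn) -> Option<&[u8]> {
        let from = (key.to_string(), 0);
        let till = (key.to_string(), lsn);
        self.db
            .range((Included(from), Included(till)))
            .next_back() // highest LSN in range = newest visible version
            .map(|(_, v)| v.as_slice())
    }
}

fn main() {
    let mut store = VersionedStore::new();
    store.put("page-1", 10, b"v1");
    store.put("page-1", 20, b"v2");
    assert_eq!(store.get("page-1", 15), Some(&b"v1"[..]));
    assert_eq!(store.get("page-1", 25), Some(&b"v2"[..]));
    assert_eq!(store.get("page-1", 5), None);
    println!("ok");
}
```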
@@ -1,13 +1,13 @@
use serde::{Deserialize, Serialize};

use std::fmt;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;

pub mod basebackup;
pub mod branches;
pub mod object_key;
pub mod object_repository;
pub mod object_store;
pub mod page_cache;
@@ -15,7 +15,6 @@ pub mod page_service;
pub mod repository;
pub mod restore_local_repo;
pub mod rocksdb_storage;
pub mod inmem_storage;
pub mod tui;
pub mod tui_event;
mod tui_logger;
@@ -27,7 +26,7 @@ pub mod walredo;
pub struct PageServerConf {
pub daemonize: bool,
pub interactive: bool,
pub listen_addr: String,
pub listen_addr: SocketAddr,
pub gc_horizon: u64,
pub gc_period: Duration,

@@ -104,7 +103,7 @@ impl PageServerConf {
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A zenith timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ZTimelineId([u8; 16]);

impl FromStr for ZTimelineId {
@@ -1,84 +0,0 @@
use crate::repository::{BufferTag, RelTag};
use crate::waldecoder::TransactionId;
use crate::ZTimelineId;
use serde::{Deserialize, Serialize};

///
/// ObjectKey is the key type used to identify objects stored in an object
/// repository. It is shared between object_repository.rs and object_store.rs.
/// It is mostly opaque to ObjectStore, it just stores and retrieves objects
/// using the key given by the caller.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct ObjectKey {
pub timeline: ZTimelineId,
pub tag: ObjectTag,
}

///
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
/// in Postgres are handled by SLRU (Simple LRU) buffer, hence the name.
///
/// These files are global for a postgres instance.
///
/// These files are divided into segments, which are divided into pages
/// of the same BLCKSZ as used for relation files.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct SlruBufferTag {
pub blknum: u32,
}

///
/// Special type of Postgres files: pg_filenode.map is needed to map
/// catalog table OIDs to filenode numbers, which define filename.
///
/// Each database has a map file for its local mapped catalogs,
/// and there is a separate map file for shared catalogs.
///
/// These files have untypical size of 512 bytes.
///
/// See PostgreSQL relmapper.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct DatabaseTag {
pub spcnode: u32,
pub dbnode: u32,
}

///
/// Non-relation files that keep state for prepared transactions.
/// Unlike other files these are not divided into pages.
///
/// See PostgreSQL twophase.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct PrepareTag {
pub xid: TransactionId,
}

/// ObjectTag is a part of ObjectKey that is specific to the type of
/// the stored object.
///
/// NB: the order of the enum values is significant! In particular,
/// rocksdb_storage.rs assumes that TimelineMetadataTag is first
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ObjectTag {
// dummy tag preceeding all other keys
FirstTag,
TimelineMetadataTag,
// Special entry that represents PostgreSQL checkpoint.
// We use it to track fields needed to restore controlfile checkpoint.
Checkpoint,
// Various types of non-relation files.
// We need them to bootstrap compute node.
ControlFile,
Clog(SlruBufferTag),
MultiXactMembers(SlruBufferTag),
MultiXactOffsets(SlruBufferTag),
FileNodeMap(DatabaseTag),
TwoPhase(PrepareTag),
// put relations at the end of enum to allow efficient iterations through non-rel objects
RelationMetadata(RelTag),
RelationBuffer(BufferTag),
}
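The "NB: the order of the enum values is significant" comment above relies on a Rust guarantee: #[derive(PartialOrd, Ord)] on an enum compares the variant's declaration position before its payload, so placing RelationMetadata and RelationBuffer last lets a sorted key scan visit every non-relation object first. A small demonstration (variants simplified):

```rust
// Derived Ord compares declaration order first, then the payload.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum Tag {
    FirstTag,
    Checkpoint,
    ControlFile,
    Clog(u32),             // SLRU block number
    RelationMetadata(u32), // sorts after all of the above
}

fn main() {
    let mut tags = vec![
        Tag::RelationMetadata(0),
        Tag::Clog(7),
        Tag::FirstTag,
        Tag::ControlFile,
    ];
    tags.sort();
    // Non-relation tags come first, relations last, regardless of payload.
    assert_eq!(tags[0], Tag::FirstTag);
    assert_eq!(*tags.last().unwrap(), Tag::RelationMetadata(0));
    println!("{:?}", tags);
}
```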
@@ -13,8 +13,7 @@
//! until we find the page we're looking for, making a separate lookup into the
//! key-value store for each timeline.

use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::object_store::{ObjectKey, ObjectStore};
use crate::repository::*;
use crate::restore_local_repo::import_timeline_wal;
use crate::walredo::WalRedoManager;
@@ -106,7 +105,7 @@ impl Repository for ObjectRepository {
) -> Result<Arc<dyn Timeline>> {
let mut timelines = self.timelines.lock().unwrap();

// Write initial metadata key.
// Write metadata key
let metadata = MetadataEntry {
last_valid_lsn: start_lsn,
last_record_lsn: start_lsn,
@@ -162,7 +161,7 @@ impl Repository for ObjectRepository {
ObjectTag::TimelineMetadataTag => {} // skip it
_ => {
let img = src_timeline.get_page_at_lsn_nowait(tag, at_lsn)?;
let val = ObjectValue::Page(PageEntry::Page(img));
let val = ObjectValue::Page(img);
let key = ObjectKey { timeline: dst, tag };
self.obj_store.put(&key, at_lsn, &ObjectValue::ser(&val)?)?;
}
@@ -236,19 +235,22 @@ impl ObjectTimeline {
let v = obj_store
.get(&timeline_metadata_key(timelineid), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let metadata = ObjectValue::des_timeline_metadata(&v)?;

let timeline = ObjectTimeline {
timelineid,
obj_store,
walredo_mgr,
last_valid_lsn: SeqWait::new(metadata.last_valid_lsn),
last_record_lsn: AtomicLsn::new(metadata.last_record_lsn.0),
ancestor_timeline: metadata.ancestor_timeline,
ancestor_lsn: metadata.ancestor_lsn,
rel_meta: RwLock::new(BTreeMap::new()),
};
Ok(timeline)
if let ObjectValue::TimelineMetadata(metadata) = ObjectValue::des(&v)? {
let timeline = ObjectTimeline {
timelineid,
obj_store,
walredo_mgr,
last_valid_lsn: SeqWait::new(metadata.last_valid_lsn),
last_record_lsn: AtomicLsn::new(metadata.last_record_lsn.0),
ancestor_timeline: metadata.ancestor_timeline,
ancestor_lsn: metadata.ancestor_lsn,
rel_meta: RwLock::new(BTreeMap::new()),
};
Ok(timeline)
} else {
bail!("Invalid timeline metadata");
}
}
}
@@ -278,22 +280,17 @@ impl Timeline for ObjectTimeline {
let page_img: Bytes;

match ObjectValue::des(&value)? {
ObjectValue::Page(PageEntry::Page(img)) => {
ObjectValue::Page(img) => {
page_img = img;
}
ObjectValue::Page(PageEntry::WALRecord(_rec)) => {
ObjectValue::WALRecord(_rec) => {
// Request the WAL redo manager to apply the WAL records for us.
let (base_img, records) = self.collect_records_for_apply(tag, lsn)?;
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;

// Garbage collection assumes that we remember the materialized page
// version. Otherwise we could opt to not do it, with the downside that
// the next GetPage@LSN call of the same page version would have to
// redo the WAL again.
self.put_page_image(tag, lsn, page_img.clone(), false)?;
self.put_page_image(tag, lsn, page_img.clone())?;
}
ObjectValue::SLRUTruncate => page_img = Bytes::from_static(&ZERO_PAGE),
_ => bail!("Invalid object kind, expected a page entry or SRLU truncate"),
x => bail!("Unexpected object value: {:?}", x),
}
// FIXME: assumes little-endian. Only used for the debugging log though
let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
@@ -373,10 +370,12 @@ impl Timeline for ObjectTimeline {
.obj_store
.get(&timeline_metadata_key(timeline), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let metadata = ObjectValue::des_timeline_metadata(&v)?;

prev_timeline = metadata.ancestor_timeline;
lsn = metadata.ancestor_lsn;
if let ObjectValue::TimelineMetadata(metadata) = ObjectValue::des(&v)? {
prev_timeline = metadata.ancestor_timeline;
lsn = metadata.ancestor_lsn;
} else {
bail!("Invalid timeline metadata");
}
}

Ok(all_rels)
@@ -394,8 +393,13 @@ impl Timeline for ObjectTimeline {
/// current end-of-file.
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()> {
let lsn = rec.lsn;
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::WALRecord(rec);

self.put_page_entry(&tag, lsn, PageEntry::WALRecord(rec))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
debug!("put_wal_record {:?} at {}", tag, lsn);

if let ObjectTag::RelationBuffer(tag) = tag {
@@ -404,6 +408,8 @@ impl Timeline for ObjectTimeline {

if tag.blknum >= old_nblocks {
let new_nblocks = tag.blknum + 1;
let key = relation_size_key(self.timelineid, tag.rel);
let val = ObjectValue::RelationSize(new_nblocks);

trace!(
"Extended relation {} from {} to {} blocks at {}",
@@ -413,7 +419,7 @@ impl Timeline for ObjectTimeline {
lsn
);

self.put_relsize_entry(&tag.rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
tag.rel,
@@ -427,36 +433,18 @@ impl Timeline for ObjectTimeline {
Ok(())
}

/// Unlink relation. This method is used for marking dropped relations.
fn put_unlink(&self, rel_tag: RelTag, lsn: Lsn) -> Result<()> {
self.put_relsize_entry(&rel_tag, lsn, RelationSizeEntry::Unlink)?;

Ok(())
}

/// Truncate SRLU segment
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()> {
/// Unlink object. This method is used for marking dropped relations
/// and removed segments of SLRUs.
fn put_unlink(&self, tag: ObjectTag, lsn: Lsn) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::SLRUTruncate;
let val = ObjectValue::Unlink;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
Ok(())
}

fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
if let Some(key) = self.obj_store.get_next_key(&key)? {
Ok(Some(key.tag))
} else {
Ok(None)
}
}

fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
@@ -469,13 +457,16 @@ impl Timeline for ObjectTimeline {
///
/// Memorize a full image of a page version
///
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool) -> Result<()> {
self.put_page_entry(&tag, lsn, PageEntry::Page(img))?;
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::Page(img);

if !update_meta {
return Ok(());
}
debug!("put_page_image rel {:?} at {}", tag, lsn);
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;

debug!("put_page_image {:?} at {}", tag, lsn);

if let ObjectTag::RelationBuffer(tag) = tag {
// Also check if this created or extended the file
@@ -483,7 +474,8 @@ impl Timeline for ObjectTimeline {

if tag.blknum >= old_nblocks {
let new_nblocks = tag.blknum + 1;

let key = relation_size_key(self.timelineid, tag.rel);
let val = ObjectValue::RelationSize(new_nblocks);
trace!(
"Extended relation {} from {} to {} blocks at {}",
tag.rel,
@@ -492,7 +484,7 @@ impl Timeline for ObjectTimeline {
lsn
);

self.put_relsize_entry(&tag.rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
tag.rel,
@@ -511,9 +503,10 @@ impl Timeline for ObjectTimeline {
/// associating it with all pages started with specified block number
///
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()> {
info!("Truncate relation {} to {} blocks at {}", rel, nblocks, lsn);
let key = relation_size_key(self.timelineid, rel);
let val = ObjectValue::RelationSize(nblocks);

self.put_relsize_entry(&rel, lsn, RelationSizeEntry::Size(nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
rel,
@@ -598,7 +591,12 @@ impl Timeline for ObjectTimeline {
};
trace!("checkpoint at {}", metadata.last_valid_lsn);

self.put_timeline_metadata_entry(metadata)?;
let val = ObjectValue::TimelineMetadata(metadata);
self.obj_store.put(
&timeline_metadata_key(self.timelineid),
Lsn(0),
&ObjectValue::ser(&val)?,
)?;

Ok(())
}
@@ -606,171 +604,7 @@
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>> {
let lsn = self.last_valid_lsn.load();
let iter = self.obj_store.objects(self.timelineid, lsn)?;
Ok(Box::new(ObjectHistory {
lsn,
iter,
last_relation_size: None,
}))
}

fn gc_iteration(&self, horizon: u64) -> Result<GcResult> {
let last_lsn = self.get_last_valid_lsn();
let mut result: GcResult = Default::default();

// checked_sub() returns None on overflow.
if let Some(horizon) = last_lsn.checked_sub(horizon) {
// WAL is large enough to perform GC
let now = Instant::now();

// Iterate through all objects in timeline
for obj in self
.obj_store
.list_objects(self.timelineid, false, last_lsn)?
{
result.inspected += 1;
match obj {
// Prepared transactions
ObjectTag::TwoPhase(prepare) => {
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
if self.get_tx_status(prepare.xid, horizon)?
!= pg_constants::TRANSACTION_STATUS_IN_PROGRESS
{
let lsn = vers.0;
self.obj_store.unlink(&key, lsn)?;
result.prep_deleted += 1;
}
}
}
ObjectTag::RelationMetadata(_) => {
// Do not need to reconstruct page images,
// just delete all old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::RelationSize(RelationSizeEntry::Unlink) => {
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
result.dropped += 1;
}
_ => (), // preserve last version
}
last_version = false;
result.truncated += 1;
result.n_relations += 1;
} else {
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
}
}
}
ObjectTag::RelationBuffer(tag) => {
// Reconstruct page at horizon unless relation was dropped
// and delete all older versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
result.truncated += 1;
last_version = false;
if let Some(rel_size) = self.relsize_get_nowait(tag.rel, last_lsn)? {
if rel_size > tag.blknum {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
continue;
}
debug!("Drop last block {} of relation {:?} at {} because it is beyond relation size {}", tag.blknum, tag.rel, lsn, rel_size);
} else {
if let Some(rel_size) =
self.relsize_get_nowait(tag.rel, last_lsn)?
{
debug!("Preserve block {} of relation {:?} at {} because relation has size {} at {}", tag.rel, tag, lsn, rel_size, last_lsn);
continue;
}
debug!("Relation {:?} was dropped at {}", tag.rel, lsn);
}
// relation was dropped or truncated so this block can be removed
}
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
}
}
// SLRU-s
ObjectTag::Clog(_)
| ObjectTag::MultiXactOffsets(_)
| ObjectTag::MultiXactMembers(_) => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::SLRUTruncate => {
self.obj_store.unlink(&key, lsn)?;
result.slru_deleted += 1;
}
ObjectValue::Page(PageEntry::WALRecord(_)) => {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
}
_ => {} // do nothing if already materialized
}
last_version = false;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
result.slru_deleted += 1;
}
}
}
// versioned always materialized objects: no need to reconstruct pages
ObjectTag::Checkpoint | ObjectTag::ControlFile => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
// preserve last version
last_version = false;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
result.chkp_deleted += 1;
}
}
}
_ => (), // do nothing
}
}
result.elapsed = now.elapsed();
info!("Garbage collection completed in {:?}: {} relations inspected, {} object inspected, {} version histories truncated, {} versions deleted, {} relations dropped",
result.elapsed, result.n_relations, result.inspected, result.truncated, result.deleted, result.dropped);
self.obj_store.compact();
}
Ok(result)
Ok(Box::new(ObjectHistory { lsn, iter }))
}
}
@@ -793,8 +627,9 @@ impl ObjectTimeline {
let mut iter = self.object_versions(&*self.obj_store, &key, lsn)?;

if let Some((version_lsn, value)) = iter.next().transpose()? {
match ObjectValue::des_relsize(&value)? {
RelationSizeEntry::Size(nblocks) => {
let value = ObjectValue::des(&value)?;
match value {
ObjectValue::RelationSize(nblocks) => {
trace!(
"relation {} has size {} at {} (request {})",
rel,
@@ -804,7 +639,7 @@ impl ObjectTimeline {
);
Ok(Some(nblocks))
}
RelationSizeEntry::Unlink => {
ObjectValue::Unlink => {
trace!(
"relation {} not found; it was dropped at lsn {}",
rel,
@@ -812,9 +647,15 @@ impl ObjectTimeline {
);
Ok(None)
}
_ => bail!(
"Unexpect relation {} size value {:?} at {}",
rel,
value,
lsn
),
}
} else {
info!("relation {} not found at {}", rel, lsn);
debug!("relation {} not found at {}", rel, lsn);
Ok(None)
}
}
@@ -842,20 +683,21 @@ impl ObjectTimeline {
};
let mut iter = self.object_versions(&*self.obj_store, &searchkey, lsn)?;
while let Some((_key, value)) = iter.next().transpose()? {
match ObjectValue::des_page(&value)? {
PageEntry::Page(img) => {
match ObjectValue::des(&value)? {
ObjectValue::Page(img) => {
// We have a base image. No need to dig deeper into the list of
// records
base_img = Some(img);
break;
}
PageEntry::WALRecord(rec) => {
ObjectValue::WALRecord(rec) => {
records.push(rec.clone());
// If this WAL record initializes the page, no need to dig deeper.
if rec.will_init {
break;
}
}
x => bail!("Unexpected object value {:?}", x),
}
}
records.reverse();
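This hunk is the version walk at the heart of collect_records_for_apply(): newest to oldest, stopping at a full page image or a will_init record, then replaying oldest-first. A self-contained sketch of that control flow (types are simplified stand-ins):

```rust
// Simplified stand-ins for the stored page versions.
#[derive(Clone, Debug)]
enum Version {
    Page(Vec<u8>),
    WalRecord { will_init: bool, payload: Vec<u8> },
}

// Walk versions newest-first, gathering WAL records until a base image or a
// will_init record is found, then return them oldest-first for replay.
fn collect_for_apply(versions_newest_first: &[Version]) -> (Option<Vec<u8>>, Vec<Vec<u8>>) {
    let mut base_img = None;
    let mut records = Vec::new();
    for v in versions_newest_first {
        match v {
            Version::Page(img) => {
                base_img = Some(img.clone()); // base image: no need to dig deeper
                break;
            }
            Version::WalRecord { will_init, payload } => {
                records.push(payload.clone());
                if *will_init {
                    break; // record rebuilds the page from scratch
                }
            }
        }
    }
    records.reverse(); // apply in original (oldest-first) order
    (base_img, records)
}

fn main() {
    let versions = vec![
        Version::WalRecord { will_init: false, payload: b"rec2".to_vec() },
        Version::WalRecord { will_init: false, payload: b"rec1".to_vec() },
        Version::Page(b"base".to_vec()),
    ];
    let (base, recs) = collect_for_apply(&versions);
    assert_eq!(base, Some(b"base".to_vec()));
    assert_eq!(recs, vec![b"rec1".to_vec(), b"rec2".to_vec()]);
    println!("ok");
}
```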
@@ -867,15 +709,170 @@
.name("Garbage collection thread".into())
.spawn(move || {
// FIXME
timeline_rc.gc_loop(conf).expect("GC thread died");
timeline_rc.do_gc(conf).expect("GC thread died");
})
.unwrap();
}

fn gc_loop(&self, conf: &'static PageServerConf) -> Result<()> {
fn do_gc(&self, conf: &'static PageServerConf) -> Result<()> {
loop {
thread::sleep(conf.gc_period);
self.gc_iteration(conf.gc_horizon)?;
let last_lsn = self.get_last_valid_lsn();

// checked_sub() returns None on overflow.
if let Some(horizon) = last_lsn.checked_sub(conf.gc_horizon) {
// WAL is large enough to perform GC
let now = Instant::now();
let mut truncated = 0u64;
let mut inspected = 0u64;
let mut deleted = 0u64;

// Iterate through all objects in timeline
for obj in self
.obj_store
.list_objects(self.timelineid, false, last_lsn)?
{
inspected += 1;
match obj {
// Prepared transactions
ObjectTag::TwoPhase(prepare) => {
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
if self.get_tx_status(prepare.xid, horizon)?
!= pg_constants::TRANSACTION_STATUS_IN_PROGRESS
{
let lsn = vers.0;
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
ObjectTag::RelationMetadata(_) => {
// Do not need to reconstruct page images,
// just delete all old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::Unlink => {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
_ => (), // preserve last version
}
last_version = false;
truncated += 1;
} else {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
ObjectTag::RelationBuffer(tag) => {
// Reconstruct page at horizon unless relation was dropped
// and delete all older versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
truncated += 1;
last_version = false;
if let Some(rel_size) = self.relsize_get_nowait(tag.rel, lsn)? {
if rel_size > tag.blknum {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
continue;
}
debug!("Drop last block {} of relation {:?} at {} because it is beyond relation size {}", tag.blknum, tag.rel, lsn, rel_size);
} else {
if let Some(rel_size) =
self.relsize_get_nowait(tag.rel, last_lsn)?
{
debug!("Preserve block {} of relation {:?} at {} because relation has size {} at {}", tag.rel, tag, lsn, rel_size, last_lsn);
continue;
}
debug!("Relation {:?} was dropped at {}", tag.rel, lsn);
}
// relation was dropped or truncated so this block can be removed
}
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
// SLRU-s
ObjectTag::Clog(_)
| ObjectTag::MultiXactOffsets(_)
| ObjectTag::MultiXactMembers(_) => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::Unlink => {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
ObjectValue::WALRecord(_) => {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
}
_ => {} // do nothing if already materialized
}
last_version = false;
truncated += 1;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
// versioned alwaysmaterialized objects: no need to reconstruct pages
ObjectTag::Checkpoint | ObjectTag::ControlFile => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
// presrve last version
last_version = false;
truncated += 1;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
_ => (), // do nothing
}
}
info!("Garbage collection completed in {:?}:\n{} version chains inspected, {} version histories truncated, {} versions deleted",
now.elapsed(), inspected, truncated, deleted);
}
}
}
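The do_gc() loop above starts from last_lsn.checked_sub(gc_horizon), skipping the pass entirely while the WAL is still shorter than the horizon. A minimal sketch of that cutoff arithmetic (the Lsn wrapper here is a stand-in for zenith_utils::lsn::Lsn):

```rust
// Stand-in for zenith_utils::lsn::Lsn, just enough for the arithmetic.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

impl Lsn {
    fn checked_sub(self, other: u64) -> Option<Lsn> {
        self.0.checked_sub(other).map(Lsn)
    }
}

fn main() {
    let gc_horizon = 1024;
    for last_lsn in [Lsn(512), Lsn(4096)] {
        match last_lsn.checked_sub(gc_horizon) {
            Some(cutoff) => println!("{:?}: collect versions older than {:?}", last_lsn, cutoff),
            None => println!("{:?}: WAL shorter than horizon, skip this pass", last_lsn),
        }
    }
}
```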
@@ -930,52 +927,26 @@ impl ObjectTimeline {

Ok(ObjectVersionIter {
obj_store,
object_tag: key.tag,
tag: key.tag,
current_iter,
ancestor_timeline: self.ancestor_timeline,
ancestor_lsn: self.ancestor_lsn,
})
}

//
// Helper functions to store different kinds of objects to the underlying ObjectStore
//
fn put_page_entry(&self, tag: &ObjectTag, lsn: Lsn, val: PageEntry) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag: *tag,
};
let val = ObjectValue::Page(val);

self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)
}

fn put_relsize_entry(&self, tag: &RelTag, lsn: Lsn, val: RelationSizeEntry) -> Result<()> {
let key = relation_size_key(self.timelineid, *tag);
let val = ObjectValue::RelationSize(val);

self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)
}

fn put_timeline_metadata_entry(&self, val: MetadataEntry) -> Result<()> {
let key = timeline_metadata_key(self.timelineid);
let val = ObjectValue::TimelineMetadata(val);

self.obj_store.put(&key, Lsn(0), &ObjectValue::ser(&val)?)
}
}

struct ObjectHistory<'a> {
iter: Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>,
lsn: Lsn,
last_relation_size: Option<(RelTag, u32)>,
}

impl<'a> Iterator for ObjectHistory<'a> {
type Item = Result<RelationUpdate>;
type Item = Result<Modification>;

fn next(&mut self) -> Option<Self::Item> {
self.next_result().transpose()
self.iter
.next()
.map(|result| result.map(|t| Modification::new(t)))
}
}
@@ -985,117 +956,29 @@ impl<'a> History for ObjectHistory<'a> {
}
}

impl<'a> ObjectHistory<'a> {
fn handle_relation_size(
&mut self,
rel_tag: RelTag,
entry: RelationSizeEntry,
) -> Option<Update> {
match entry {
RelationSizeEntry::Size(size) => {
// we only want to output truncations, expansions are filtered out
let last_relation_size = self.last_relation_size.replace((rel_tag, size));

match last_relation_size {
Some((last_buf, last_size)) if last_buf != rel_tag || size < last_size => {
Some(Update::Truncate { n_blocks: size })
}
_ => None,
}
}
RelationSizeEntry::Unlink => Some(Update::Unlink),
}
}

fn handle_page(&mut self, buf_tag: BufferTag, entry: PageEntry) -> Update {
match entry {
PageEntry::Page(img) => Update::Page {
blknum: buf_tag.blknum,
img,
},
PageEntry::WALRecord(rec) => Update::WALRecord {
blknum: buf_tag.blknum,
rec,
},
}
}

fn next_result(&mut self) -> Result<Option<RelationUpdate>> {
while let Some((object_tag, lsn, value)) = self.iter.next().transpose()? {
let (rel_tag, update) = match object_tag {
ObjectTag::RelationMetadata(rel_tag) => {
let entry = ObjectValue::des_relsize(&value)?;
match self.handle_relation_size(rel_tag, entry) {
Some(relation_update) => (rel_tag, relation_update),
None => continue,
}
}
ObjectTag::RelationBuffer(buf_tag) => {
let entry = ObjectValue::des_page(&value)?;
let update = self.handle_page(buf_tag, entry);

(buf_tag.rel, update)
}
_ => continue,
};

return Ok(Some(RelationUpdate {
rel: rel_tag,
lsn,
update,
}));
}

Ok(None)
}
}

///
/// We store several kinds of objects in the repository.
/// We have per-page, per-relation and per-timeline entries.
/// We have per-page, per-relation (or non-rel file) and per-timeline entries.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum ObjectValue {
Page(PageEntry),
RelationSize(RelationSizeEntry),
TimelineMetadata(MetadataEntry),
SLRUTruncate,
}

///
/// This is what we store for each page in the object store. Use
/// ObjectTag::RelationBuffer as key.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum PageEntry {
/// Ready-made image of the block
pub enum ObjectValue {
/// Ready-made images of the block
Page(Bytes),

/// WAL record, to be applied on top of the "previous" entry
/// WAL records, to be applied on top of the "previous" entry
///
/// Some WAL records will initialize the page from scratch. For such records,
/// the 'will_init' flag is set. They don't need the previous page image before
/// applying. The 'will_init' flag is set for records containing a full-page image,
/// and for records with the BKPBLOCK_WILL_INIT flag. These differ from Page images
/// stored directly in the repository in that you still need to run the WAL redo
/// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
/// stored directly in the cache entry in that you still need to run the WAL redo
/// routine to generate the page image.
WALRecord(WALRecord),
}

///
/// In addition to page versions, we store relation size as a separate, versioned,
/// object. That way we can answer nblocks requests faster, and we also use it to
/// support relation truncation without having to add a tombstone page version for
/// each block that is truncated away.
///
/// Use ObjectTag::RelationMetadata as the key.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum RelationSizeEntry {
Size(u32),

/// RelationSize. We store it separately not only to answer nblocks requests faster;
/// we also need it to support relation truncation.
RelationSize(u32),
/// Tombstone for a dropped relation.
Unlink,
TimelineMetadata(MetadataEntry),
}

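Every value lands in the store as a serialized `ObjectValue`, so readers must match on the variant they expect, as the `des_*` helpers further down do. A hedged round-trip sketch of that pattern, using plain serde with bincode (the repository actually uses its `BeSer` helper, so treat the codec here as an assumption):

    use serde::{Deserialize, Serialize};

    #[derive(Debug, Serialize, Deserialize)]
    enum Value {
        Page(Vec<u8>),
        RelationSize(u32),
    }

    fn main() {
        // Serialize one variant, then insist on the same variant when reading,
        // mirroring what des_relsize() does.
        let bytes = bincode::serialize(&Value::RelationSize(42)).unwrap();
        match bincode::deserialize::<Value>(&bytes).unwrap() {
            Value::RelationSize(n) => assert_eq!(n, 42),
            _ => panic!("invalid object kind, expected a relation size entry"),
        }
    }
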
const fn relation_size_key(timelineid: ZTimelineId, rel: RelTag) -> ObjectKey {
@@ -1106,9 +989,8 @@ const fn relation_size_key(timelineid: ZTimelineId, rel: RelTag) -> ObjectKey {
}

///
/// In addition to the per-page and per-relation entries, we also store
/// a little metadata blob for each timeline. This is not versioned, use
/// ObjectTag::TimelineMetadataTag with constant Lsn(0) as the key.
/// In addition to those per-page and per-relation entries, we also
/// store a little metadata blob for each timeline.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetadataEntry {
@@ -1125,44 +1007,6 @@ const fn timeline_metadata_key(timelineid: ZTimelineId) -> ObjectKey {
}
}

///
/// Helper functions to deserialize ObjectValue, when the caller knows what kind of
/// a value it should be.
///
/// There are no matching helper functions for serializing. Instead, there are
/// `put_page_entry`, `put_relsize_entry`, and `put_timeline_metadata_entry` helper
/// functions in ObjectTimeline that both construct the right kind of key and
/// serialize the value in the same call.
///
impl ObjectValue {
fn des_page(v: &[u8]) -> Result<PageEntry> {
match ObjectValue::des(&v)? {
ObjectValue::Page(p) => Ok(p),
_ => {
bail!("Invalid object kind, expected a page entry");
}
}
}

fn des_relsize(v: &[u8]) -> Result<RelationSizeEntry> {
match ObjectValue::des(&v)? {
ObjectValue::RelationSize(rs) => Ok(rs),
_ => {
bail!("Invalid object kind, expected a relation size entry");
}
}
}

fn des_timeline_metadata(v: &[u8]) -> Result<MetadataEntry> {
match ObjectValue::des(&v)? {
ObjectValue::TimelineMetadata(t) => Ok(t),
_ => {
bail!("Invalid object kind, expected a timeline metadata entry");
}
}
}
}

///
/// Iterator for `object_versions`. Returns all page versions of a given block, in
/// reverse LSN order. This implements the traversal of ancestor timelines. If
@@ -1172,7 +1016,7 @@ impl ObjectValue {
struct ObjectVersionIter<'a> {
obj_store: &'a dyn ObjectStore,

object_tag: ObjectTag,
tag: ObjectTag,

/// Iterator on the current timeline.
current_iter: Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>,
@@ -1209,7 +1053,7 @@ impl<'a> ObjectVersionIter<'a> {
if let Some(ancestor_timeline) = self.ancestor_timeline {
let searchkey = ObjectKey {
timeline: ancestor_timeline,
tag: self.object_tag,
tag: self.tag,
};
let ancestor_iter = self
.obj_store
@@ -1222,10 +1066,13 @@ impl<'a> ObjectVersionIter<'a> {
.obj_store
.get(&timeline_metadata_key(ancestor_timeline), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let ancestor_metadata = ObjectValue::des_timeline_metadata(&v)?;
self.ancestor_timeline = ancestor_metadata.ancestor_timeline;
self.ancestor_lsn = ancestor_metadata.ancestor_lsn;
self.current_iter = ancestor_iter;
if let ObjectValue::TimelineMetadata(ancestor_metadata) = ObjectValue::des(&v)? {
self.ancestor_timeline = ancestor_metadata.ancestor_timeline;
self.ancestor_lsn = ancestor_metadata.ancestor_lsn;
self.current_iter = ancestor_iter;
} else {
bail!("Invalid timeline metadata");
}
} else {
return Ok(None);
}

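When the current timeline's version iterator is exhausted, the code above looks up the ancestor timeline recorded in its metadata entry and continues iterating there. A simplified sketch of that hop, with hypothetical types standing in for the real iterator:

    // Hypothetical, flattened view of the ancestor traversal in
    // ObjectVersionIter: drain the current timeline, then follow the
    // ancestor pointer until a version turns up or the root is reached.
    struct VersionIter {
        current: Vec<u64>,                  // versions on this timeline, newest last
        ancestor: Option<Box<VersionIter>>, // parent timeline, if any
    }

    impl VersionIter {
        fn next_version(&mut self) -> Option<u64> {
            if let Some(v) = self.current.pop() {
                return Some(v);
            }
            // switch to the ancestor timeline and keep going
            self.ancestor.as_mut().and_then(|a| a.next_version())
        }
    }
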
@@ -1,13 +1,19 @@
//! Low-level key-value storage abstraction.
//!
use crate::object_key::*;
use crate::repository::RelTag;
use crate::repository::{ObjectTag, RelTag};
use crate::ZTimelineId;
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::iter::Iterator;
use zenith_utils::lsn::Lsn;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectKey {
pub timeline: ZTimelineId,
pub tag: ObjectTag,
}

///
/// Low-level storage abstraction.
///
@@ -34,9 +40,6 @@ pub trait ObjectStore: Send + Sync {
/// correspond to any real relation.
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>>;

/// Read the first key greater than or equal to the specified one
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>>;

/// Iterate through all page versions of one object.
///
/// Returns all page versions in descending LSN order, along with the LSN
@@ -82,7 +85,4 @@ pub trait ObjectStore: Send + Sync {

/// Unlink object (used by GC). This method may actually delete the object or just mark it for deletion.
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()>;

// Compact storage and remove versions marked for deletion
fn compact(&self);
}

@@ -5,8 +5,7 @@

use crate::object_repository::ObjectRepository;
use crate::repository::Repository;
//use crate::rocksdb_storage::RocksObjectStore;
use crate::inmem_storage::InmemObjectStore;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use lazy_static::lazy_static;
@@ -19,8 +18,7 @@ lazy_static! {
pub fn init(conf: &'static PageServerConf) {
let mut m = REPOSITORY.lock().unwrap();

//let obj_store = RocksObjectStore::open(conf).unwrap();
let obj_store = InmemObjectStore::open(conf).unwrap();
let obj_store = RocksObjectStore::open(conf).unwrap();

// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf);

@@ -21,16 +21,13 @@ use std::thread;
use std::{io, net::TcpStream};
use zenith_utils::postgres_backend;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{
BeMessage, FeMessage, RowDescriptor, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC,
};
use zenith_utils::pq_proto::{BeMessage, FeMessage, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC};
use zenith_utils::{bin_ser::BeSer, lsn::Lsn};

use crate::basebackup;
use crate::branches;
use crate::object_key::ObjectTag;
use crate::page_cache;
use crate::repository::{BufferTag, RelTag, RelationUpdate, Update};
use crate::repository::{BufferTag, Modification, ObjectTag, RelTag};
use crate::restore_local_repo;
use crate::walreceiver;
use crate::PageServerConf;
@@ -413,38 +410,14 @@ impl postgres_backend::Handler for PageServerHandler {
while let Some(msg) = pgb.read_message()? {
match msg {
FeMessage::CopyData(bytes) => {
let relation_update = RelationUpdate::des(&bytes)?;
let modification = Modification::des(&bytes)?;

last_lsn = relation_update.lsn;

match relation_update.update {
Update::Page { blknum, img } => {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: relation_update.rel,
blknum,
});

timeline.put_page_image(tag, relation_update.lsn, img, true)?;
}
Update::WALRecord { blknum, rec } => {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: relation_update.rel,
blknum,
});

timeline.put_wal_record(tag, rec)?;
}
Update::Truncate { n_blocks } => {
timeline.put_truncation(
relation_update.rel,
relation_update.lsn,
n_blocks,
)?;
}
Update::Unlink => {
todo!()
}
}
last_lsn = modification.lsn;
timeline.put_raw_data(
modification.tag,
last_lsn,
&modification.data[..],
)?;
}
FeMessage::CopyDone => {
timeline.advance_last_valid_lsn(last_lsn);
@@ -509,97 +482,11 @@ impl postgres_backend::Handler for PageServerHandler {
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?;
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(system_id.as_bytes())]))?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with(b"do_gc ") {
// Run GC immediately on given timeline.
// FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
// This probably should require special authentication or a global flag to
// enable, I don't think we want to or need to allow regular clients to invoke
// GC.
let query_str = std::str::from_utf8(&query_string)?;

let mut it = query_str.split(' ');
it.next().unwrap();

let timeline_id: ZTimelineId = it
.next()
.ok_or_else(|| anyhow!("missing timeline id"))?
.parse()?;
let timeline = page_cache::get_repository().get_timeline(timeline_id)?;

let horizon: u64 = it
.next()
.unwrap_or(&self.conf.gc_horizon.to_string())
.parse()?;

let result = timeline.gc_iteration(horizon)?;

pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor {
name: b"n_relations",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"truncated",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"prep_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"slru_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"chkp_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"dropped",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"elapsed",
typoid: 20,
typlen: 8,
..Default::default()
},
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(&result.n_relations.to_string().as_bytes()),
Some(&result.truncated.to_string().as_bytes()),
Some(&result.deleted.to_string().as_bytes()),
Some(&result.prep_deleted.to_string().as_bytes()),
Some(&result.slru_deleted.to_string().as_bytes()),
Some(&result.chkp_deleted.to_string().as_bytes()),
Some(&result.dropped.to_string().as_bytes()),
Some(&result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
bail!("unknown command");
}

pgb.flush()?;

Ok(())
}
}

@@ -1,4 +1,3 @@
use crate::object_key::*;
use crate::waldecoder::TransactionId;
use crate::ZTimelineId;
use anyhow::Result;
@@ -11,7 +10,6 @@ use std::collections::HashSet;
use std::fmt;
use std::iter::Iterator;
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::Lsn;

///
@@ -34,22 +32,6 @@ pub trait Repository: Send + Sync {
//fn get_stats(&self) -> RepositoryStats;
}

///
/// Result of performing GC
///
#[derive(Default)]
pub struct GcResult {
pub n_relations: u64,
pub inspected: u64,
pub truncated: u64,
pub deleted: u64,
pub prep_deleted: u64, // 2PC prepare
pub slru_deleted: u64, // SLRU (clog, multixact)
pub chkp_deleted: u64, // Checkpoints
pub dropped: u64,
pub elapsed: Duration,
}

pub trait Timeline: Send + Sync {
//------------------------------------------------------------------------------
// Public GET functions
@@ -67,7 +49,7 @@ pub trait Timeline: Send + Sync {
/// Does relation exist?
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;

/// Get a list of all distinct relations in given tablespace and database.
/// Get a list of all relations in given tablespace and database.
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;

/// Get a list of non-relational objects
@@ -79,29 +61,24 @@ pub trait Timeline: Send + Sync {
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------

/// Put raw data
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;

/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()>;

/// Put raw data
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;

/// Like put_wal_record, but with ready-made image of the page.
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool) -> Result<()>;
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes) -> Result<()>;

/// Truncate relation
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;

/// Unlink relation. This method is used for marking dropped relations.
fn put_unlink(&self, tag: RelTag, lsn: Lsn) -> Result<()>;

/// Truncate SLRU segment
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;

// Get the first object tag greater than or equal to the specified one
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>>;
/// Unlink object. This method is used for marking dropped relations
/// and removed segments of SLRUs.
fn put_unlink(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;

/// Remember that all WAL before the given LSN has been processed.
///
@@ -135,13 +112,6 @@ pub trait Timeline: Send + Sync {
// TODO ordering guarantee?
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>>;

/// Perform one garbage collection iteration.
/// Garbage collection is periodically performed by the GC thread,
/// but it can also be explicitly requested through the page server API.
///
/// `horizon` specifies the delta from the last LSN within which all object versions are preserved (PITR interval).
fn gc_iteration(&self, horizon: u64) -> Result<GcResult>;

// Check transaction status
fn get_tx_status(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<u8> {
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
@@ -152,24 +122,26 @@ pub trait Timeline: Send + Sync {
}
}

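The default `get_tx_status` locates a transaction's CLOG page with plain integer division. Assuming Postgres's usual layout of four 2-bit status entries per byte on an 8 KB page (so CLOG_XACTS_PER_PAGE = 32768), the arithmetic works out like this:

    // Assumed constant: BLCKSZ (8192) * 4 statuses per byte.
    const CLOG_XACTS_PER_PAGE: u32 = 32768;

    fn clog_location(xid: u32) -> (u32, u32) {
        let blknum = xid / CLOG_XACTS_PER_PAGE; // which CLOG page holds the status
        let index = xid % CLOG_XACTS_PER_PAGE;  // which entry within that page
        (blknum, index)
    }
    // e.g. xid 100_000 maps to CLOG page 3, entry 1696.
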
pub trait History: Iterator<Item = Result<RelationUpdate>> {
pub trait History: Iterator<Item = Result<Modification>> {
/// The last_valid_lsn at the time of the history() call.
fn lsn(&self) -> Lsn;
}

#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct RelationUpdate {
pub rel: RelTag,
pub struct Modification {
pub tag: ObjectTag,
pub lsn: Lsn,
pub update: Update,
pub data: Vec<u8>,
}

#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum Update {
Page { blknum: u32, img: Bytes },
WALRecord { blknum: u32, rec: WALRecord },
Truncate { n_blocks: u32 },
Unlink,
impl Modification {
pub fn new(entry: (ObjectTag, Lsn, Vec<u8>)) -> Modification {
Modification {
tag: entry.0,
lsn: entry.1,
data: entry.2,
}
}
}

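A `Modification` is just the raw (tag, lsn, data) triple in struct form, which is what lets the CopyData handler in page_service forward history entries verbatim via `put_raw_data`. A small fragment, assuming this crate's types are in scope:

    // Build a Modification from an iterator tuple and pass its parts on.
    let entry = (ObjectTag::Checkpoint, Lsn(0x16), vec![0u8; 8]);
    let m = Modification::new(entry);
    assert_eq!(m.lsn, Lsn(0x16));
    // timeline.put_raw_data(m.tag, m.lsn, &m.data[..])?;
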
#[derive(Clone)]
@@ -195,6 +167,10 @@ pub struct RepositoryStats {
/// are used for the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
///
/// We use additional fork numbers to logically separate relational and
/// non-relational data inside pageserver key-value storage.
/// See, e.g., `ROCKSDB_SPECIAL_FORKNUM`.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct RelTag {
pub forknum: u8,
@@ -237,6 +213,8 @@ impl fmt::Display for RelTag {
/// In Postgres, the `BufferTag` structure is used for exactly the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
///
/// NOTE: In this context we use buffer, block and page interchangeably when speaking about relation files.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct BufferTag {
pub rel: RelTag,
@@ -250,6 +228,71 @@ impl BufferTag {
};
}

///
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
/// in Postgres are handled by the SLRU (Simple LRU) buffer manager, hence the name.
///
/// These files are global for a Postgres instance.
///
/// These files are divided into segments, which are divided into pages
/// of the same BLCKSZ as used for relation files.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct SlruBufferTag {
pub blknum: u32,
}

///
/// Special type of Postgres files: pg_filenode.map is needed to map
/// catalog table OIDs to filenode numbers, which define the filename.
///
/// Each database has a map file for its local mapped catalogs,
/// and there is a separate map file for shared catalogs.
///
/// These files have an atypical size of 512 bytes.
///
/// See PostgreSQL relmapper.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct DatabaseTag {
pub spcnode: u32,
pub dbnode: u32,
}

///
/// Non-relation files that keep state for prepared transactions.
/// Unlike other files, these are not divided into pages.
///
/// See PostgreSQL twophase.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct PrepareTag {
pub xid: TransactionId,
}

/// ObjectTag is a part of ObjectKey that is specific
/// to the type of the stored object.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ObjectTag {
// dummy tag preceding all other keys
FirstTag,
TimelineMetadataTag,
// Special entry that represents a PostgreSQL checkpoint.
// We use it to track fields needed to restore the control-file checkpoint.
Checkpoint,
// Various types of non-relation files.
// We need them to bootstrap the compute node.
ControlFile,
Clog(SlruBufferTag),
MultiXactMembers(SlruBufferTag),
MultiXactOffsets(SlruBufferTag),
FileNodeMap(DatabaseTag),
TwoPhase(PrepareTag),
// put relations at the end of the enum to allow efficient iteration through non-rel objects
RelationMetadata(RelTag),
RelationBuffer(BufferTag),
}

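Because `ObjectTag` derives `Ord` and the relation variants are declared last, a key scan starting at `FirstTag` visits all non-relation objects before any relation data; `get_next_tag` and the SLRU truncation code rely on this. A fragment illustrating the derived ordering (variant payloads as defined above):

    // Derived enum ordering: earlier variants compare less than later ones.
    assert!(ObjectTag::Checkpoint < ObjectTag::Clog(SlruBufferTag { blknum: 0 }));
    assert!(
        ObjectTag::Clog(SlruBufferTag { blknum: 9 })
            < ObjectTag::RelationMetadata(RelTag {
                forknum: 0,
                spcnode: 0,
                dbnode: 0,
                relnode: 0,
            })
    );
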
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct WALRecord {
pub lsn: Lsn, // LSN at the *end* of the record
@@ -291,6 +334,7 @@ impl WALRecord {
mod tests {
use super::*;
use crate::object_repository::ObjectRepository;
use crate::object_repository::ObjectValue;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
@@ -299,6 +343,7 @@ mod tests {
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use zenith_utils::bin_ser::BeSer;

/// Arbitrary relation tag, for testing.
const TESTREL_A: RelTag = RelTag {
@@ -338,7 +383,7 @@ mod tests {
interactive: false,
gc_horizon: 64 * 1024 * 1024,
gc_period: Duration::from_secs(10),
listen_addr: "127.0.0.1:5430".to_string(),
listen_addr: "127.0.0.1:5430".parse().unwrap(),
workdir: repo_dir,
pg_distrib_dir: "".into(),
};
@@ -367,11 +412,11 @@ mod tests {
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;

tline.init_valid_lsn(Lsn(1));
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"), true)?;
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"))?;
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"))?;

tline.advance_last_valid_lsn(Lsn(5));

@@ -458,7 +503,7 @@ mod tests {
for i in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
lsn += 1;
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img, true)?;
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img)?;
}
tline.advance_last_valid_lsn(Lsn(lsn));

@@ -495,48 +540,82 @@ mod tests {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;

let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(0));
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
assert_eq!(None, snapshot.next().transpose()?);

// add a page and advance the last valid LSN
let rel = TESTREL_A;
let tag = TEST_BUF(1);
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"), true)?;
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"))?;
tline.advance_last_valid_lsn(Lsn(1));
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(1));
let expected_page = RelationUpdate {
rel: rel,
lsn: Lsn(1),
update: Update::Page {
blknum: 1,
img: TEST_IMG("blk 1 @ lsn 1"),
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
let expected_page = Modification {
tag,
lsn: Lsn(1),
data: ObjectValue::ser(&ObjectValue::Page(TEST_IMG("blk 1 @ lsn 1")))?,
};
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);

// truncate to zero, but don't advance the last valid LSN
tline.put_truncation(rel, Lsn(2), 0)?;
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(1));
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);

// advance the last valid LSN and the truncation should be observable
tline.advance_last_valid_lsn(Lsn(2));
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(2));

// TODO ordering not guaranteed by API. But currently it returns the
// truncation entry before the block data.
let expected_truncate = RelationUpdate {
rel: rel,
lsn: Lsn(2),
update: Update::Truncate { n_blocks: 0 },
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationMetadata(_) => false,
_ => true,
},
_ => true,
});
let expected_truncate = Modification {
tag: ObjectTag::RelationMetadata(rel),
lsn: Lsn(1),
data: ObjectValue::ser(&ObjectValue::RelationSize(2))?,
};
assert_eq!(Some(expected_truncate), snapshot.next().transpose()?);
assert_eq!(
Some(&expected_truncate),
snapshot.next().transpose()?.as_ref()
); // TODO ordering not guaranteed by API
let expected_truncate = Modification {
tag: ObjectTag::RelationMetadata(rel),
lsn: Lsn(2),
data: ObjectValue::ser(&ObjectValue::RelationSize(0))?,
};
assert_eq!(
Some(&expected_truncate),
snapshot.next().transpose()?.as_ref()
); // TODO ordering not guaranteed by API
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);


@@ -12,21 +12,21 @@ use std::io::SeekFrom;
use std::path::{Path, PathBuf};

use anyhow::Result;
use bytes::{Buf, Bytes};
use bytes::Bytes;

use crate::object_key::*;
use crate::repository::*;
use crate::waldecoder::*;
use crate::repository::{
BufferTag, DatabaseTag, ObjectTag, PrepareTag, RelTag, SlruBufferTag, Timeline, WALRecord,
};
use crate::waldecoder::{decode_wal_record, DecodedWALRecord, Oid, WalStreamDecoder};
use crate::waldecoder::{XlCreateDatabase, XlSmgrTruncate};
use crate::PageServerConf;
use crate::ZTimelineId;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;

const MAX_MBR_BLKNO: u32 =
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

///
/// Find the latest snapshot in a timeline's 'snapshots' directory
///
@@ -203,7 +203,7 @@ fn import_relfile(
},
blknum,
});
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf), true)?;
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf))?;
}

// TODO: UnexpectedEof is expected
@@ -236,7 +236,7 @@ fn import_nonrel_file(
// read the whole file
file.read_to_end(&mut buffer)?;

timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]), false)?;
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}

@@ -256,7 +256,7 @@ fn import_slru_file(
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
timeline.put_page_image(gen_tag(blknum), lsn, Bytes::copy_from_slice(&buf), false)?;
timeline.put_page_image(gen_tag(blknum), lsn, Bytes::copy_from_slice(&buf))?;
}

// TODO: UnexpectedEof is expected
@@ -289,8 +289,6 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:

let checkpoint_bytes = timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint)?;
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
// get_page_at_lsn_nowait returns zeroed pages when the object is not found in storage.
// nextXid cannot be zero, so this check detects when the checkpoint record still needs to be initialized.
if checkpoint.nextXid.value == 0 {
let pg_control_bytes =
timeline.get_page_at_lsn_nowait(ObjectTag::ControlFile, startpoint)?;
@@ -339,8 +337,8 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
break;
}
if let Some((lsn, recdata)) = rec.unwrap() {
let decoded = decode_wal_record(recdata.clone());
save_decoded_record(&mut checkpoint, timeline, &decoded, recdata, lsn)?;
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
save_decoded_record(timeline, &decoded, recdata, lsn)?;
last_lsn = lsn;
} else {
break;
@@ -360,7 +358,7 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
}
info!("reached end of WAL at {}", last_lsn);
let checkpoint_bytes = checkpoint.encode();
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes, false)?;
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes)?;
Ok(())
}

@@ -369,163 +367,40 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
/// relations/pages that the record affects.
///
pub fn save_decoded_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
decoded: &DecodedWALRecord,
recdata: Bytes,
lsn: Lsn,
) -> Result<()> {
checkpoint.update_next_xid(decoded.xl_xid);

// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
},
blknum: blk.blkno,
});

let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};

timeline.put_wal_record(tag, rec)?;
if blk.will_drop {
timeline.put_unlink(blk.tag, lsn)?;
} else {
let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(blk.tag, rec)?;
}
}

let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);

// Handle a few special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&mut buf);
let truncate = XlSmgrTruncate::decode(&decoded);
save_xlog_smgr_truncate(timeline, lsn, &truncate)?;
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE {
let createdb = XlCreateDatabase::decode(&mut buf);
save_xlog_dbase_create(timeline, lsn, &createdb)?;
} else {
// TODO
trace!("XLOG_DBASE_DROP is not handled yet");
}
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
} else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
let blknum = buf.get_u32_le();
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
if info == pg_constants::CLOG_ZEROPAGE {
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(tag, rec)?;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
checkpoint.oldestXid = buf.get_u32_le();
checkpoint.oldestXidDB = buf.get_u32_le();
trace!(
"RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
blknum,
checkpoint.oldestXid,
checkpoint.oldestXidDB
);
if let Some(ObjectTag::Clog(first_slru_tag)) =
timeline.get_next_tag(ObjectTag::Clog(SlruBufferTag { blknum: 0 }))?
{
for trunc_blknum in first_slru_tag.blknum..=blknum {
let tag = ObjectTag::Clog(SlruBufferTag {
blknum: trunc_blknum,
});
timeline.put_slru_truncate(tag, lsn)?;
}
}
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT
|| info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|| info == pg_constants::XLOG_XACT_ABORT
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
} else if info == pg_constants::XLOG_XACT_PREPARE {
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(
ObjectTag::TwoPhase(PrepareTag {
xid: decoded.xl_xid,
}),
rec,
)?;
}
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
{
let blknum = buf.get_u32_le();
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
let tag = if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
ObjectTag::MultiXactOffsets(SlruBufferTag { blknum })
} else {
ObjectTag::MultiXactMembers(SlruBufferTag { blknum })
};
timeline.put_wal_record(tag, rec)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
save_relmap_record(timeline, lsn, &xlrec, decoded)?;
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
checkpoint.nextOid = next_oid;
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid,
checkpoint.oldestXid
);
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
}
}
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE
{
let createdb = XlCreateDatabase::decode(&decoded);
save_xlog_dbase_create(timeline, lsn, &createdb)?;
}

// Now that this record has been handled, let the repository know that
// it is up-to-date to this LSN
timeline.advance_last_record_lsn(lsn);
@@ -579,7 +454,7 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab

debug!("copying block {:?} to {:?}", src_key, dst_key);

timeline.put_page_image(dst_key, lsn, content, true)?;
timeline.put_page_image(dst_key, lsn, content)?;
num_blocks_copied += 1;
}

@@ -600,7 +475,7 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
spcnode: tablespace_id,
dbnode: db_id,
});
timeline.put_page_image(new_tag, lsn, img, false)?;
timeline.put_page_image(new_tag, lsn, img)?;
break;
}
}
@@ -675,158 +550,3 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
}
Ok(())
}

/// Subroutine of save_decoded_record(), to handle XLOG_XACT_* records.
///
/// We are currently only interested in the dropped relations.
fn save_xact_record(
timeline: &dyn Timeline,
lsn: Lsn,
parsed: &XlXactParsedRecord,
decoded: &DecodedWALRecord,
) -> Result<()> {
// Record update of CLOG page
let mut blknum = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
let rec = WALRecord {
lsn,
will_init: false,
rec: decoded.record.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(tag, rec.clone())?;

for subxact in &parsed.subxacts {
let subxact_blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if subxact_blknum != blknum {
blknum = subxact_blknum;
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
timeline.put_wal_record(tag, rec.clone())?;
}
}
for xnode in &parsed.xnodes {
for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM {
let rel_tag = RelTag {
forknum,
spcnode: xnode.spcnode,
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
timeline.put_unlink(rel_tag, lsn)?;
}
}
Ok(())
}

fn save_multixact_create_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
lsn: Lsn,
xlrec: &XlMultiXactCreate,
decoded: &DecodedWALRecord,
) -> Result<()> {
let rec = WALRecord {
lsn,
will_init: false,
rec: decoded.record.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
timeline.put_wal_record(tag, rec.clone())?;

let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let last_mbr_blkno =
(xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
// The members SLRU can, in contrast to the offsets one, be filled to almost
// the full range at once. So we need to handle wraparound.
let mut blknum = first_mbr_blkno;
loop {
// Update members page
let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
timeline.put_wal_record(tag, rec.clone())?;

if blknum == last_mbr_blkno {
// last block inclusive
break;
}

// handle wraparound
if blknum == MAX_MBR_BLKNO {
blknum = 0;
} else {
blknum += 1;
}
}
if xlrec.mid >= checkpoint.nextMulti {
checkpoint.nextMulti = xlrec.mid + 1;
}
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
}
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
mbr.xid
} else {
acc
}
});
checkpoint.update_next_xid(max_mbr_xid);
Ok(())
}

fn save_multixact_truncate_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
lsn: Lsn,
xlrec: &XlMultiXactTruncate,
) -> Result<()> {
checkpoint.oldestMulti = xlrec.end_trunc_off;
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
let first_off_blkno = xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
let last_off_blkno = xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
// Delete all the segments except the last one. The last segment can still
// contain, possibly partially, valid data.
for blknum in first_off_blkno..last_off_blkno {
let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
timeline.put_slru_truncate(tag, lsn)?;
}
let first_mbr_blkno = xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let last_mbr_blkno = xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
// The members SLRU can, in contrast to the offsets one, be filled to almost
// the full range at once. So we need to handle wraparound.
let mut blknum = first_mbr_blkno;
// Delete all the segments but the last one. The last segment can still
// contain, possibly partially, valid data.
while blknum != last_mbr_blkno {
let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
timeline.put_slru_truncate(tag, lsn)?;
// handle wraparound
if blknum == MAX_MBR_BLKNO {
blknum = 0;
} else {
blknum += 1;
}
}
Ok(())
}

fn save_relmap_record(
timeline: &dyn Timeline,
lsn: Lsn,
xlrec: &XlRelmapUpdate,
decoded: &DecodedWALRecord,
) -> Result<()> {
let rec = WALRecord {
lsn,
will_init: true,
rec: decoded.record.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
let tag = ObjectTag::FileNodeMap(DatabaseTag {
spcnode: xlrec.tsid,
dbnode: xlrec.dbid,
});
timeline.put_wal_record(tag, rec)?;
Ok(())
}

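The member-SLRU loops above wrap the block number back to zero at MAX_MBR_BLKNO because a single multixact can span almost the whole members range. A compact sketch of that inclusive, wrapping walk:

    // Visit member blocks first..=last inclusive, wrapping at max_blkno,
    // like the loops in save_multixact_create_record above.
    fn walk_member_blocks(first: u32, last: u32, max_blkno: u32, mut visit: impl FnMut(u32)) {
        let mut blknum = first;
        loop {
            visit(blknum);
            if blknum == last {
                break;
            }
            blknum = if blknum == max_blkno { 0 } else { blknum + 1 };
        }
    }
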
274
pageserver/src/restore_s3.rs
Normal file
@@ -0,0 +1,274 @@
//
// Restore chunks from S3
//
// This runs once at Page Server startup. It loads all the "base images" from
// S3 into the in-memory page cache. It also initializes the "last valid LSN"
// in the page cache to the LSN of the base image, so that when the WAL receiver
// is started, it starts streaming from that LSN.
//

use bytes::{Buf, BytesMut};
use log::*;
use regex::Regex;
use std::env;
use std::fmt;

use s3::bucket::Bucket;
use s3::creds::Credentials;
use s3::region::Region;
use s3::S3Error;

use tokio::runtime;

use futures::future;

use crate::{page_cache, PageServerConf};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;

struct Storage {
region: Region,
credentials: Credentials,
bucket: String,
}

pub fn restore_main(conf: &PageServerConf) {
// Create a new thread pool
let runtime = runtime::Runtime::new().unwrap();

runtime.block_on(async {
let result = restore_chunk(conf).await;

match result {
Ok(_) => {}
Err(err) => {
error!("S3 error: {}", err);
}
}
});
}

//
// Restores one chunk from S3.
//
// 1. Fetch the last base image >= given LSN
// 2. Fetch all WAL
//
// Load it all into the page cache.
//
async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
let backend = Storage {
region: Region::Custom {
region: env::var("S3_REGION").unwrap(),
endpoint: env::var("S3_ENDPOINT").unwrap(),
},
credentials: Credentials::new(
Some(&env::var("S3_ACCESSKEY").unwrap()),
Some(&env::var("S3_SECRET").unwrap()),
None,
None,
None,
)
.unwrap(),
bucket: "zenith-testbucket".to_string(),
};

info!("Restoring from S3...");

// Create Bucket in REGION for BUCKET
let bucket = Bucket::new_with_path_style(&backend.bucket, backend.region, backend.credentials)?;

// List out contents of directory
let results: Vec<s3::serde_types::ListBucketResult> = bucket
.list("relationdata/".to_string(), Some("".to_string()))
.await?;

// TODO: get that from backup
let sys_id: u64 = 42;
let mut oldest_lsn = 0;
let mut slurp_futures: Vec<_> = Vec::new();

for result in results {
for object in result.contents {
// Download every relation file, slurping them into memory

let key = object.key;
let relpath = key.strip_prefix("relationdata/").unwrap();

let parsed = parse_rel_file_path(&relpath);

match parsed {
Ok(p) => {
if oldest_lsn == 0 || p.lsn < oldest_lsn {
oldest_lsn = p.lsn;
}
let b = bucket.clone();
let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);

slurp_futures.push(f);
}
Err(e) => {
warn!("unrecognized file: {} ({})", relpath, e);
}
};
}
}

if oldest_lsn == 0 {
panic!("no base backup found");
}

let pcache = page_cache::get_pagecache(conf, sys_id);
pcache.init_valid_lsn(oldest_lsn);

info!("{} files to restore...", slurp_futures.len());

future::join_all(slurp_futures).await;
info!("restored!");

Ok(())
}

#[derive(Debug)]
struct ParsedBaseImageFileName {
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
pub forknum: u8,
pub segno: u32,

pub lsn: u64,
}

// formats:
// <oid>
// <oid>_<fork name>
// <oid>.<segment number>
// <oid>_<fork name>.<segment number>

fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();

let caps = re
.captures(fname)
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;

let relnode_str = caps.name("relnode").unwrap().as_str();
let relnode: u32 = relnode_str.parse()?;

let forkname = caps.name("forkname").map(|f| f.as_str());
let forknum = forkname_to_forknum(forkname)?;

let segno_match = caps.name("segno");
let segno = if segno_match.is_none() {
0
} else {
segno_match.unwrap().as_str().parse::<u32>()?
};

// the captured LSN groups are hex digits, so parse them base-16
let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
let lsn = lsn_hi << 32 | lsn_lo;

Ok((relnode, forknum, segno, lsn))
}

fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
/*
* Relation data files can be in one of the following directories:
*
* global/
* shared relations
*
* base/<db oid>/
* regular relations, default tablespace
*
* pg_tblspc/<tblspc oid>/<tblspc version>/
* within a non-default tablespace (the name of the directory
* depends on version)
*
* And the relation data files themselves have a filename like:
*
* <oid>.<segment number>
*/
if let Some(fname) = path.strip_prefix("global/") {
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;

Ok(ParsedBaseImageFileName {
spcnode: pg_constants::GLOBALTABLESPACE_OID,
dbnode: 0,
relnode,
forknum,
segno,
lsn,
})
} else if let Some(dbpath) = path.strip_prefix("base/") {
let mut s = dbpath.split("/");
let dbnode_str = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
let dbnode: u32 = dbnode_str.parse()?;
let fname = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
if s.next().is_some() {
return Err(FilePathError::new("invalid relation data file name"));
};

let (relnode, forknum, segno, lsn) = parse_filename(fname)?;

Ok(ParsedBaseImageFileName {
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
dbnode,
relnode,
forknum,
segno,
lsn,
})
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
// TODO
Err(FilePathError::new("tablespaces not supported"))
} else {
Err(FilePathError::new("invalid relation data file name"))
}
}

//
// Load a base file from S3, and insert it into the page cache
//
async fn slurp_base_file(
conf: &PageServerConf,
sys_id: u64,
bucket: Bucket,
s3path: String,
parsed: ParsedBaseImageFileName,
) {
// FIXME: rust-s3 opens a new connection for each request. Should reuse
// the reqwest::Client object. But that requires changes to rust-s3 itself.
let (data, code) = bucket.get_object(s3path.clone()).await.unwrap();

trace!("got response: {} on {}", code, &s3path);
assert_eq!(200, code);

let mut bytes = BytesMut::from(data.as_slice()).freeze();

let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);

let pcache = page_cache::get_pagecache(conf, sys_id);

while bytes.remaining() >= 8192 {
let tag = page_cache::BufferTag {
rel: page_cache::RelTag {
spcnode: parsed.spcnode,
dbnode: parsed.dbnode,
relnode: parsed.relnode,
forknum: parsed.forknum,
},
blknum,
};

pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));

blknum += 1;
}
}
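As a usage sketch, a hypothetical object key such as relationdata/base/13008/16384_0000000000000028 would parse to dbnode 13008, relnode 16384, the main fork, segment 0, and LSN 0x28 (given that the trailing 16 hex digits are read as hex, per the from_str_radix calls above); the actual key names depend on what the backup tool uploaded.

    // Hypothetical input/output for parse_rel_file_path:
    // "base/13008/16384_0000000000000028"
    //   -> spcnode DEFAULTTABLESPACE_OID, dbnode 13008, relnode 16384,
    //      forknum 0 (main), segno 0, lsn 0x28
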
@@ -1,9 +1,8 @@
//!
//! An implementation of the ObjectStore interface, backed by RocksDB
//!
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::repository::RelTag;
use crate::object_store::{ObjectKey, ObjectStore};
use crate::repository::{BufferTag, ObjectTag, RelTag};
use crate::PageServerConf;
use crate::ZTimelineId;
use anyhow::{bail, Result};
@@ -25,7 +24,7 @@ impl StorageKey {
        Self {
            obj_key: ObjectKey {
                timeline,
                tag: ObjectTag::TimelineMetadataTag,
                tag: ObjectTag::FirstTag,
            },
            lsn: Lsn(0),
        }
@@ -94,21 +93,6 @@ impl ObjectStore for RocksObjectStore {
        }
    }

    fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
        let mut iter = self.db.raw_iterator();
        let search_key = StorageKey {
            obj_key: key.clone(),
            lsn: Lsn(0),
        };
        iter.seek(search_key.ser()?);
        if !iter.valid() {
            Ok(None)
        } else {
            let key = StorageKey::des(iter.key().unwrap())?;
            Ok(Some(key.obj_key.clone()))
        }
    }

    fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
        self.db.put(
            StorageKey::ser(&StorageKey {
@@ -167,43 +151,41 @@ impl ObjectStore for RocksObjectStore {

        let mut rels: HashSet<RelTag> = HashSet::new();

        let mut search_rel_tag = RelTag {
            spcnode,
            dbnode,
            relnode: 0,
            forknum: 0u8,
        let mut search_key = StorageKey {
            obj_key: ObjectKey {
                timeline: timelineid,
                tag: ObjectTag::RelationBuffer(BufferTag {
                    rel: RelTag {
                        spcnode,
                        dbnode,
                        relnode: 0,
                        forknum: 0u8,
                    },
                    blknum: 0,
                }),
            },
            lsn: Lsn(0),
        };
        let mut iter = self.db.raw_iterator();
        loop {
            let search_key = StorageKey {
                obj_key: ObjectKey {
                    timeline: timelineid,
                    tag: ObjectTag::RelationMetadata(search_rel_tag),
                },
                lsn: Lsn(0),
            };
            iter.seek(search_key.ser()?);
            if !iter.valid() {
                break;
            }
            let key = StorageKey::des(iter.key().unwrap())?;

            if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
                if spcnode != 0 && rel_tag.spcnode != spcnode
                    || dbnode != 0 && rel_tag.dbnode != dbnode
            if let ObjectTag::RelationBuffer(buf_tag) = key.obj_key.tag {
                if (spcnode != 0 && buf_tag.rel.spcnode != spcnode)
                    || (dbnode != 0 && buf_tag.rel.dbnode != dbnode)
                {
                    break;
                }
                if key.lsn <= lsn {
                    // visible in this snapshot
                    rels.insert(rel_tag);
                if key.lsn < lsn {
                    rels.insert(buf_tag.rel);
                }
                search_rel_tag = rel_tag;
                // skip to next relation
                // FIXME: What if relnode is u32::MAX ?
                search_rel_tag.relnode += 1;
                let mut next_tag = buf_tag.clone();
                next_tag.rel.relnode += 1; // skip to next relation
                search_key.obj_key.tag = ObjectTag::RelationBuffer(next_tag);
            } else {
                // no more relation metadata entries
                break;
            }
        }
@@ -233,10 +215,6 @@ impl ObjectStore for RocksObjectStore {
            lsn,
        }))
    }

    fn compact(&self) {
        self.db.compact_range::<&[u8], &[u8]>(None, None);
    }
}

impl RocksObjectStore {

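The relation listing above is a skip scan over an ordered key space: seek to the first possible key of a relation, read one entry to learn which relation is actually there, then bump relnode and seek again. A rough sketch of the same pattern over a sorted map, with a simplified (relnode, blknum) key standing in for the serialized StorageKey:

use std::collections::BTreeMap;

// Sketch of the seek-and-skip relation scan, over a BTreeMap instead of
// RocksDB. Keys are (relnode, blknum); we learn each distinct relnode by
// seeking to (relnode, 0), then jump straight to (relnode + 1, 0).
fn list_rels(index: &BTreeMap<(u32, u32), Vec<u8>>) -> Vec<u32> {
    let mut rels = Vec::new();
    let mut search = (0u32, 0u32);
    while let Some((&(relnode, _blknum), _value)) = index.range(search..).next() {
        rels.push(relnode);
        if relnode == u32::MAX {
            break; // cannot skip past the last possible relnode
        }
        search = (relnode + 1, 0); // skip the remaining blocks of this relation
    }
    rels
}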
@@ -2,14 +2,17 @@
//! WAL decoder. For each WAL record, it decodes the record to figure out which data blocks
//! the record affects, to add the records to the page cache.
//!
use crate::repository::*;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::CheckPoint;
use postgres_ffi::XLogLongPageHeaderData;
use postgres_ffi::XLogPageHeaderData;
use postgres_ffi::XLogRecord;
use std::cmp::min;
use std::str;
use thiserror::Error;
use zenith_utils::lsn::Lsn;

@@ -21,6 +24,9 @@ pub type MultiXactId = TransactionId;
pub type MultiXactOffset = u32;
pub type MultiXactStatus = u32;

const MAX_MBR_BLKNO: u32 =
    pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

#[allow(dead_code)]
pub struct WalStreamDecoder {
    lsn: Lsn,
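MAX_MBR_BLKNO is the last valid block of the multixact members SLRU; the wraparound loops later in this file step from it back to block 0. As a rough worked example, assuming the stock 8 KiB BLCKSZ where PostgreSQL packs 1636 member entries per page (both constant values below are assumptions, not the real pg_constants):

// Illustrative only: assumed constants, not the real pg_constants values.
const MAX_MULTIXACT_ID: u32 = 0xFFFF_FFFF;
const MULTIXACT_MEMBERS_PER_PAGE: u32 = 1636; // assumed for BLCKSZ = 8192

fn main() {
    let max_mbr_blkno = MAX_MULTIXACT_ID / MULTIXACT_MEMBERS_PER_PAGE;
    // the members SLRU is treated as a circular space 0..=max_mbr_blkno
    assert_eq!(max_mbr_blkno, 2_625_285);
}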
@@ -196,12 +202,7 @@ pub struct DecodedBkpBlock {
    //in_use: bool,

    /* Identify the block this refers to */
    pub rnode_spcnode: u32,
    pub rnode_dbnode: u32,
    pub rnode_relnode: u32,
    // Note that we have a few special forknum values for non-rel files.
    pub forknum: u8,
    pub blkno: u32,
    pub tag: ObjectTag,

    /* copy of the fork_flags field from the XLogRecordBlockHeader */
    flags: u8,
@@ -210,6 +211,7 @@ pub struct DecodedBkpBlock {
    has_image: bool,       /* has image, even for consistency checking */
    pub apply_image: bool, /* has image that should be restored */
    pub will_init: bool,   /* record doesn't need previous page version to apply */
    pub will_drop: bool,   /* record drops relation */
    //char *bkp_image;
    hole_offset: u16,
    hole_length: u16,
@@ -224,16 +226,13 @@
impl DecodedBkpBlock {
    pub fn new() -> DecodedBkpBlock {
        DecodedBkpBlock {
            rnode_spcnode: 0,
            rnode_dbnode: 0,
            rnode_relnode: 0,
            forknum: 0,
            blkno: 0,
            tag: ObjectTag::FirstTag,

            flags: 0,
            has_image: false,
            apply_image: false,
            will_init: false,
            will_drop: false,
            hole_offset: 0,
            hole_length: 0,
            bimg_len: 0,
@@ -246,7 +245,6 @@ impl DecodedBkpBlock {
}

pub struct DecodedWALRecord {
    pub xl_xid: TransactionId,
    pub xl_info: u8,
    pub xl_rmid: u8,
    pub record: Bytes, // raw XLogRecord
@@ -290,7 +288,9 @@ pub struct XlSmgrTruncate {
}

impl XlSmgrTruncate {
    pub fn decode(buf: &mut Bytes) -> XlSmgrTruncate {
    pub fn decode(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
        let mut buf = decoded.record.clone();
        buf.advance((XLOG_SIZE_OF_XLOG_RECORD + 2) as usize);
        XlSmgrTruncate {
            blkno: buf.get_u32_le(),
            rnode: RelFileNode {
@@ -313,7 +313,9 @@ pub struct XlCreateDatabase {
}

impl XlCreateDatabase {
    pub fn decode(buf: &mut Bytes) -> XlCreateDatabase {
    pub fn decode(decoded: &DecodedWALRecord) -> XlCreateDatabase {
        let mut buf = decoded.record.clone();
        buf.advance((XLOG_SIZE_OF_XLOG_RECORD + 2) as usize);
        XlCreateDatabase {
            db_id: buf.get_u32_le(),
            tablespace_id: buf.get_u32_le(),
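The `XLOG_SIZE_OF_XLOG_RECORD + 2` advance skips the fixed record header plus the short main-data header (one XLR_BLOCK_ID_DATA_SHORT marker byte and a one-byte length) to land on the record's main data. A sketch of that offset, assuming the stock 24-byte record header:

// Sketch: byte offset of a WAL record's main data when it uses the short
// main-data header. The 24-byte header size is assumed from stock PostgreSQL.
const XLOG_SIZE_OF_XLOG_RECORD: usize = 24;
const SHORT_MAIN_DATA_HEADER: usize = 2; // XLR_BLOCK_ID_DATA_SHORT byte + u8 length

fn main_data_offset() -> usize {
    XLOG_SIZE_OF_XLOG_RECORD + SHORT_MAIN_DATA_HEADER // = 26
}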
@@ -399,103 +401,6 @@ impl XlHeapUpdate {
    }
}

///
/// Note: Parsing some fields is missing, because they're not needed.
///
/// This is similar to the xl_xact_parsed_commit and
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
/// struct for commits and aborts.
///
#[derive(Debug)]
pub struct XlXactParsedRecord {
    pub xid: TransactionId,
    pub info: u8,
    pub xact_time: TimestampTz,
    pub xinfo: u32,

    pub db_id: Oid, /* MyDatabaseId */
    pub ts_id: Oid, /* MyDatabaseTableSpace */

    pub subxacts: Vec<TransactionId>,

    pub xnodes: Vec<RelFileNode>,
}

impl XlXactParsedRecord {
    /// Decode a XLOG_XACT_COMMIT/ABORT/COMMIT_PREPARED/ABORT_PREPARED
    /// record. This should agree with the ParseCommitRecord and ParseAbortRecord
    /// functions in PostgreSQL (in src/backend/access/rmgr/xactdesc.c)
    pub fn decode(buf: &mut Bytes, mut xid: TransactionId, xl_info: u8) -> XlXactParsedRecord {
        let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
        // The record starts with time of commit/abort
        let xact_time = buf.get_u64_le();
        let xinfo;
        if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
            xinfo = buf.get_u32_le();
        } else {
            xinfo = 0;
        }
        let db_id;
        let ts_id;
        if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
            db_id = buf.get_u32_le();
            ts_id = buf.get_u32_le();
        } else {
            db_id = 0;
            ts_id = 0;
        }
        let mut subxacts = Vec::<TransactionId>::new();
        if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
            let nsubxacts = buf.get_i32_le();
            for _i in 0..nsubxacts {
                let subxact = buf.get_u32_le();
                subxacts.push(subxact);
            }
        }
        let mut xnodes = Vec::<RelFileNode>::new();
        if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
            let nrels = buf.get_i32_le();
            for _i in 0..nrels {
                let spcnode = buf.get_u32_le();
                let dbnode = buf.get_u32_le();
                let relnode = buf.get_u32_le();
                trace!(
                    "XLOG_XACT_COMMIT relfilenode {}/{}/{}",
                    spcnode,
                    dbnode,
                    relnode
                );
                xnodes.push(RelFileNode {
                    spcnode,
                    dbnode,
                    relnode,
                });
            }
        }
        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
            let nmsgs = buf.get_i32_le();
            for _i in 0..nmsgs {
                let sizeof_shared_invalidation_message = 0;
                buf.advance(sizeof_shared_invalidation_message);
            }
        }
        if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
            xid = buf.get_u32_le();
            trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
        }
        XlXactParsedRecord {
            xid,
            info,
            xact_time,
            xinfo,
            db_id,
            ts_id,
            subxacts,
            xnodes,
        }
    }
}

#[repr(C)]
#[derive(Debug)]
pub struct MultiXactMember {
@@ -542,14 +447,14 @@ impl XlMultiXactCreate {
#[repr(C)]
#[derive(Debug)]
pub struct XlMultiXactTruncate {
    pub oldest_multi_db: Oid,
    oldest_multi_db: Oid,
    /* to-be-truncated range of multixact offsets */
    pub start_trunc_off: MultiXactId, /* just for completeness' sake */
    pub end_trunc_off: MultiXactId,
    start_trunc_off: MultiXactId, /* just for completeness' sake */
    end_trunc_off: MultiXactId,

    /* to-be-truncated range of multixact members */
    pub start_trunc_memb: MultiXactOffset,
    pub end_trunc_memb: MultiXactOffset,
    start_trunc_memb: MultiXactOffset,
    end_trunc_memb: MultiXactOffset,
}

impl XlMultiXactTruncate {
@@ -582,10 +487,11 @@ impl XlMultiXactTruncate {
// block data
// ...
// main data
pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
    let mut rnode_spcnode: u32 = 0;
    let mut rnode_dbnode: u32 = 0;
    let mut rnode_relnode: u32 = 0;
pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedWALRecord {
    let mut spcnode: u32 = 0;
    let mut dbnode: u32 = 0;
    let mut relnode: u32 = 0;
    let mut forknum: u8;
    let mut got_rnode = false;

    let mut buf = record.clone();
@@ -601,6 +507,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
        xlogrec.xl_info
    );

    checkpoint.update_next_xid(xlogrec.xl_xid);
    let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD;

    if buf.remaining() != remaining {
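Threading the CheckPoint through decode_wal_record lets every record advance nextXid past any XID it mentions. A minimal sketch of what such an update has to do; the real method must additionally respect XID epochs and reserved low XIDs, which are elided here:

// Sketch: keep a checkpoint-like next_xid ahead of every xid seen in the WAL.
// Real code must also handle xid epochs and skip the special xids below 3.
struct CheckPointSketch {
    next_xid: u32,
}

impl CheckPointSketch {
    fn update_next_xid(&mut self, xid: u32) {
        if xid >= self.next_xid {
            self.next_xid = xid + 1;
        }
    }
}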
@@ -659,7 +566,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
            max_block_id = block_id;

            fork_flags = buf.get_u8();
            blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
            forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
            blk.flags = fork_flags;
            blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0;
            blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0;
@@ -765,9 +672,9 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                }
            }
            if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 {
                rnode_spcnode = buf.get_u32_le();
                rnode_dbnode = buf.get_u32_le();
                rnode_relnode = buf.get_u32_le();
                spcnode = buf.get_u32_le();
                dbnode = buf.get_u32_le();
                relnode = buf.get_u32_le();
                got_rnode = true;
            } else if !got_rnode {
                // TODO
@@ -778,18 +685,16 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                goto err; */
            }

            blk.rnode_spcnode = rnode_spcnode;
            blk.rnode_dbnode = rnode_dbnode;
            blk.rnode_relnode = rnode_relnode;

            blk.blkno = buf.get_u32_le();
            trace!(
                "this record affects {}/{}/{} blk {}",
                rnode_spcnode,
                rnode_dbnode,
                rnode_relnode,
                blk.blkno
            );
            blk.tag = ObjectTag::RelationBuffer(BufferTag {
                rel: RelTag {
                    forknum,
                    spcnode,
                    dbnode,
                    relnode,
                },
                blknum: buf.get_u32_le(),
            });
            trace!("this record affects {:?}", blk.tag);

            blocks.push(blk);
        }
@@ -811,11 +716,226 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
        assert_eq!(buf.remaining(), main_data_len as usize);
    }

    // 5. Handle a few special record types that modify blocks without registering
    // them with the standard mechanism.
    if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
        let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
        let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
    //5. Handle special CLOG and XACT records
    if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
        let mut blk = DecodedBkpBlock::new();
        let blknum = buf.get_i32_le() as u32;
        blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
        let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
        if info == pg_constants::CLOG_ZEROPAGE {
            blk.will_init = true;
        } else {
            assert!(info == pg_constants::CLOG_TRUNCATE);
            blk.will_drop = true;
            checkpoint.oldestXid = buf.get_u32_le();
            checkpoint.oldestXidDB = buf.get_u32_le();
            trace!(
                "RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
                blknum,
                checkpoint.oldestXid,
                checkpoint.oldestXidDB
            );
        }
        trace!("RM_CLOG_ID updates block {}", blknum);
        blocks.push(blk);
    } else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
        let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
        if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_COMMIT_PREPARED
        {
            if info == pg_constants::XLOG_XACT_COMMIT {
                let mut blk = DecodedBkpBlock::new();
                let blknum = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
                blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
                trace!(
                    "XLOG_XACT_COMMIT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
                    xlogrec.xl_info, (xlogrec.xl_prev >> 32),
                    xlogrec.xl_prev & 0xffffffff,
                    xlogrec.xl_xid,
                    blknum,
                    main_data_len
                );
                blocks.push(blk);
            }
            //parse commit record to extract subtrans entries
            // xl_xact_commit starts with time of commit
            let _xact_time = buf.get_i64_le();

            let mut xinfo = 0;
            if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
                xinfo = buf.get_u32_le();
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
                let _dbid = buf.get_u32_le();
                let _tsid = buf.get_u32_le();
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
                let nsubxacts = buf.get_i32_le();
                let mut prev_blknum = u32::MAX;
                for _i in 0..nsubxacts {
                    let subxact = buf.get_u32_le();
                    let blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
                    if prev_blknum != blknum {
                        prev_blknum = blknum;
                        let mut blk = DecodedBkpBlock::new();
                        blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
                        blocks.push(blk);
                    }
                }
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
                let nrels = buf.get_i32_le();
                for _i in 0..nrels {
                    let spcnode = buf.get_u32_le();
                    let dbnode = buf.get_u32_le();
                    let relnode = buf.get_u32_le();
                    let mut blk = DecodedBkpBlock::new();
                    blk.tag = ObjectTag::RelationMetadata(RelTag {
                        forknum: pg_constants::MAIN_FORKNUM,
                        spcnode,
                        dbnode,
                        relnode,
                    });
                    blk.will_drop = true;
                    blocks.push(blk);
                    trace!(
                        "XLOG_XACT_COMMIT relfilenode {}/{}/{}",
                        spcnode,
                        dbnode,
                        relnode
                    );
                }
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
                let nmsgs = buf.get_i32_le();
                for _i in 0..nmsgs {
                    let sizeof_shared_invalidation_message = 0;
                    buf.advance(sizeof_shared_invalidation_message);
                }
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
                let xid = buf.get_u32_le();
                let mut blk = DecodedBkpBlock::new();
                let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
                blocks.push(blk);
                trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
                //TODO handle this to be able to restore pg_twophase on node start
            }
        } else if info == pg_constants::XLOG_XACT_ABORT
            || info == pg_constants::XLOG_XACT_ABORT_PREPARED
        {
            if info == pg_constants::XLOG_XACT_ABORT {
                let mut blk = DecodedBkpBlock::new();
                let blknum = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
                blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
                trace!(
                    "XLOG_XACT_ABORT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
                    xlogrec.xl_info, (xlogrec.xl_prev >> 32),
                    xlogrec.xl_prev & 0xffffffff,
                    xlogrec.xl_xid,
                    blknum,
                    main_data_len
                );
                blocks.push(blk);
            }
            //parse abort record to extract subtrans entries
            // xl_xact_abort starts with time of commit
            let _xact_time = buf.get_i64_le();

            let mut xinfo = 0;
            if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
                xinfo = buf.get_u32_le();
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
                let _dbid = buf.get_u32_le();
                let _tsid = buf.get_u32_le();
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
                let nsubxacts = buf.get_i32_le();
                let mut prev_blknum = u32::MAX;
                for _i in 0..nsubxacts {
                    let subxact = buf.get_u32_le();
                    let blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
                    if prev_blknum != blknum {
                        prev_blknum = blknum;
                        let mut blk = DecodedBkpBlock::new();
                        blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
                        blocks.push(blk);
                    }
                }
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
                let nrels = buf.get_i32_le();
                for _i in 0..nrels {
                    let spcnode = buf.get_u32_le();
                    let dbnode = buf.get_u32_le();
                    let relnode = buf.get_u32_le();
                    let mut blk = DecodedBkpBlock::new();
                    blk.tag = ObjectTag::RelationMetadata(RelTag {
                        forknum: pg_constants::MAIN_FORKNUM,
                        spcnode,
                        dbnode,
                        relnode,
                    });
                    blk.will_drop = true;
                    blocks.push(blk);
                    trace!(
                        "XLOG_XACT_ABORT relfilenode {}/{}/{}",
                        spcnode,
                        dbnode,
                        relnode
                    );
                }
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
                let xid = buf.get_u32_le();
                let mut blk = DecodedBkpBlock::new();
                let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
                blocks.push(blk);
                trace!("XLOG_XACT_ABORT-XACT_XINFO_HAS_TWOPHASE");
            }
        } else if info == pg_constants::XLOG_XACT_PREPARE {
            let mut blk = DecodedBkpBlock::new();
            blk.tag = ObjectTag::TwoPhase(PrepareTag {
                xid: xlogrec.xl_xid,
            });
            blk.will_init = true;
            blocks.push(blk);
            debug!("Prepare transaction {}", xlogrec.xl_xid);
        }
    } else if xlogrec.xl_rmid == pg_constants::RM_DBASE_ID {
        let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
        if info == pg_constants::XLOG_DBASE_CREATE {
            //buf points to main_data
            let db_id = buf.get_u32_le();
            let tablespace_id = buf.get_u32_le();
            let src_db_id = buf.get_u32_le();
            let src_tablespace_id = buf.get_u32_le();
            trace!(
                "XLOG_DBASE_CREATE tablespace_id/db_id {}/{} src_db_id {}/{}",
                tablespace_id,
                db_id,
                src_tablespace_id,
                src_db_id
            );
            // in postgres it is implemented as copydir
            // we need to copy all pages in page_cache
        } else {
            trace!("XLOG_DBASE_DROP is not handled yet");
        }
    } else if xlogrec.xl_rmid == pg_constants::RM_TBLSPC_ID {
        let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
        if info == pg_constants::XLOG_TBLSPC_CREATE {
            //buf points to main_data
            let ts_id = buf.get_u32_le();
            let ts_path = str::from_utf8(&buf).unwrap();
            trace!("XLOG_TBLSPC_CREATE ts_id {} ts_path {}", ts_id, ts_path);
        } else {
            trace!("XLOG_TBLSPC_DROP is not handled yet");
        }
    } else if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
        let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        if info == pg_constants::XLOG_HEAP_INSERT {
            let xlrec = XlHeapInsert::decode(&mut buf);
            if (xlrec.flags
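Each commit or abort above is routed to the CLOG block holding its XID's status: blknum = xid / CLOG_XACTS_PER_PAGE. A worked example, assuming the stock value of 32768 transactions per 8 KiB CLOG page:

// Illustrative xid -> CLOG SLRU block mapping. 8192 bytes * 4 xacts per
// byte = 32768 xacts per page; treat the constant value as assumed.
const CLOG_XACTS_PER_PAGE: u32 = 32768;

fn clog_block_for(xid: u32) -> u32 {
    xid / CLOG_XACTS_PER_PAGE
}

fn main() {
    assert_eq!(clog_block_for(100), 0);
    assert_eq!(clog_block_for(32768), 1);
    assert_eq!(clog_block_for(100_000), 3);
}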
@@ -823,52 +943,96 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                | pg_constants::XLH_INSERT_ALL_FROZEN_SET))
                != 0
            {
                let mut blk = DecodedBkpBlock::new();
                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
                blk.blkno = blkno;
                blk.rnode_spcnode = blocks[0].rnode_spcnode;
                blk.rnode_dbnode = blocks[0].rnode_dbnode;
                blk.rnode_relnode = blocks[0].rnode_relnode;
                blocks.push(blk);
                if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
                    let mut blk = DecodedBkpBlock::new();
                    blk.tag = ObjectTag::RelationBuffer(BufferTag {
                        rel: RelTag {
                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
                            spcnode: tag0.rel.spcnode,
                            dbnode: tag0.rel.dbnode,
                            relnode: tag0.rel.relnode,
                        },
                        blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
                    });
                    blocks.push(blk);
                } else {
                    panic!(
                        "Block 0 is expected to be relation buffer tag but it is {:?}",
                        blocks[0].tag
                    );
                }
            }
        } else if info == pg_constants::XLOG_HEAP_DELETE {
            let xlrec = XlHeapDelete::decode(&mut buf);
            if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
                let mut blk = DecodedBkpBlock::new();
                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
                blk.blkno = blkno;
                blk.rnode_spcnode = blocks[0].rnode_spcnode;
                blk.rnode_dbnode = blocks[0].rnode_dbnode;
                blk.rnode_relnode = blocks[0].rnode_relnode;
                blocks.push(blk);
                if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
                    let mut blk = DecodedBkpBlock::new();
                    blk.tag = ObjectTag::RelationBuffer(BufferTag {
                        rel: RelTag {
                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
                            spcnode: tag0.rel.spcnode,
                            dbnode: tag0.rel.dbnode,
                            relnode: tag0.rel.relnode,
                        },
                        blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
                    });
                    blocks.push(blk);
                } else {
                    panic!(
                        "Block 0 is expected to be relation buffer tag but it is {:?}",
                        blocks[0].tag
                    );
                }
            }
        } else if info == pg_constants::XLOG_HEAP_UPDATE
            || info == pg_constants::XLOG_HEAP_HOT_UPDATE
        {
            let xlrec = XlHeapUpdate::decode(&mut buf);
            if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
                let mut blk = DecodedBkpBlock::new();
                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
                blk.blkno = blkno;
                blk.rnode_spcnode = blocks[0].rnode_spcnode;
                blk.rnode_dbnode = blocks[0].rnode_dbnode;
                blk.rnode_relnode = blocks[0].rnode_relnode;
                blocks.push(blk);
                if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
                    let mut blk = DecodedBkpBlock::new();
                    blk.tag = ObjectTag::RelationBuffer(BufferTag {
                        rel: RelTag {
                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
                            spcnode: tag0.rel.spcnode,
                            dbnode: tag0.rel.dbnode,
                            relnode: tag0.rel.relnode,
                        },
                        blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
                    });
                    blocks.push(blk);
                } else {
                    panic!(
                        "Block 0 is expected to be relation buffer tag but it is {:?}",
                        blocks[0].tag
                    );
                }
            }
            if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
                && blocks.len() > 1
            {
                let mut blk = DecodedBkpBlock::new();
                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
                blk.blkno = blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
                blk.rnode_spcnode = blocks[1].rnode_spcnode;
                blk.rnode_dbnode = blocks[1].rnode_dbnode;
                blk.rnode_relnode = blocks[1].rnode_relnode;
                blocks.push(blk);
                if let ObjectTag::RelationBuffer(tag1) = blocks[1].tag {
                    let mut blk = DecodedBkpBlock::new();
                    blk.tag = ObjectTag::RelationBuffer(BufferTag {
                        rel: RelTag {
                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
                            spcnode: tag1.rel.spcnode,
                            dbnode: tag1.rel.dbnode,
                            relnode: tag1.rel.relnode,
                        },
                        blknum: tag1.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
                    });
                    blocks.push(blk);
                } else {
                    panic!(
                        "Block 1 is expected to be relation buffer tag but it is {:?}",
                        blocks[1].tag
                    );
                }
            }
        }
    } else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
        let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
        let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
            let xlrec = XlHeapMultiInsert::decode(&mut buf);
            if (xlrec.flags
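The blknum division maps a heap block to the visibility-map block that covers it; with two VM bits per heap block, one VM page covers HEAPBLOCKS_PER_PAGE heap blocks. A small worked example, assuming the stock value of 32672 heap blocks per VM page:

// Illustrative heap-block -> visibility-map-block mapping; the per-page
// constant is assumed (2 VM bits per heap block, ~8 KiB usable per page).
const HEAPBLOCKS_PER_PAGE: u32 = 32672;

fn vm_block_for(heap_blkno: u32) -> u32 {
    heap_blkno / HEAPBLOCKS_PER_PAGE
}

fn main() {
    assert_eq!(vm_block_for(0), 0);
    assert_eq!(vm_block_for(32671), 0);
    assert_eq!(vm_block_for(32672), 1);
}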
@@ -876,20 +1040,164 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                | pg_constants::XLH_INSERT_ALL_FROZEN_SET))
                != 0
            {
                if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
                    let mut blk = DecodedBkpBlock::new();
                    blk.tag = ObjectTag::RelationBuffer(BufferTag {
                        rel: RelTag {
                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
                            spcnode: tag0.rel.spcnode,
                            dbnode: tag0.rel.dbnode,
                            relnode: tag0.rel.relnode,
                        },
                        blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
                    });
                    blocks.push(blk);
                } else {
                    panic!(
                        "Block 0 is expected to be relation buffer tag but it is {:?}",
                        blocks[0].tag
                    );
                }
            }
        }
    } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
        let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
            let mut blk = DecodedBkpBlock::new();
            blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag {
                blknum: buf.get_u32_le(),
            });
            blk.will_init = true;
            blocks.push(blk);
        } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
            let mut blk = DecodedBkpBlock::new();
            blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag {
                blknum: buf.get_u32_le(),
            });
            blk.will_init = true;
            blocks.push(blk);
        } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
            let xlrec = XlMultiXactCreate::decode(&mut buf);
            // Update offset page
            let mut blk = DecodedBkpBlock::new();
            let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
            blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
            blocks.push(blk);
            let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
            let last_mbr_blkno =
                (xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
            // The members SLRU can, in contrast to the offsets one, be filled to almost
            // the full range at once. So we need to handle wraparound.
            let mut blknum = first_mbr_blkno;
            loop {
                // Update members page
                let mut blk = DecodedBkpBlock::new();
                let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
                blk.blkno = blkno;
                blk.rnode_spcnode = blocks[0].rnode_spcnode;
                blk.rnode_dbnode = blocks[0].rnode_dbnode;
                blk.rnode_relnode = blocks[0].rnode_relnode;
                blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
                blocks.push(blk);

                if blknum == last_mbr_blkno {
                    // last block inclusive
                    break;
                }

                // handle wraparound
                if blknum == MAX_MBR_BLKNO {
                    blknum = 0;
                } else {
                    blknum += 1;
                }
            }
            if xlrec.mid >= checkpoint.nextMulti {
                checkpoint.nextMulti = xlrec.mid + 1;
            }
            if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
                checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
            }
            let max_mbr_xid =
                xlrec.members.iter().fold(
                    0u32,
                    |acc, mbr| {
                        if mbr.xid > acc {
                            mbr.xid
                        } else {
                            acc
                        }
                    },
                );
            checkpoint.update_next_xid(max_mbr_xid);
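The create path above touches every members-SLRU block from moff through moff + nmembers - 1, wrapping past MAX_MBR_BLKNO back to zero. A compact sketch of the same inclusive, wrap-aware range walk:

// Sketch: the members-SLRU blocks touched by a multixact whose member
// offsets span [moff, moff + nmembers). Iteration is inclusive of the
// last block and wraps from max_blkno back to 0, mirroring the loop above.
fn member_blocks(moff: u32, nmembers: u32, per_page: u32, max_blkno: u32) -> Vec<u32> {
    let first = moff / per_page;
    let last = (moff + nmembers - 1) / per_page;
    let mut blocks = Vec::new();
    let mut blknum = first;
    loop {
        blocks.push(blknum);
        if blknum == last {
            break;
        }
        blknum = if blknum == max_blkno { 0 } else { blknum + 1 };
    }
    blocks
}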
        } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
            let xlrec = XlMultiXactTruncate::decode(&mut buf);
            checkpoint.oldestMulti = xlrec.end_trunc_off;
            checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
            let first_off_blkno =
                xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
            let last_off_blkno =
                xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
            // Delete all the segments but the last one. The last segment can still
            // contain, possibly partially, valid data.
            for blknum in first_off_blkno..last_off_blkno {
                let mut blk = DecodedBkpBlock::new();
                blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
                blk.will_drop = true;
                blocks.push(blk);
            }
            let first_mbr_blkno =
                xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
            let last_mbr_blkno =
                xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
            // The members SLRU can, in contrast to the offsets one, be filled to almost
            // the full range at once. So we need to handle wraparound.
            let mut blknum = first_mbr_blkno;
            // Delete all the segments but the last one. The last segment can still
            // contain, possibly partially, valid data.
            while blknum != last_mbr_blkno {
                let mut blk = DecodedBkpBlock::new();
                blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
                blk.will_drop = true;
                blocks.push(blk);
                // handle wraparound
                if blknum == MAX_MBR_BLKNO {
                    blknum = 0;
                } else {
                    blknum += 1;
                }
            }
        } else {
            panic!()
        }
    } else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
        let xlrec = XlRelmapUpdate::decode(&mut buf);
        let mut blk = DecodedBkpBlock::new();
        blk.tag = ObjectTag::FileNodeMap(DatabaseTag {
            spcnode: xlrec.tsid,
            dbnode: xlrec.dbid,
        });
        blk.will_init = true;
        blocks.push(blk);
    } else if xlogrec.xl_rmid == pg_constants::RM_XLOG_ID {
        let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        if info == pg_constants::XLOG_NEXTOID {
            let next_oid = buf.get_u32_le();
            if next_oid > checkpoint.nextOid {
                checkpoint.nextOid = next_oid;
            }
        } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
            || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
        {
            let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
            buf.copy_to_slice(&mut checkpoint_bytes);
            let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
            trace!(
                "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
                xlog_checkpoint.oldestXid, checkpoint.oldestXid
            );
            if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
                checkpoint.oldestXid = xlog_checkpoint.oldestXid;
            }
        }
    }

    DecodedWALRecord {
        xl_xid: xlogrec.xl_xid,
        xl_info: xlogrec.xl_info,
        xl_rmid: xlogrec.xl_rmid,
        record,

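The oldestXid comparison is modulo-2^32: `(a.wrapping_sub(b) as i32) < 0` holds when a logically precedes b on the wrapping XID circle, which a plain `a < b` gets wrong near wraparound. A small demonstration:

// Wraparound-aware "a precedes b" test for 32-bit xids, as used above.
fn xid_precedes(a: u32, b: u32) -> bool {
    (a.wrapping_sub(b) as i32) < 0
}

fn main() {
    assert!(xid_precedes(100, 200)); // ordinary case
    // near wraparound: 0xFFFF_FFF0 logically precedes 16
    assert!(xid_precedes(0xFFFF_FFF0, 16));
    assert!(!xid_precedes(16, 0xFFFF_FFF0));
}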
@@ -4,8 +4,8 @@
//!
//! We keep one WAL receiver active per timeline.

use crate::object_key::*;
use crate::page_cache;
use crate::repository::*;
use crate::restore_local_repo;
use crate::waldecoder::*;
use crate::PageServerConf;
@@ -190,26 +190,17 @@ fn walreceiver_main(
            waldecoder.feed_bytes(data);

            while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                // Save old checkpoint value to compare with it after decoding WAL record
                let old_checkpoint_bytes = checkpoint.encode();
                let decoded = decode_wal_record(recdata.clone());
                restore_local_repo::save_decoded_record(
                    &mut checkpoint,
                    &*timeline,
                    &decoded,
                    recdata,
                    lsn,
                )?;
                let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
                restore_local_repo::save_decoded_record(&*timeline, &decoded, recdata, lsn)?;
                last_rec_lsn = lsn;

                let new_checkpoint_bytes = checkpoint.encode();
                // Check if checkpoint data was updated by save_decoded_record
                if new_checkpoint_bytes != old_checkpoint_bytes {
                    timeline.put_page_image(
                        ObjectTag::Checkpoint,
                        lsn,
                        new_checkpoint_bytes,
                        false,
                    )?;
                }
            }

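The receiver detects checkpoint updates by serializing the CheckPoint before and after decoding, and persisting a new image at the record's LSN only when the bytes differ. A minimal sketch of that change-detection pattern; the state and store steps here are stand-ins, not the real pageserver API:

// Sketch: persist derived state only when decoding actually changed it;
// the byte compare mirrors comparing checkpoint.encode() before and after.
fn process_record(
    state: &mut Vec<u8>,
    lsn: u64,
    decode: impl FnOnce(&mut Vec<u8>),
    store_at: &mut impl FnMut(u64, &[u8]),
) {
    let before = state.clone();
    decode(state); // may update the checkpoint-like state
    if *state != before {
        store_at(lsn, state); // write a new image only on change
    }
}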
@@ -18,6 +18,7 @@
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use std::assert;
use std::cell::RefCell;
use std::fs;
use std::fs::OpenOptions;
@@ -36,10 +37,7 @@ use tokio::time::timeout;
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;

use crate::object_key::*;
use crate::repository::BufferTag;
use crate::repository::WALRecord;
use crate::waldecoder::XlXactParsedRecord;
use crate::repository::{BufferTag, ObjectTag, WALRecord};
use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
use crate::PageServerConf;
use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
@@ -173,7 +171,6 @@ impl WalRedoManager for PostgresRedoManager {
    ) -> Result<Bytes, WalRedoError> {
        // Create a channel where to receive the response
        let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();

        let request = WalRedoRequest {
            tag,
            lsn,
@@ -181,7 +178,6 @@ impl WalRedoManager for PostgresRedoManager {
            records,
            response_channel: tx,
        };

        self.request_tx
            .lock()
            .unwrap()
@@ -313,57 +309,150 @@ impl PostgresRedoManagerInternal {
            } else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
                // Transaction manager stuff
                let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
                let mut status = 0;
                let tag_blknum = match tag {
                    ObjectTag::Clog(slru) => slru.blknum,
                    ObjectTag::TwoPhase(_) => {
                        assert!(info == pg_constants::XLOG_XACT_PREPARE);
                        trace!("Apply prepare {} record", xlogrec.xl_xid);
                        page.clear();
                        page.extend_from_slice(&buf[..]);
                        continue;
                        0 // not used by XLOG_XACT_PREPARE
                    }
                    _ => panic!("Not valid XACT object tag {:?}", tag),
                };
                let parsed_xact =
                    XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
                if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
                    || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
                if info == pg_constants::XLOG_XACT_COMMIT
                    || info == pg_constants::XLOG_XACT_COMMIT_PREPARED
                {
                    transaction_id_set_status(
                        parsed_xact.xid,
                        pg_constants::TRANSACTION_STATUS_COMMITTED,
                        &mut page,
                    );
                    for subxact in &parsed_xact.subxacts {
                        let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
                        // only update xids on the requested page
                        if tag_blknum == blkno {
                            transaction_id_set_status(
                                *subxact,
                                pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
                                &mut page,
                            );
                    status = pg_constants::TRANSACTION_STATUS_COMMITTED;
                    if info == pg_constants::XLOG_XACT_COMMIT {
                        // status of 2PC transaction will be set later
                        transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
                    }
                    let _xact_time = buf.get_i64_le();
                    // decode xinfo
                    let mut xinfo = 0;
                    if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
                        xinfo = buf.get_u32_le();
                        if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
                            let _dbid = buf.get_u32_le();
                            let _tsid = buf.get_u32_le();
                        }
                    }
                } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
                    || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
                {
                    transaction_id_set_status(
                        parsed_xact.xid,
                        pg_constants::TRANSACTION_STATUS_ABORTED,
                        &mut page,
                    );
                    for subxact in &parsed_xact.subxacts {
                        let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
                        // only update xids on the requested page
                        if tag_blknum == blkno {
                            transaction_id_set_status(
                                *subxact,
                                pg_constants::TRANSACTION_STATUS_ABORTED,
                                &mut page,
                            );

                    // handle subtrans
                    if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
                        let nsubxacts = buf.get_i32_le();
                        for _i in 0..nsubxacts {
                            let subxact = buf.get_u32_le();
                            let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
                            // only update xids on the requested page

                            if tag_blknum == blkno {
                                status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
                                transaction_id_set_status(subxact, status, &mut page);
                            }
                        }
                    }
                    if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
                        // Do not need to handle dropped relations here, just need to skip them
                        if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
                            let nrels = buf.get_i32_le();
                            for _i in 0..nrels {
                                let spcnode = buf.get_u32_le();
                                let dbnode = buf.get_u32_le();
                                let relnode = buf.get_u32_le();
                                //TODO handle this too?
                                trace!(
                                    "XLOG_XACT_COMMIT relfilenode {}/{}/{}",
                                    spcnode,
                                    dbnode,
                                    relnode
                                );
                            }
                        }
                        // Skip invalidations
                        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
                            let nmsgs = buf.get_i32_le();
                            for _i in 0..nmsgs {
                                let sizeof_shared_invalidation_message = 0;
                                buf.advance(sizeof_shared_invalidation_message);
                            }
                        }
                        // Set status of 2PC transaction
                        assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
                        let xid = buf.get_u32_le();
                        transaction_id_set_status(xid, status, &mut page);
                    }
                } else if info == pg_constants::XLOG_XACT_ABORT
                    || info == pg_constants::XLOG_XACT_ABORT_PREPARED
                {
                    status = pg_constants::TRANSACTION_STATUS_ABORTED;
                    if info == pg_constants::XLOG_XACT_ABORT {
                        // status of 2PC transaction will be set later
                        transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
                    }
                    //handle subtrans
                    let _xact_time = buf.get_i64_le();
                    // decode xinfo
                    let mut xinfo = 0;
                    if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
                        xinfo = buf.get_u32_le();
                        if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
                            let _dbid = buf.get_u32_le();
                            let _tsid = buf.get_u32_le();
                        }
                    }

                    // handle subtrans
                    if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
                        let nsubxacts = buf.get_i32_le();
                        for _i in 0..nsubxacts {
                            let subxact = buf.get_u32_le();
                            let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
                            // only update xids on the requested page
                            if tag_blknum == blkno {
                                status = pg_constants::TRANSACTION_STATUS_ABORTED;
                                transaction_id_set_status(subxact, status, &mut page);
                            }
                        }
                    }
                    if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
                        // Do not need to handle dropped relations here, just need to skip them
                        if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
                            let nrels = buf.get_i32_le();
                            for _i in 0..nrels {
                                let spcnode = buf.get_u32_le();
                                let dbnode = buf.get_u32_le();
                                let relnode = buf.get_u32_le();
                                //TODO handle this too?
                                trace!(
                                    "XLOG_XACT_COMMIT relfilenode {}/{}/{}",
                                    spcnode,
                                    dbnode,
                                    relnode
                                );
                            }
                        }
                        // Skip invalidations
                        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
                            let nmsgs = buf.get_i32_le();
                            for _i in 0..nmsgs {
                                let sizeof_shared_invalidation_message = 0;
                                buf.advance(sizeof_shared_invalidation_message);
                            }
                        }
                        // Set status of 2PC transaction
                        assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
                        let xid = buf.get_u32_le();
                        transaction_id_set_status(xid, status, &mut page);
                    }
                } else if info == pg_constants::XLOG_XACT_PREPARE {
                    trace!("Apply prepare {} record", xlogrec.xl_xid);
                    page.clear();
                    page.extend_from_slice(&buf[..]);
                } else {
                    error!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
                        status,
                        record.lsn,
                        record.main_data_offset, record.rec.len());
                }
            } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
                // Multiexact operations
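transaction_id_set_status updates a 2-bit status slot inside the CLOG page: four transactions per byte, two bits each. A hedged sketch of the byte-and-shift arithmetic, with constants assumed to match stock PostgreSQL:

// Illustrative CLOG bit layout: 2 status bits per xact, 4 xacts per byte.
const CLOG_XACTS_PER_BYTE: u32 = 4;
const CLOG_XACTS_PER_PAGE: u32 = 32768;
const CLOG_BITS_PER_XACT: u32 = 2;

// `status` must be a 2-bit value (0..=3).
fn set_status(xid: u32, status: u8, page: &mut [u8]) {
    let byte_in_page = (xid % CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_BYTE;
    let shift = (xid % CLOG_XACTS_PER_BYTE) * CLOG_BITS_PER_XACT;
    let b = &mut page[byte_in_page as usize];
    *b = (*b & !(0b11 << shift)) | (status << shift);
}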
@@ -371,7 +460,7 @@ impl PostgresRedoManagerInternal {
                if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
                    || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
                {
                    // Just need to zero page
                    // Just need to ero page
                    page.copy_from_slice(&ZERO_PAGE);
                } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
                    let xlrec = XlMultiXactCreate::decode(&mut buf);

@@ -15,6 +15,7 @@ fn main() {
        // All the needed PostgreSQL headers are included from 'pg_control_ffi.h'
        //
        .header("pg_control_ffi.h")
        .header("xlog_ffi.h")
        //
        // Tell cargo to invalidate the built crate whenever any of the
        // included header files changed.

@@ -67,7 +67,6 @@ pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usi

/* mask for filtering opcodes out of xl_info */
pub const XLOG_XACT_OPMASK: u8 = 0x70;
pub const XLOG_HEAP_OPMASK: u8 = 0x70;
/* does this record have a 'xinfo' field or not */
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;

@@ -390,13 +390,6 @@ impl CheckPoint {
    }
}

//
// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
// We need this segment to start compute node.
// In order to minimize changes in Postgres core, we prefer to
// provide WAL segment from which is can extract checkpoint record in standard way,
// rather then implement some alternative mechanism.
//
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
    let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);

3  postgres_ffi/xlog_ffi.h  (new file)
@@ -0,0 +1,3 @@
#include "c.h"
#include "access/xlog_internal.h"
#include "access/xlogrecord.h"
@@ -1,97 +0,0 @@
from contextlib import closing
import psycopg2.extras

pytest_plugins = ("fixtures.zenith_fixtures")

#
# Test Garbage Collection of old page versions.
#
# This test is pretty tightly coupled with the current implementation of page version storage
# and garbage collection in object_repository.rs.
#
def test_gc(zenith_cli, pageserver, postgres, pg_bin):
    zenith_cli.run(["branch", "test_gc", "empty"])
    pg = postgres.create_start('test_gc')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            with closing(pageserver.connect()) as psconn:
                with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:

                    # Get the timeline ID of our branch. We need it for the 'do_gc' command
                    cur.execute("SHOW zenith.zenith_timeline")
                    timeline = cur.fetchone()[0]

                    # Create a test table
                    cur.execute("CREATE TABLE foo(x integer)")

                    # Run GC, to clear out any old page versions left behind in the catalogs by
                    # the CREATE TABLE command. We want to have a clean slate with no garbage
                    # before running the actual tests below, otherwise the counts won't match
                    # what we expect.
                    print("Running GC before test")
                    pscur.execute(f"do_gc {timeline} 0")
                    row = pscur.fetchone()
                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
                    # remember the number of relations
                    n_relations = row['n_relations']
                    assert n_relations > 0

                    # Insert a row. The first insert will also create a metadata entry for the
                    # relation, with size == 1 block. Hence, bump up the expected relation count.
                    n_relations += 1;
                    print("Inserting one row and running GC")
                    cur.execute("INSERT INTO foo VALUES (1)")
                    pscur.execute(f"do_gc {timeline} 0")
                    row = pscur.fetchone()
                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
                    assert row['n_relations'] == n_relations
                    assert row['dropped'] == 0
                    assert row['truncated'] == 30
                    assert row['deleted'] == 3

                    # Insert two more rows and run GC.
                    print("Inserting two more rows and running GC")
                    cur.execute("INSERT INTO foo VALUES (2)")
                    cur.execute("INSERT INTO foo VALUES (3)")

                    pscur.execute(f"do_gc {timeline} 0")
                    row = pscur.fetchone()
                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
                    assert row['n_relations'] == n_relations
                    assert row['dropped'] == 0
                    assert row['truncated'] == 30
                    assert row['deleted'] == 2

                    # Insert one more row. It creates one more page version, but doesn't affect the
                    # relation size.
                    print("Inserting one more row")
                    cur.execute("INSERT INTO foo VALUES (3)")

                    pscur.execute(f"do_gc {timeline} 0")
                    row = pscur.fetchone()
                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
                    assert row['n_relations'] == n_relations
                    assert row['dropped'] == 0
                    assert row['truncated'] == 30
                    assert row['deleted'] == 1

                    # Run GC again, with no changes in the database. Should not remove anything.
                    pscur.execute(f"do_gc {timeline} 0")
                    row = pscur.fetchone()
                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
                    assert row['n_relations'] == n_relations
                    assert row['dropped'] == 0
                    assert row['truncated'] == 30
                    assert row['deleted'] == 0

                    #
                    # Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
                    #
                    cur.execute("DROP TABLE foo")

                    pscur.execute(f"do_gc {timeline} 0")
                    row = pscur.fetchone()
                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
                    # Each relation fork is counted separately, hence 3.
                    assert row['dropped'] == 3
@@ -472,11 +472,11 @@ class WalAcceptor:

        cmd = [self.wa_binpath]
        cmd.extend(["-D", self.data_dir])
        cmd.extend(["-l", "localhost:{}".format(self.port)])
        cmd.extend(["-l", "127.0.0.1:{}".format(self.port)])
        cmd.append("--daemonize")
        cmd.append("--no-sync")
        # Tell page server it can receive WAL from this WAL safekeeper
        cmd.extend(["--pageserver", "localhost:{}".format(DEFAULT_PAGESERVER_PORT)])
        cmd.extend(["--pageserver", "127.0.0.1:{}".format(DEFAULT_PAGESERVER_PORT)])
        cmd.extend(["--recall", "1 second"])
        print('Running command "{}"'.format(' '.join(cmd)))
        subprocess.run(cmd, check=True)
@@ -543,7 +543,7 @@ class WalAcceptorFactory:

    def get_connstrs(self) -> str:
        """ Get list of wal acceptor endpoints suitable for wal_acceptors GUC """
        return ','.join(["localhost:{}".format(wa.port) for wa in self.instances])
        return ','.join(["127.0.0.1:{}".format(wa.port) for wa in self.instances])


@zenfixture

2  vendor/postgres  (vendored)
Submodule vendor/postgres updated: eb94b98d38...a08f50cba2
@@ -8,6 +8,7 @@ use log::*;
use parse_duration::parse;
use slog::Drain;
use std::io;
use std::net::ToSocketAddrs;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
@@ -73,7 +74,7 @@ fn main() -> Result<()> {
        daemonize: false,
        no_sync: false,
        pageserver_addr: None,
        listen_addr: "localhost:5454".to_string(),
        listen_addr: "127.0.0.1:5454".parse()?,
        ttl: None,
        recall_period: None,
    };
@@ -94,11 +95,17 @@ fn main() -> Result<()> {
    }

    if let Some(addr) = arg_matches.value_of("listen") {
        conf.listen_addr = addr.to_owned();
        // TODO: keep addr vector in config and listen them all
        // XXX: with our callmemaybe approach we need to set 'advertised address'
        // as it is not always possible to listen it. Another reason to ditch callmemaybe.
        let addrs: Vec<_> = addr.to_socket_addrs().unwrap().collect();
        conf.listen_addr = addrs[0];
    }

    if let Some(addr) = arg_matches.value_of("pageserver") {
        conf.pageserver_addr = Some(addr.to_owned());
        // TODO: keep addr vector in config and check them all while connecting
        let addrs: Vec<_> = addr.to_socket_addrs().unwrap().collect();
        conf.pageserver_addr = Some(addrs[0]);
    }

    if let Some(ttl) = arg_matches.value_of("ttl") {
@@ -188,19 +195,12 @@ fn init_logging(
    if conf.daemonize {
        let decorator = slog_term::PlainSyncDecorator::new(log_file);
        let drain = slog_term::CompactFormat::new(decorator).build();
        let drain = slog::Filter::new(drain, |record: &slog::Record| {
            record.level().is_at_least(slog::Level::Info)
        });
        let drain = std::sync::Mutex::new(drain).fuse();
        let logger = slog::Logger::root(drain, slog::o!());
        Ok(slog_scope::set_global_logger(logger))
    } else {
        let decorator = slog_term::TermDecorator::new().build();
        let drain = slog_term::FullFormat::new(decorator).build().fuse();
        let drain = slog::Filter::new(drain, |record: &slog::Record| {
            record.level().is_at_least(slog::Level::Info)
        })
        .fuse();
        let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse();
        let logger = slog::Logger::root(drain, slog::o!());
        Ok(slog_scope::set_global_logger(logger))
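to_socket_addrs resolves a host:port string, including hostnames, into one or more SocketAddrs; the code above takes the first result (and would panic via unwrap on failure). A self-contained example of the same resolution step with the error handled instead:

use std::net::{SocketAddr, ToSocketAddrs};

// Resolve a "host:port" string and take the first address, as main() does
// above. Returns an io::Error if resolution fails or yields nothing.
fn resolve_first(addr: &str) -> std::io::Result<SocketAddr> {
    addr.to_socket_addrs()?
        .next()
        .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "no addresses resolved"))
}

fn main() -> std::io::Result<()> {
    let addr = resolve_first("127.0.0.1:5454")?;
    assert_eq!(addr.port(), 5454);
    Ok(())
}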
@@ -1,4 +1,5 @@
//
use std::net::SocketAddr;
use std::path::PathBuf;
use std::time::Duration;

@@ -14,8 +15,8 @@ pub struct WalAcceptorConf {
    pub data_dir: PathBuf,
    pub daemonize: bool,
    pub no_sync: bool,
    pub listen_addr: String,
    pub pageserver_addr: Option<String>,
    pub listen_addr: SocketAddr,
    pub pageserver_addr: Option<SocketAddr>,
    pub ttl: Option<Duration>,
    pub recall_period: Option<Duration>,
}

@@ -4,9 +4,8 @@

use anyhow::{bail, Result};
use log::*;
use postgres::{Client, Config, NoTls};
use postgres::{Client, NoTls};
use serde::{Deserialize, Serialize};
use zenith_utils::connstring::connection_host_port;
use std::cmp::{max, min};
use std::fs::{self, File, OpenOptions};
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
@@ -150,18 +149,19 @@ pub struct ReceiveWalConn {
/// If pageserver already has replication channel, it will just ignore this request
///
fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId) {
    let ps_addr = conf.pageserver_addr.unwrap();
    let ps_connstr = format!("postgresql://no_user@{}/no_db", ps_addr);

    // use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses
    let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_addr);
    let me_conf: Config = me_connstr.parse().unwrap();
    let (host, port) = connection_host_port(&me_conf);
    let addr = conf.pageserver_addr.unwrap();
    let ps_connstr = format!(
        "host={} port={} dbname={} user={}",
        addr.ip(),
        addr.port(),
        "no_db",
        "no_user",
    );
    let callme = format!(
        "callmemaybe {} host={} port={} options='-c ztimelineid={}'",
        timelineid,
        host,
        port,
        conf.listen_addr.ip(),
        conf.listen_addr.port(),
        timelineid
    );
    loop {
@@ -241,6 +241,9 @@ impl ReceiveWalConn {
        my_info.server = server_info.clone();
        my_info.server.node_id = node_id;

        /* Need to save incompleted my_info in timeline to provide wal_seg_size for find_end_of_wal */
        self.timeline.get().set_info(&my_info);

        /* Calculate WAL end based on local data */
        let (flush_lsn, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, true);
        my_info.flush_lsn = flush_lsn;

@@ -63,11 +63,15 @@ impl ReplicationConn {
                    let feedback = HotStandbyFeedback::des(&m)?;
                    timeline.add_hs_feedback(feedback)
                }
                msg => {
                None => {
                    break;
                }
                Some(msg) => {
                    info!("unexpected message {:?}", msg);
                }
            }
        }
        Err(anyhow!("Connection closed"))
    }

    /// Helper function that parses a pair of LSNs.

@@ -232,6 +232,7 @@ impl TimelineTools for Option<Arc<Timeline>> {
    /// Find last WAL record. If "precise" is false then just locate last partial segment
    fn find_end_of_wal(&self, data_dir: &Path, precise: bool) -> (Lsn, TimeLineID) {
        let seg_size = self.get().get_info().server.wal_seg_size as usize;
        assert!(seg_size > 0);
        let (lsn, timeline) = find_end_of_wal(data_dir, seg_size, precise);
        (Lsn(lsn), timeline)
    }

@@ -16,7 +16,7 @@ use zenith_utils::postgres_backend::PostgresBackend;
/// Accept incoming TCP connections and spawn them into a background thread.
pub fn thread_main(conf: WalAcceptorConf) -> Result<()> {
    info!("Starting wal acceptor on {}", conf.listen_addr);
    let listener = TcpListener::bind(conf.listen_addr.clone()).map_err(|e| {
    let listener = TcpListener::bind(conf.listen_addr).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_addr, e);
        e
    })?;
@@ -12,7 +12,6 @@ log = "0.4.14"
serde = { version = "1.0", features = ["derive"] }
bincode = "1.3"
thiserror = "1.0"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
workspace_hack = { path = "../workspace_hack" }

[dev-dependencies]

@@ -1,34 +0,0 @@
use postgres::Config;

pub fn connection_host_port(config: &Config) -> (String, u16) {
    assert_eq!(config.get_hosts().len(), 1, "only one pair of host and port is supported in connection string");
    assert_eq!(config.get_ports().len(), 1, "only one pair of host and port is supported in connection string");
    let host = match &config.get_hosts()[0] {
        postgres::config::Host::Tcp(host) => host.as_ref(),
        postgres::config::Host::Unix(host) => host.to_str().unwrap(),
    };
    (host.to_owned(), config.get_ports()[0])
}

pub fn connection_address(config: &Config) -> String {
    let (host, port) = connection_host_port(config);
    format!("{}:{}", host, port)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_connection_host_port() {
        let config: Config = "postgresql://no_user@localhost:64000/no_db".parse().unwrap();
        assert_eq!(connection_host_port(&config), ("localhost".to_owned(), 64000));
    }

    #[test]
    #[should_panic(expected = "only one pair of host and port is supported in connection string")]
    fn test_connection_host_port_multiple_ports() {
        let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db".parse().unwrap();
        assert_eq!(connection_host_port(&config), ("localhost".to_owned(), 64000));
    }
}
@@ -13,6 +13,3 @@ pub mod bin_ser;

pub mod postgres_backend;
pub mod pq_proto;

// dealing with connstring parsing and handy access to it's parts
pub mod connstring;