code cleanup for compute_node_rebase branch

anastasia
2021-04-09 17:25:41 +03:00
parent 1b9eb9430c
commit a267dfa41f
12 changed files with 859 additions and 827 deletions


@@ -70,12 +70,9 @@ impl StorageControlPlane {
data_dir: TEST_WORKDIR.join("pageserver"),
};
pserver.init();
if froms3
{
if froms3 {
pserver.start_froms3();
}
else
{
} else {
pserver.start();
}
@@ -379,10 +376,10 @@ impl ComputeControlPlane<'_> {
node
}
// Init compute node without files, only datadir structure
// use initdb --compute-node flag and GUC 'computenode_mode'
// to distinguish the node
pub fn new_minimal_node<'a>(&mut self) -> &Arc<PostgresNode> {
// Init compute node without files, only datadir structure
// use initdb --compute-node flag and GUC 'computenode_mode'
// to distinguish the node
pub fn new_minimal_node<'a>(&mut self) -> &Arc<PostgresNode> {
// allocate new node entry with generated port
let node_id = self.nodes.len() + 1;
let node = PostgresNode {
@@ -448,9 +445,17 @@ impl ComputeControlPlane<'_> {
let pserver = storage_cplane.page_server_addr();
// Configure that node to take pages from pageserver
node.append_conf("postgresql.conf", format!("\
node.append_conf(
"postgresql.conf",
format!(
"\
page_server_connstring = 'host={} port={}'\n\
", pserver.ip(), pserver.port()).as_str());
",
pserver.ip(),
pserver.port()
)
.as_str(),
);
node.clone()
}
@@ -525,7 +530,6 @@ pub struct PostgresNode {
pg_bin_dir: PathBuf,
}
impl PostgresNode {
pub fn append_conf(&self, config: &str, opts: &str) {
OpenOptions::new()
@@ -636,80 +640,81 @@ impl PostgresNode {
// And regular query() uses prepared queries.
// TODO pass sysid as parameter
pub fn setup_compute_node(&self, sysid: u64, storage_cplane: &StorageControlPlane)
{
pub fn setup_compute_node(&self, sysid: u64, storage_cplane: &StorageControlPlane) {
let mut query;
//Request pg_control from pageserver
query = format!("file {}/global/pg_control,{},{},{},{},{},{},{}",
query = format!(
"file {}/global/pg_control,{},{},{},{},{},{},{}",
self.pgdata.to_str().unwrap(),
sysid as u64, //sysid
1664, //tablespace
0, //dboid
0, //reloid
42, //forknum pg_control
0, //blkno
0 //lsn
1664, //tablespace
0, //dboid
0, //reloid
42, //forknum pg_control
0, //blkno
0 //lsn
);
storage_cplane.page_server_psql(query.as_str());
//Request pg_xact and pg_multixact from pageserver
//We need them for initial pageserver startup and authentication
//TODO figure out which block number we really need
query = format!("file {}/pg_xact/0000,{},{},{},{},{},{},{}",
query = format!(
"file {}/pg_xact/0000,{},{},{},{},{},{},{}",
self.pgdata.to_str().unwrap(),
sysid as u64, //sysid
0, //tablespace
0, //dboid
0, //reloid
44, //forknum
0, //blkno
0 //lsn
0, //tablespace
0, //dboid
0, //reloid
44, //forknum
0, //blkno
0 //lsn
);
storage_cplane.page_server_psql(query.as_str());
query = format!("file {}/pg_multixact/offsets/0000,{},{},{},{},{},{},{}",
query = format!(
"file {}/pg_multixact/offsets/0000,{},{},{},{},{},{},{}",
self.pgdata.to_str().unwrap(),
sysid as u64, //sysid
0, //tablespace
0, //dboid
0, //reloid
45, //forknum
0, //blkno
0 //lsn
0, //tablespace
0, //dboid
0, //reloid
45, //forknum
0, //blkno
0 //lsn
);
storage_cplane.page_server_psql(query.as_str());
query = format!("file {}/pg_multixact/members/0000,{},{},{},{},{},{},{}",
query = format!(
"file {}/pg_multixact/members/0000,{},{},{},{},{},{},{}",
self.pgdata.to_str().unwrap(),
sysid as u64, //sysid
0, //tablespace
0, //dboid
0, //reloid
46, //forknum
0, //blkno
0 //lsn
0, //tablespace
0, //dboid
0, //reloid
46, //forknum
0, //blkno
0 //lsn
);
storage_cplane.page_server_psql(query.as_str());
//Request a few shared catalogs needed for authentication
//Without them we cannot setup connection with pageserver to request further pages
let reloids = [1260, 1261, 1262, 2396];
for reloid in reloids.iter()
{
for reloid in reloids.iter() {
//FIXME request all blocks from file, not just 10
for blkno in 0..10
{
query = format!("file {}/global/{},{},{},{},{},{},{},{}",
for blkno in 0..10 {
query = format!(
"file {}/global/{},{},{},{},{},{},{},{}",
self.pgdata.to_str().unwrap(),
reloid, //use it as filename
reloid, //use it as filename
sysid as u64, //sysid
1664, //tablespace
0, //dboid
reloid, //reloid
0, //forknum
blkno, //blkno
0 //lsn
1664, //tablespace
0, //dboid
reloid, //reloid
0, //forknum
blkno, //blkno
0 //lsn
);
storage_cplane.page_server_psql(query.as_str());
}
@@ -719,8 +724,15 @@ impl PostgresNode {
fs::create_dir(format!("{}/base/13007", self.pgdata.to_str().unwrap())).unwrap();
//FIXME figure out what wal file we need to successfully start
let walfilepath = format!("{}/pg_wal/000000010000000000000001", self.pgdata.to_str().unwrap());
fs::copy("/home/anastasia/zenith/zenith/tmp_check/pgdata/pg_wal/000000010000000000000001", walfilepath).unwrap();
let walfilepath = format!(
"{}/pg_wal/000000010000000000000001",
self.pgdata.to_str().unwrap()
);
fs::copy(
"/home/anastasia/zenith/zenith/tmp_check/pgdata/pg_wal/000000010000000000000001",
walfilepath,
)
.unwrap();
println!("before resetwal ");
@@ -743,7 +755,6 @@ impl PostgresNode {
}
println!("setup done");
}
pub fn start_proxy(&self, wal_acceptors: String) -> WalProposerNode {
@@ -761,8 +772,7 @@ impl PostgresNode {
}
}
pub fn push_to_s3(&self)
{
pub fn push_to_s3(&self) {
println!("Push to s3 node at '{}'", self.pgdata.to_str().unwrap());
let zenith_push_path = self.pg_bin_dir.join("zenith_push");

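The "file ..." strings assembled in setup_compute_node above are plain-text page requests sent through page_server_psql; the query handler later in this commit (page_service) splits the argument on commas into a target path, the system identifier, and the BufferTag fields. A minimal sketch of that layout, assuming the same comma-separated order used throughout this diff; the helper name is illustrative and not part of the commit:

// Sketch only: builds a request of the form
// "file <path>,<sysid>,<spcnode>,<dbnode>,<relnode>,<forknum>,<blkno>,<lsn>",
// mirroring what setup_compute_node sends above.
fn page_file_request(
    path: &str,
    sysid: u64,
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
    blkno: u32,
    lsn: u64,
) -> String {
    format!(
        "file {},{},{},{},{},{},{},{}",
        path, sysid, spcnode, dbnode, relnode, forknum, blkno, lsn
    )
}

// For example, the pg_control request above uses tablespace 1664 (global),
// forknum 42, block 0 and lsn 0.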

@@ -6,7 +6,6 @@ use std::time::Duration;
use control_plane::ComputeControlPlane;
use control_plane::StorageControlPlane;
// XXX: force all redo at the end
// -- restart + seqscan won't read deleted stuff
// -- pageserver api endpoint to check all rels
@@ -27,8 +26,14 @@ fn test_redo_cases() {
sleep(Duration::from_secs(3));
// check basic work with table
node.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)");
node.safe_psql("postgres", "INSERT INTO t SELECT generate_series(1,100), 'payload'");
node.safe_psql(
"postgres",
"CREATE TABLE t(key int primary key, value text)",
);
node.safe_psql(
"postgres",
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
);
let count: i64 = node
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()
@@ -86,8 +91,14 @@ fn test_pageserver_multitenancy() {
sleep(Duration::from_secs(3));
// check node1
node1.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)");
node1.safe_psql("postgres", "INSERT INTO t SELECT generate_series(1,100), 'payload'");
node1.safe_psql(
"postgres",
"CREATE TABLE t(key int primary key, value text)",
);
node1.safe_psql(
"postgres",
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
);
let count: i64 = node1
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()
@@ -97,8 +108,14 @@ fn test_pageserver_multitenancy() {
assert_eq!(count, 5050);
// check node2
node2.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)");
node2.safe_psql("postgres", "INSERT INTO t SELECT generate_series(100,200), 'payload'");
node2.safe_psql(
"postgres",
"CREATE TABLE t(key int primary key, value text)",
);
node2.safe_psql(
"postgres",
"INSERT INTO t SELECT generate_series(100,200), 'payload'",
);
let count: i64 = node2
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()
@@ -120,7 +137,8 @@ fn test_pageserver_multitenancy() {
// .env("S3_BUCKET", "zenith-testbucket")
// TODO use env variables in test
fn test_pageserver_recovery() {
//This test expects that the image is already uploaded to s3
//To upload it, use zenith_push before the test (see node.push_to_s3() for details)
let storage_cplane = StorageControlPlane::one_page_server(true);
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
@@ -143,7 +161,6 @@ fn test_pageserver_recovery() {
#[ignore]
//Scenario for future test. Not implemented yet
fn test_pageserver_node_switch() {
//Create pageserver
let storage_cplane = StorageControlPlane::one_page_server(false);
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
@@ -152,8 +169,14 @@ fn test_pageserver_node_switch() {
let node = compute_cplane.new_node();
node.start(&storage_cplane);
node.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)");
node.safe_psql("postgres", "INSERT INTO t SELECT generate_series(1,100), 'payload'");
node.safe_psql(
"postgres",
"CREATE TABLE t(key int primary key, value text)",
);
node.safe_psql(
"postgres",
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
);
let count: i64 = node
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()


@@ -230,7 +230,7 @@ fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard {
if record.level().is_at_least(slog::Level::Info) {
return true;
}
return true;
return false;
});
let drain = std::sync::Mutex::new(drain).fuse();
let logger = slog::Logger::root(drain, slog::o!());


@@ -1,8 +1,8 @@
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
use std::io::prelude::*;
use std::fs::File;
use std::io::prelude::*;
use std::io::SeekFrom;
use bytes::{Buf, Bytes};
@@ -11,86 +11,79 @@ use log::*;
type XLogRecPtr = u64;
#[repr(C)]
#[derive(Debug)]
#[derive(Clone)]
#[derive(Debug, Clone)]
/*
* Body of CheckPoint XLOG records. This is declared here because we keep
* a copy of the latest one in pg_control for possible disaster recovery.
* Changing this struct requires a PG_CONTROL_VERSION bump.
*/
pub struct CheckPoint {
pub redo: XLogRecPtr, /* next RecPtr available when we began to
* create CheckPoint (i.e. REDO start point) */
pub ThisTimeLineID: u32, /* current TLI */
pub PrevTimeLineID: u32, /* previous TLI, if this record begins a new
* timeline (equals ThisTimeLineID otherwise) */
pub fullPageWrites: bool, /* current full_page_writes */
pub nextXid: u64, /* next free transaction ID */
pub nextOid: u32, /* next free OID */
pub nextMulti: u32, /* next free MultiXactId */
pub nextMultiOffset: u32, /* next free MultiXact offset */
pub oldestXid: u32, /* cluster-wide minimum datfrozenxid */
pub oldestXidDB: u32, /* database with minimum datfrozenxid */
pub oldestMulti: u32, /* cluster-wide minimum datminmxid */
pub oldestMultiDB: u32, /* database with minimum datminmxid */
pub time: u64, /* time stamp of checkpoint */
pub oldestCommitTsXid: u32, /* oldest Xid with valid commit
* timestamp */
pub newestCommitTsXid: u32, /* newest Xid with valid commit
* timestamp */
pub redo: XLogRecPtr, /* next RecPtr available when we began to
* create CheckPoint (i.e. REDO start point) */
pub ThisTimeLineID: u32, /* current TLI */
pub PrevTimeLineID: u32, /* previous TLI, if this record begins a new
* timeline (equals ThisTimeLineID otherwise) */
pub fullPageWrites: bool, /* current full_page_writes */
pub nextXid: u64, /* next free transaction ID */
pub nextOid: u32, /* next free OID */
pub nextMulti: u32, /* next free MultiXactId */
pub nextMultiOffset: u32, /* next free MultiXact offset */
pub oldestXid: u32, /* cluster-wide minimum datfrozenxid */
pub oldestXidDB: u32, /* database with minimum datfrozenxid */
pub oldestMulti: u32, /* cluster-wide minimum datminmxid */
pub oldestMultiDB: u32, /* database with minimum datminmxid */
pub time: u64, /* time stamp of checkpoint */
pub oldestCommitTsXid: u32, /* oldest Xid with valid commit
* timestamp */
pub newestCommitTsXid: u32, /* newest Xid with valid commit
* timestamp */
/*
* Oldest XID still running. This is only needed to initialize hot standby
* mode from an online checkpoint, so we only bother calculating this for
* online checkpoints and only when wal_level is replica. Otherwise it's
* set to InvalidTransactionId.
*/
pub oldestActiveXid: u32,
/*
* Oldest XID still running. This is only needed to initialize hot standby
* mode from an online checkpoint, so we only bother calculating this for
* online checkpoints and only when wal_level is replica. Otherwise it's
* set to InvalidTransactionId.
*/
pub oldestActiveXid: u32,
}
#[repr(C)]
#[derive(Debug)]
#[derive(Clone)]
#[derive(Debug, Clone)]
pub struct ControlFileDataZenith {
pub system_identifier: u64,
pg_control_version: u32, /* PG_CONTROL_VERSION */
catalog_version_no: u32, /* see catversion.h */
pg_control_version: u32, /* PG_CONTROL_VERSION */
catalog_version_no: u32, /* see catversion.h */
state: i32, /* see enum above */
time: i64, /* time stamp of last pg_control update */
pub checkPoint: XLogRecPtr,
state: i32, /* see enum above */
time: i64, /* time stamp of last pg_control update */
pub checkPoint: XLogRecPtr,
checkPointCopy: CheckPoint, /* copy of last check point record */
unloggedLSN: XLogRecPtr, /* current fake LSN value, for unlogged rels */
minRecoveryPoint: XLogRecPtr,
minRecoveryPointTLI: u32,
backupStartPoint: XLogRecPtr,
backupEndPoint: XLogRecPtr,
backupEndRequired: bool
unloggedLSN: XLogRecPtr, /* current fake LSN value, for unlogged rels */
minRecoveryPoint: XLogRecPtr,
minRecoveryPointTLI: u32,
backupStartPoint: XLogRecPtr,
backupEndPoint: XLogRecPtr,
backupEndRequired: bool,
}
impl ControlFileDataZenith {
pub fn new() -> ControlFileDataZenith
{
pub fn new() -> ControlFileDataZenith {
ControlFileDataZenith {
system_identifier: 0,
system_identifier: 0,
pg_control_version: 0,
catalog_version_no: 0,
catalog_version_no: 0,
state: 0,
time: 0,
checkPoint: 0,
checkPointCopy:
{
CheckPoint
{
checkPointCopy: {
CheckPoint {
redo: 0,
ThisTimeLineID: 0,
PrevTimeLineID: 0,
fullPageWrites: false,
nextXid: 0,
nextOid:0,
nextOid: 0,
nextMulti: 0,
nextMultiOffset: 0,
oldestXid: 0,
@@ -100,109 +93,113 @@ impl ControlFileDataZenith {
time: 0,
oldestCommitTsXid: 0,
newestCommitTsXid: 0,
oldestActiveXid:0
oldestActiveXid: 0,
}
},
unloggedLSN: 0,
minRecoveryPoint: 0,
minRecoveryPointTLI: 0,
backupStartPoint: 0,
backupEndPoint: 0,
backupEndRequired: false,
unloggedLSN: 0,
minRecoveryPoint: 0,
minRecoveryPointTLI: 0,
backupStartPoint: 0,
backupEndPoint: 0,
backupEndRequired: false,
}
}
}
}
pub fn decode_pg_control(mut buf: Bytes) -> ControlFileDataZenith {
info!("decode pg_control");
let controlfile : ControlFileDataZenith = ControlFileDataZenith {
system_identifier: buf.get_u64_le(),
pg_control_version: buf.get_u32_le(),
catalog_version_no: buf.get_u32_le(),
state: buf.get_i32_le(),
time: { buf.advance(4); buf.get_i64_le() },
checkPoint: buf.get_u64_le(),
checkPointCopy:
{
CheckPoint
{
redo: buf.get_u64_le(),
ThisTimeLineID: buf.get_u32_le(),
PrevTimeLineID: buf.get_u32_le(),
fullPageWrites: buf.get_u8() != 0,
nextXid: { buf.advance(7); buf.get_u64_le()},
nextOid: buf.get_u32_le(),
nextMulti: buf.get_u32_le(),
nextMultiOffset: buf.get_u32_le(),
oldestXid:buf.get_u32_le(),
oldestXidDB: buf.get_u32_le(),
oldestMulti: buf.get_u32_le(),
oldestMultiDB: buf.get_u32_le(),
time: { buf.advance(4); buf.get_u64_le()},
oldestCommitTsXid: buf.get_u32_le(),
newestCommitTsXid: buf.get_u32_le(),
oldestActiveXid:buf.get_u32_le()
}
},
unloggedLSN: buf.get_u64_le(),
minRecoveryPoint: buf.get_u64_le(),
minRecoveryPointTLI: buf.get_u32_le(),
backupStartPoint:{ buf.advance(4); buf.get_u64_le()},
backupEndPoint: buf.get_u64_le(),
backupEndRequired: buf.get_u8() != 0,
};
let controlfile: ControlFileDataZenith = ControlFileDataZenith {
system_identifier: buf.get_u64_le(),
pg_control_version: buf.get_u32_le(),
catalog_version_no: buf.get_u32_le(),
state: buf.get_i32_le(),
time: {
buf.advance(4);
buf.get_i64_le()
},
checkPoint: buf.get_u64_le(),
checkPointCopy: {
CheckPoint {
redo: buf.get_u64_le(),
ThisTimeLineID: buf.get_u32_le(),
PrevTimeLineID: buf.get_u32_le(),
fullPageWrites: buf.get_u8() != 0,
nextXid: {
buf.advance(7);
buf.get_u64_le()
},
nextOid: buf.get_u32_le(),
nextMulti: buf.get_u32_le(),
nextMultiOffset: buf.get_u32_le(),
oldestXid: buf.get_u32_le(),
oldestXidDB: buf.get_u32_le(),
oldestMulti: buf.get_u32_le(),
oldestMultiDB: buf.get_u32_le(),
time: {
buf.advance(4);
buf.get_u64_le()
},
oldestCommitTsXid: buf.get_u32_le(),
newestCommitTsXid: buf.get_u32_le(),
oldestActiveXid: buf.get_u32_le(),
}
},
unloggedLSN: buf.get_u64_le(),
minRecoveryPoint: buf.get_u64_le(),
minRecoveryPointTLI: buf.get_u32_le(),
backupStartPoint: {
buf.advance(4);
buf.get_u64_le()
},
backupEndPoint: buf.get_u64_le(),
backupEndRequired: buf.get_u8() != 0,
};
return controlfile;
return controlfile;
}
pub fn parse_controlfile(b: Bytes)
{
pub fn parse_controlfile(b: Bytes) {
let controlfile = decode_pg_control(b);
info!("controlfile {:X}/{:X}",
controlfile.checkPoint >> 32, controlfile.checkPoint);
info!(
"controlfile {:X}/{:X}",
controlfile.checkPoint >> 32,
controlfile.checkPoint
);
info!("controlfile {:?}", controlfile);
}
const MAX_MAPPINGS: usize = 62;
#[derive(Debug)]
struct RelMapping
{
mapoid: u32, /* OID of a catalog */
mapfilenode: u32 /* its filenode number */
struct RelMapping {
mapoid: u32, /* OID of a catalog */
mapfilenode: u32, /* its filenode number */
}
#[derive(Debug)]
pub struct RelMapFile
{
magic: i32, /* always RELMAPPER_FILEMAGIC */
num_mappings: i32, /* number of valid RelMapping entries */
mappings: [u8; MAX_MAPPINGS*8],
crc: u32, /* CRC of all above */
pad: i32 /* to make the struct size be 512 exactly */
pub struct RelMapFile {
magic: i32, /* always RELMAPPER_FILEMAGIC */
num_mappings: i32, /* number of valid RelMapping entries */
mappings: [u8; MAX_MAPPINGS * 8],
crc: u32, /* CRC of all above */
pad: i32, /* to make the struct size be 512 exactly */
}
pub fn decode_filemapping(mut buf: Bytes) -> RelMapFile {
info!("decode filemap");
let file : RelMapFile = RelMapFile {
magic: buf.get_i32_le(), /* always RELMAPPER_FILEMAGIC */
num_mappings: buf.get_i32_le(), /* number of valid RelMapping entries */
mappings: {
let mut arr = [0 as u8; MAX_MAPPINGS*8];
let file: RelMapFile = RelMapFile {
magic: buf.get_i32_le(), /* always RELMAPPER_FILEMAGIC */
num_mappings: buf.get_i32_le(), /* number of valid RelMapping entries */
mappings: {
let mut arr = [0 as u8; MAX_MAPPINGS * 8];
buf.copy_to_slice(&mut arr);
arr
}
,
crc: buf.get_u32_le(), /* CRC of all above */
pad: buf.get_i32_le()
},
crc: buf.get_u32_le(), /* CRC of all above */
pad: buf.get_i32_le(),
};
info!("decode filemap {:?}", file);
@@ -210,13 +207,12 @@ pub fn decode_filemapping(mut buf: Bytes) -> RelMapFile {
}
pub fn write_buf_to_file(filepath: String, buf: Bytes, blkno: u32) {
info!("write_buf_to_file {}", filepath.clone());
let mut buffer = File::create(filepath.clone()).unwrap();
buffer.seek(SeekFrom::Start(8192*blkno as u64)).unwrap();
buffer.seek(SeekFrom::Start(8192 * blkno as u64)).unwrap();
buffer.write_all(&buf).unwrap();
info!("DONE write_buf_to_file {}", filepath);
}
}

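For orientation, decode_pg_control above consumes a bytes::Bytes buffer holding the raw pg_control image and reads each field in declaration order with the little-endian getters, skipping alignment padding with advance(). A hedged usage sketch, assuming the functions above are in scope; the path is only an example:

use bytes::Bytes;
use std::fs;

// Sketch: load pg_control from disk and log its contents via
// parse_controlfile (shown in the diff above).
fn dump_controlfile(path: &str) {
    let raw = fs::read(path).expect("read pg_control");
    parse_controlfile(Bytes::from(raw));
}

// dump_controlfile("tmp_check/pgdata/global/pg_control");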

@@ -1,12 +1,11 @@
use std::net::SocketAddr;
use std::path::PathBuf;
#[allow(dead_code)]
pub mod pg_constants;
pub mod controlfile;
pub mod page_cache;
pub mod page_service;
#[allow(dead_code)]
pub mod pg_constants;
pub mod restore_s3;
pub mod tui;
pub mod tui_event;

File diff suppressed because it is too large.


@@ -374,7 +374,6 @@ impl Connection {
self.stream.write_u32(resp.n_blocks).await?;
self.stream.write_buf(&mut resp.page.clone()).await?;
}
}
Ok(())
@@ -429,7 +428,6 @@ impl Connection {
trace!("got query {:?}", q.body);
if q.body.starts_with(b"file") {
let (_l, r) = q.body.split_at("file ".len());
//TODO parse it correctly
let r = r.to_vec();
@@ -439,23 +437,44 @@ impl Connection {
let mut s;
let filepath = split.next().unwrap();
let sysid = { s = split.next().unwrap(); s.parse::<u64>().unwrap()};
let sysid = {
s = split.next().unwrap();
s.parse::<u64>().unwrap()
};
let buf_tag = page_cache::BufferTag {
spcnode: { s = split.next().unwrap(); s.parse::<u32>().unwrap() },
dbnode: { s = split.next().unwrap(); s.parse::<u32>().unwrap() },
relnode: { s = split.next().unwrap(); s.parse::<u32>().unwrap() },
forknum: { s = split.next().unwrap(); s.parse::<u8>().unwrap() },
blknum: { s = split.next().unwrap(); s.parse::<u32>().unwrap() }
spcnode: {
s = split.next().unwrap();
s.parse::<u32>().unwrap()
},
dbnode: {
s = split.next().unwrap();
s.parse::<u32>().unwrap()
},
relnode: {
s = split.next().unwrap();
s.parse::<u32>().unwrap()
},
forknum: {
s = split.next().unwrap();
s.parse::<u8>().unwrap()
},
blknum: {
s = split.next().unwrap();
s.parse::<u32>().unwrap()
},
};
//TODO PARSE LSN
//let lsn = { s = split.next().unwrap(); s.parse::<u64>().unwrap()};
let lsn: u64 = 0;
info!("process file query sysid {} -- {:?} lsn {}",sysid, buf_tag, lsn);
self.handle_file(filepath.to_string(), sysid, buf_tag, lsn.into()).await
info!(
"process file query sysid {} -- {:?} lsn {}",
sysid, buf_tag, lsn
);
self.handle_file(filepath.to_string(), sysid, buf_tag, lsn.into())
.await
} else if q.body.starts_with(b"pagestream ") {
let (_l, r) = q.body.split_at("pagestream ".len());
let mut r = r.to_vec();
@@ -502,9 +521,13 @@ impl Connection {
}
}
async fn handle_file(&mut self, filepath: String, sysid:u64,
buf_tag: page_cache::BufferTag, lsn:u64) -> Result<()> {
async fn handle_file(
&mut self,
filepath: String,
sysid: u64,
buf_tag: page_cache::BufferTag,
lsn: u64,
) -> Result<()> {
let pcache = page_cache::get_pagecache(self.conf.clone(), sysid);
match pcache.get_page_at_lsn(buf_tag, lsn) {
@@ -512,16 +535,17 @@ impl Connection {
info!("info succeeded get_page_at_lsn: {}", lsn);
controlfile::write_buf_to_file(filepath, p, buf_tag.blknum);
},
}
Err(e) => {
info!("page not found and it's ok. get_page_at_lsn: {}", e);
}
};
self.write_message_noflush(&BeMessage::RowDescription).await?;
self.write_message_noflush(&BeMessage::RowDescription)
.await?;
self.write_message_noflush(&BeMessage::DataRow).await?;
self.write_message_noflush(&BeMessage::CommandComplete).await?;
self.write_message_noflush(&BeMessage::CommandComplete)
.await?;
self.write_message(&BeMessage::ReadyForQuery).await
}
@@ -588,7 +612,7 @@ impl Connection {
let n_blocks = pcache.relsize_get(&tag);
info!("ZenithNblocksRequest {:?} = {}", tag, n_blocks);
trace!("ZenithNblocksRequest {:?} = {}", tag, n_blocks);
self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse {
ok: true,
n_blocks: n_blocks,
@@ -608,26 +632,23 @@ impl Connection {
Ok(p) => {
let mut b = BytesMut::with_capacity(8192);
info!("ZenithReadResponse get_page_at_lsn succeed");
if p.len() < 8192
{
trace!("ZenithReadResponse get_page_at_lsn succeed");
if p.len() < 8192 {
//add padding
info!("ZenithReadResponse add padding");
trace!("ZenithReadResponse add padding");
let padding: [u8; 8192 - 512] = [0; 8192 - 512];
b.extend_from_slice(&p);
b.extend_from_slice(&padding);
}
else
{
} else {
b.extend_from_slice(&p);
}
BeMessage::ZenithReadResponse(ZenithReadResponse {
ok: true,
n_blocks: 0,
page: b.freeze()
page: b.freeze(),
})
},
}
Err(e) => {
const ZERO_PAGE: [u8; 8192] = [0; 8192];
error!("get_page_at_lsn: {}", e);
@@ -648,7 +669,7 @@ impl Connection {
relnode: req.relnode,
forknum: req.forknum,
};
info!("ZenithCreateRequest {:?}", tag);
trace!("ZenithCreateRequest {:?}", tag);
pcache.relsize_inc(&tag, None);
@@ -666,7 +687,7 @@ impl Connection {
forknum: req.forknum,
};
info!("ZenithExtendRequest {:?} to {}", tag, req.blkno);
trace!("ZenithExtendRequest {:?} to {}", tag, req.blkno);
pcache.relsize_inc(&tag, Some(req.blkno));

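The padding branch in the ZenithReadResponse path above appears to target the 512-byte pg_filenode.map pages that slurp_base_file (later in this commit) stores in the page cache: appending 8192 - 512 = 7680 zero bytes turns such a short page back into a full 8192-byte response.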

@@ -1,13 +1,11 @@
// From pg_tablespace_d.h
//
// FIXME: we'll probably need these elsewhere too, move to some common location
pub const DEFAULTTABLESPACE_OID:u32 = 1663;
pub const GLOBALTABLESPACE_OID:u32 = 1664;
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
pub const GLOBALTABLESPACE_OID: u32 = 1664;
//Special values for non-rel files' tags
//TODO maybe use enum?
pub const PG_CONTROLFILE_FORKNUM:u32 = 42;
pub const PG_FILENODEMAP_FORKNUM:u32 = 43;
pub const PG_XACT_FORKNUM:u32 = 44;
pub const PG_MXACT_OFFSETS_FORKNUM:u32 = 45;
pub const PG_MXACT_MEMBERS_FORKNUM:u32 = 46;
pub const PG_CONTROLFILE_FORKNUM: u32 = 42;
pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
pub const PG_XACT_FORKNUM: u32 = 44;
pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;

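These forknum values sit outside PostgreSQL's real fork numbers (0..3), which lets non-relation files reuse the ordinary page keying. As a hedged sketch using the BufferTag fields visible elsewhere in this commit, pg_control would be tagged the same way setup_compute_node requests it, with tablespace 1664 and forknum 42:

// Sketch only: keying a non-relation file (pg_control) in the page cache.
let pg_control_tag = page_cache::BufferTag {
    spcnode: pg_constants::GLOBALTABLESPACE_OID,         // 1664
    dbnode: 0,
    relnode: 0,
    forknum: pg_constants::PG_CONTROLFILE_FORKNUM as u8, // 42
    blknum: 0,
};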

@@ -22,7 +22,7 @@ use tokio::runtime;
use futures::future;
use crate::{PageServerConf, page_cache, pg_constants, controlfile};
use crate::{controlfile, page_cache, pg_constants, PageServerConf};
struct Storage {
region: Region,
@@ -86,7 +86,12 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
//Before uploading other files, slurp pg_control to set systemid
let control_results: Vec<s3::serde_types::ListBucketResult> = bucket.list("relationdata/global/pg_control".to_string(), Some("".to_string())).await?;
let control_results: Vec<s3::serde_types::ListBucketResult> = bucket
.list(
"relationdata/global/pg_control".to_string(),
Some("".to_string()),
)
.await?;
let object = &(&control_results[0]).contents[0];
let (data, _) = bucket.get_object(&object.key).await.unwrap();
let bytes = BytesMut::from(data.as_slice()).freeze();
@@ -131,10 +136,11 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
}
//Now add nonrelation files
let nonrelresults: Vec<s3::serde_types::ListBucketResult> = bucket.list("nonreldata/".to_string(), Some("".to_string())).await?;
let nonrelresults: Vec<s3::serde_types::ListBucketResult> = bucket
.list("nonreldata/".to_string(), Some("".to_string()))
.await?;
for result in nonrelresults {
for object in result.contents {
// Download needed non relation files, slurping them into memory
let key = object.key;
@@ -150,7 +156,9 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
slurp_futures.push(f);
}
Err(e) => { warn!("unrecognized file: {} ({})", relpath, e); }
Err(e) => {
warn!("unrecognized file: {} ({})", relpath, e);
}
};
}
}
@@ -160,12 +168,14 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
info!("{} files to restore...", slurp_futures.len());
future::join_all(slurp_futures).await;
info!("restored! {:?} to {:?}", pcache.first_valid_lsn, pcache.last_valid_lsn);
info!(
"restored! {:?} to {:?}",
pcache.first_valid_lsn, pcache.last_valid_lsn
);
Ok(())
}
#[derive(Debug)]
struct FilePathError {
msg: String,
@@ -215,10 +225,8 @@ struct ParsedBaseImageFileName {
pub lsn: u64,
}
fn parse_lsn_from_filename(fname: &str) -> Result<u64, FilePathError>
{
let (_, lsn_str) = fname.split_at(fname.len()-16);
fn parse_lsn_from_filename(fname: &str) -> Result<u64, FilePathError> {
let (_, lsn_str) = fname.split_at(fname.len() - 16);
let (lsnhi, lsnlo) = lsn_str.split_at(8);
let lsn_hi = u64::from_str_radix(lsnhi, 16)?;
@@ -267,10 +275,8 @@ fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
}
fn parse_nonrel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
//TODO parse segno from xact filenames too
if let Some(fname) = path.strip_prefix("pg_xact/") {
let lsn = parse_lsn_from_filename(fname.clone())?;
return Ok(ParsedBaseImageFileName {
@@ -279,11 +285,9 @@ fn parse_nonrel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePat
relnode: 0,
forknum: pg_constants::PG_XACT_FORKNUM,
segno: 0,
lsn
lsn,
});
}
else if let Some(fname) = path.strip_prefix("pg_multixact/offsets") {
} else if let Some(fname) = path.strip_prefix("pg_multixact/offsets") {
let lsn = parse_lsn_from_filename(fname.clone())?;
return Ok(ParsedBaseImageFileName {
@@ -292,11 +296,9 @@ fn parse_nonrel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePat
relnode: 0,
forknum: pg_constants::PG_MXACT_OFFSETS_FORKNUM,
segno: 0,
lsn
lsn,
});
}
else if let Some(fname) = path.strip_prefix("pg_multixact/members") {
} else if let Some(fname) = path.strip_prefix("pg_multixact/members") {
let lsn = parse_lsn_from_filename(fname.clone())?;
return Ok(ParsedBaseImageFileName {
@@ -305,14 +307,11 @@ fn parse_nonrel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePat
relnode: 0,
forknum: pg_constants::PG_MXACT_MEMBERS_FORKNUM,
segno: 0,
lsn
lsn,
});
}
else {
} else {
return Err(FilePathError::new("invalid non relation data file name"));
}
}
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
@@ -334,9 +333,7 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
* <oid>.<segment number>
*/
if let Some(fname) = path.strip_prefix("global/") {
if fname.contains("pg_control")
{
if fname.contains("pg_control") {
let lsn = parse_lsn_from_filename(fname.clone())?;
return Ok(ParsedBaseImageFileName {
@@ -345,12 +342,11 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
relnode: 0,
forknum: pg_constants::PG_CONTROLFILE_FORKNUM,
segno: 0,
lsn
lsn,
});
}
if fname.contains("pg_filenode")
{
if fname.contains("pg_filenode") {
let lsn = parse_lsn_from_filename(fname.clone())?;
return Ok(ParsedBaseImageFileName {
@@ -359,7 +355,7 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
relnode: 0,
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
segno: 0,
lsn
lsn,
});
}
@@ -386,8 +382,7 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
return Err(FilePathError::new("invalid relation data file name"));
};
if fname.contains("pg_filenode")
{
if fname.contains("pg_filenode") {
let lsn = parse_lsn_from_filename(fname.clone())?;
return Ok(ParsedBaseImageFileName {
@@ -396,11 +391,10 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
relnode: 0,
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
segno: 0,
lsn
lsn,
});
}
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
return Ok(ParsedBaseImageFileName {
@@ -441,18 +435,16 @@ async fn slurp_base_file(
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
// pg_filenode.map has non-standard size - 512 bytes
if parsed.forknum == pg_constants::PG_FILENODEMAP_FORKNUM
{
if parsed.forknum == pg_constants::PG_FILENODEMAP_FORKNUM {
let b = bytes.clone();
controlfile::decode_filemapping(b);
while bytes.remaining() >= 512 {
let tag = page_cache::BufferTag {
spcnode: parsed.spcnode,
dbnode: parsed.dbnode,
relnode: parsed.relnode,
forknum: parsed.forknum as u8,
blknum: 0
blknum: 0,
};
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(512));
@@ -466,11 +458,9 @@ async fn slurp_base_file(
};
pcache.relsize_inc(&tag, Some(0));
}
else
{
} else {
// FIXME: use constants (BLCKSZ)
let mut blknum: u32 = parsed.segno * (1024*1024*1024 / 8192);
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
let reltag = page_cache::RelTag {
spcnode: parsed.spcnode,
dbnode: parsed.dbnode,
@@ -479,13 +469,12 @@ async fn slurp_base_file(
};
while bytes.remaining() >= 8192 {
let tag = page_cache::BufferTag {
spcnode: parsed.spcnode,
dbnode: parsed.dbnode,
relnode: parsed.relnode,
forknum: parsed.forknum as u8,
blknum: blknum
blknum: blknum,
};
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));

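The starting block computed in slurp_base_file follows PostgreSQL's 1 GB relation segmentation: with 8192-byte blocks, each segment holds 1024 * 1024 * 1024 / 8192 = 131072 blocks, so a segment file named <oid>.2, for example, begins at block 2 * 131072 = 262144.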

@@ -238,7 +238,7 @@ const BLCKSZ: u16 = 8192;
//
const XLR_INFO_MASK: u8 = 0x0F;
const XLR_MAX_BLOCK_ID:u8 = 32;
const XLR_MAX_BLOCK_ID: u8 = 32;
const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
@@ -260,8 +260,8 @@ const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay
//
// constants from clog.h
//
const CLOG_XACTS_PER_BYTE:u32 = 4;
const CLOG_XACTS_PER_PAGE:u32 = 8192 * CLOG_XACTS_PER_BYTE;
const CLOG_XACTS_PER_BYTE: u32 = 4;
const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
pub struct DecodedBkpBlock {
/* Is this block ref in use? */
@@ -307,7 +307,7 @@ pub struct DecodedWALRecord {
const XLOG_SWITCH: u8 = 0x40;
const RM_XLOG_ID: u8 = 0;
const RM_XACT_ID:u8 = 1;
const RM_XACT_ID: u8 = 1;
// const RM_CLOG_ID:u8 = 3;
//const RM_MULTIXACT_ID:u8 = 6;
@@ -327,7 +327,6 @@ const XLOG_XACT_OPMASK: u8 = 0x70;
/* does this record have a 'xinfo' field or not */
// const XLOG_XACT_HAS_INFO: u8 = 0x80;
// Is this record an XLOG_SWITCH record? They need some special processing,
// so we need to check for that before the rest of the parsing.
//
@@ -369,9 +368,9 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord {
buf.advance(2); // 2 bytes of padding
let _xl_crc = buf.get_u32_le();
info!("decode_wal_record xl_rmid = {}" , xl_rmid);
info!("decode_wal_record xl_rmid = {}", xl_rmid);
let rminfo: u8 = xl_info & !XLR_INFO_MASK;
let rminfo: u8 = xl_info & !XLR_INFO_MASK;
let remaining = xl_tot_len - SizeOfXLogRecord;
@@ -384,15 +383,15 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord {
let mut rnode_relnode: u32 = 0;
let mut got_rnode = false;
if xl_rmid == RM_XACT_ID &&
((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT ||
(rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED)
if xl_rmid == RM_XACT_ID
&& ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT
|| (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED)
{
info!("decode_wal_record RM_XACT_ID - XLOG_XACT_COMMIT");
let mut blocks: Vec<DecodedBkpBlock> = Vec::new();
let blkno = xl_xid/CLOG_XACTS_PER_PAGE;
let blkno = xl_xid / CLOG_XACTS_PER_PAGE;
let mut blk = DecodedBkpBlock {
rnode_spcnode: 0,
@@ -411,21 +410,24 @@ pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord {
bimg_info: 0,
has_data: true,
data_len: 0
data_len: 0,
};
let fork_flags = buf.get_u8();
blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0;
blk.data_len = buf.get_u16_le();
info!("decode_wal_record RM_XACT_ID blk has data with data_len {}", blk.data_len);
info!(
"decode_wal_record RM_XACT_ID blk has data with data_len {}",
blk.data_len
);
blocks.push(blk);
return DecodedWALRecord {
lsn: lsn,
record: rec,
blocks: blocks
}
blocks: blocks,
};
}
// Decode the headers

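The clog block chosen for commit records above follows from the constants near the top of this file: CLOG_XACTS_PER_PAGE = 8192 * CLOG_XACTS_PER_BYTE = 8192 * 4 = 32768 (two status bits per transaction), so a commit with xl_xid = 100000, for example, lands on clog block 100000 / 32768 = 3.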

@@ -160,9 +160,11 @@ impl WalRedoProcess {
.expect("failed to execute initdb");
if !initdb.status.success() {
panic!("initdb failed: {}\nstderr:\n{}",
std::str::from_utf8(&initdb.stdout).unwrap(),
std::str::from_utf8(&initdb.stderr).unwrap());
panic!(
"initdb failed: {}\nstderr:\n{}",
std::str::from_utf8(&initdb.stdout).unwrap(),
std::str::from_utf8(&initdb.stderr).unwrap()
);
}
// Start postgres itself