mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-27 07:10:37 +00:00
Compare commits
3 Commits
createdb_h
...
compute_no
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a267dfa41f | ||
|
|
1b9eb9430c | ||
|
|
9a4fbf365c |
6
.github/workflows/testing.yml
vendored
6
.github/workflows/testing.yml
vendored
@@ -4,7 +4,6 @@ on: [push]
|
||||
|
||||
jobs:
|
||||
regression-check:
|
||||
timeout-minutes: 30
|
||||
name: run regression test suite
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -77,7 +76,10 @@ jobs:
|
||||
target
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
|
||||
|
||||
- name: Build
|
||||
# That build is only to build dependencies and can be skipped if Cargo.lock
|
||||
# wasn't changed. Next steps need their own build
|
||||
- name: Install cargo deps
|
||||
if: steps.cache_cargo.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cargo build
|
||||
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,3 @@
|
||||
/target
|
||||
/tmp_check
|
||||
/tmp_install
|
||||
/tmp_check_cli
|
||||
.vscode
|
||||
|
||||
539
Cargo.lock
generated
539
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -3,8 +3,4 @@ members = [
|
||||
"integration_tests",
|
||||
"pageserver",
|
||||
"walkeeper",
|
||||
"zenith",
|
||||
"control_plane",
|
||||
"postgres_ffi",
|
||||
"zenith_utils",
|
||||
]
|
||||
|
||||
48
README.md
48
README.md
@@ -2,54 +2,6 @@
|
||||
|
||||
Zenith substitutes PostgreSQL storage layer and redistributes data across a cluster of nodes
|
||||
|
||||
## Running local installation
|
||||
|
||||
1. Build zenith and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/libzenith/zenith.git
|
||||
cd zenith
|
||||
./pgbuild.sh # builds postgres and installs it to ./tmp_install
|
||||
cargo build
|
||||
```
|
||||
|
||||
2. Start pageserver and postgres on top of it (should be called from repo root):
|
||||
```sh
|
||||
# Create ~/.zenith with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
>./target/debug/zenith init
|
||||
|
||||
# start pageserver
|
||||
> ./target/debug/zenith pageserver start
|
||||
Starting pageserver at '127.0.0.1:64000'
|
||||
|
||||
# create and configure postgres data dir
|
||||
> ./target/debug/zenith pg create
|
||||
Creating new postgres: path=/Users/user/code/zenith/tmp_check_cli/compute/pg1 port=55432
|
||||
Database initialized
|
||||
|
||||
# start it
|
||||
> ./target/debug/zenith pg start pg1
|
||||
|
||||
# look up status and connection info
|
||||
> ./target/debug/zenith pg list
|
||||
NODE ADDRESS STATUS
|
||||
pg1 127.0.0.1:55432 running
|
||||
```
|
||||
|
||||
3. Now it is possible to connect to postgres and run some queries:
|
||||
```
|
||||
> psql -p55432 -h 127.0.0.1 postgres
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
CREATE TABLE
|
||||
postgres=# insert into t values(1,1);
|
||||
INSERT 0 1
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
```sh
|
||||
|
||||
188
cli-v2-story.md
188
cli-v2-story.md
@@ -1,188 +0,0 @@
|
||||
Create a new Zenith repository in the current directory:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
|
||||
The files belonging to this database system will be owned by user "heikki".
|
||||
This user must also own the server process.
|
||||
|
||||
The database cluster will be initialized with locale "en_GB.UTF-8".
|
||||
The default database encoding has accordingly been set to "UTF8".
|
||||
The default text search configuration will be set to "english".
|
||||
|
||||
Data page checksums are disabled.
|
||||
|
||||
creating directory tmp ... ok
|
||||
creating subdirectories ... ok
|
||||
selecting dynamic shared memory implementation ... posix
|
||||
selecting default max_connections ... 100
|
||||
selecting default shared_buffers ... 128MB
|
||||
selecting default time zone ... Europe/Helsinki
|
||||
creating configuration files ... ok
|
||||
running bootstrap script ... ok
|
||||
performing post-bootstrap initialization ... ok
|
||||
syncing data to disk ... ok
|
||||
|
||||
initdb: warning: enabling "trust" authentication for local connections
|
||||
You can change this by editing pg_hba.conf or using the option -A, or
|
||||
--auth-local and --auth-host, the next time you run initdb.
|
||||
new zenith repository was created in .zenith
|
||||
|
||||
Initially, there is only one branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
|
||||
main
|
||||
|
||||
Start a local Postgres instance on the branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432
|
||||
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432
|
||||
2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432"
|
||||
2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status"
|
||||
2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0
|
||||
2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required
|
||||
2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
|
||||
Run some commands against it:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);"
|
||||
CREATE TABLE
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');"
|
||||
INSERT 0 1
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
Create a new branch called 'experimental'. We create it from the
|
||||
current end of the 'main' branch, but you could specify a different
|
||||
LSN as the start point instead.
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
|
||||
branching at end of WAL: 0/161F478
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
|
||||
experimental
|
||||
main
|
||||
|
||||
Start another Postgres instance off the 'experimental' branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433
|
||||
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433
|
||||
2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
|
||||
2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80
|
||||
2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0
|
||||
2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
|
||||
2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
|
||||
Insert a row on the 'experimental' branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')"
|
||||
INSERT 0 1
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
inserted on experimental
|
||||
(2 rows)
|
||||
|
||||
See that the other Postgres instance is still running on 'main' branch on port 5432:
|
||||
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
|
||||
|
||||
|
||||
Everything is stored in the .zenith directory:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
|
||||
total 12
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines
|
||||
|
||||
The 'datadirs' directory contains the datadirs of the running instances:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
|
||||
total 8
|
||||
drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
|
||||
drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
|
||||
total 124
|
||||
drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem
|
||||
-rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf
|
||||
-rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase
|
||||
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION
|
||||
lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact
|
||||
-rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf
|
||||
-rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
|
||||
-rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts
|
||||
-rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid
|
||||
|
||||
Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
|
||||
datadir is ephemeral, you can delete it at any time, and it can be reconstructed
|
||||
from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
|
||||
the repository, the 'datadirs' are not included. (They are like git working trees)
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
|
||||
~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433
|
||||
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433
|
||||
2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
|
||||
2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80
|
||||
2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0
|
||||
2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
|
||||
2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
inserted on experimental
|
||||
(2 rows)
|
||||
|
||||
1
control_plane/.gitignore
vendored
1
control_plane/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
tmp_check/
|
||||
@@ -1,27 +0,0 @@
|
||||
[package]
|
||||
name = "control_plane"
|
||||
version = "0.1.0"
|
||||
authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
rand = "0.8.3"
|
||||
tar = "0.4.33"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
|
||||
serde = ""
|
||||
serde_derive = ""
|
||||
toml = ""
|
||||
lazy_static = ""
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
hex = "0.4.3"
|
||||
bytes = "1.0.1"
|
||||
fs_extra = "1.2.0"
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
@@ -1,537 +0,0 @@
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Read, Write};
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::storage::{PageServerNode, WalProposerNode};
|
||||
use pageserver::{zenith_repo_dir, ZTimelineId};
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
//
|
||||
pub struct ComputeControlPlane {
|
||||
base_port: u16,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
pub nodes: BTreeMap<String, Arc<PostgresNode>>,
|
||||
env: LocalEnv,
|
||||
}
|
||||
|
||||
impl ComputeControlPlane {
|
||||
// Load current nodes with ports from data directories on disk
|
||||
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
|
||||
// TODO: since pageserver do not have config file yet we believe here that
|
||||
// it is running on default port. Change that when pageserver will have config.
|
||||
let pageserver = Arc::new(PageServerNode::from_env(&env));
|
||||
|
||||
let pgdatadirspath = env.repo_path.join("pgdatadirs");
|
||||
let nodes: Result<BTreeMap<_, _>> = fs::read_dir(&pgdatadirspath)
|
||||
.with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
|
||||
.into_iter()
|
||||
.map(|f| {
|
||||
PostgresNode::from_dir_entry(f?, &env, &pageserver)
|
||||
.map(|node| (node.name.clone(), Arc::new(node)))
|
||||
})
|
||||
.collect();
|
||||
let nodes = nodes?;
|
||||
|
||||
Ok(ComputeControlPlane {
|
||||
base_port: 55431,
|
||||
pageserver,
|
||||
nodes,
|
||||
env,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_port(&mut self) -> u16 {
|
||||
1 + self
|
||||
.nodes
|
||||
.iter()
|
||||
.map(|(_name, node)| node.address.port())
|
||||
.max()
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
|
||||
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
base_port: 65431,
|
||||
pageserver: Arc::clone(pageserver),
|
||||
nodes: BTreeMap::new(),
|
||||
env: local_env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Connect to a page server, get base backup, and untar it to initialize a
|
||||
/// new data directory
|
||||
pub fn new_from_page_server(
|
||||
&mut self,
|
||||
is_test: bool,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
let node_id = self.nodes.len() as u32 + 1;
|
||||
|
||||
let node = Arc::new(PostgresNode {
|
||||
name: format!("pg{}", node_id),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
is_test,
|
||||
timelineid,
|
||||
});
|
||||
|
||||
node.init_from_page_server()?;
|
||||
self.nodes.insert(node.name.clone(), Arc::clone(&node));
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
/// Create a test node on `timelineid` and point its `callmemaybe_connstring`
/// setting at the node's own connection string.
///
/// # Panics
/// Panics (reporting the underlying error) if the node cannot be created, or
/// if `postgresql.conf` cannot be appended to.
pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
    // `expect` keeps the underlying error in the panic message; a bare
    // `assert!(node.is_ok())` would discard it.
    let node = self
        .new_from_page_server(true, timelineid)
        .expect("failed to create test node from page server");

    // Configure the node to stream WAL directly to the pageserver
    let conf = format!(
        "callmemaybe_connstring = '{}'\n", // FIXME escaping
        node.connstr()
    );
    node.append_conf("postgresql.conf", &conf);

    node
}
|
||||
|
||||
/// Create a test node on `timelineid` whose `postgresql.conf` additionally
/// sets `synchronous_standby_names` to `safekeeper_proxy`.
///
/// # Panics
/// Panics if the node cannot be created or its config cannot be appended to.
pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
    let sync_standby_conf = "synchronous_standby_names = 'safekeeper_proxy'\n";

    let master = self.new_from_page_server(true, timelineid).unwrap();
    master.append_conf("postgresql.conf", sync_standby_conf);
    master
}
|
||||
|
||||
/// Create a regular (non-test) node on `timelineid` and point its
/// `callmemaybe_connstring` setting at the node's own connection string.
///
/// # Errors
/// Returns the error from `new_from_page_server` instead of panicking on it.
///
/// # Panics
/// Still panics if `postgresql.conf` cannot be appended to, because
/// `append_conf` does not report errors.
pub fn new_node(&mut self, timelineid: ZTimelineId) -> Result<Arc<PostgresNode>> {
    let node = self.new_from_page_server(false, timelineid)?;

    // Configure the node to stream WAL directly to the pageserver
    let conf = format!(
        "callmemaybe_connstring = '{}'\n", // FIXME escaping
        node.connstr()
    );
    node.append_conf("postgresql.conf", &conf);

    Ok(node)
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct PostgresNode {
|
||||
pub address: SocketAddr,
|
||||
name: String,
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
timelineid: ZTimelineId,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
fn from_dir_entry(
|
||||
entry: std::fs::DirEntry,
|
||||
env: &LocalEnv,
|
||||
pageserver: &Arc<PageServerNode>,
|
||||
) -> Result<PostgresNode> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
anyhow::bail!(
|
||||
"PostgresNode::from_dir_entry failed: '{}' is not a directory",
|
||||
entry.path().display()
|
||||
);
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
|
||||
}
|
||||
|
||||
// parse data directory name
|
||||
let fname = entry.file_name();
|
||||
let name = fname.to_str().unwrap().to_string();
|
||||
|
||||
// find out tcp port in config file
|
||||
let cfg_path = entry.path().join("postgresql.conf");
|
||||
let config = fs::read_to_string(cfg_path.clone()).with_context(|| {
|
||||
format!(
|
||||
"failed to read config file in {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
)
|
||||
})?;
|
||||
|
||||
let err_msg = format!(
|
||||
"failed to find port definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let port: u16 = CONF_PORT_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// FIXME: What timeline is this server on? Would have to parse the postgresql.conf
|
||||
// file for that, too. It's currently not needed for anything, but it would be
|
||||
// nice to list the timeline in "zenith pg list"
|
||||
let timelineid_buf = [0u8; 16];
|
||||
let timelineid = ZTimelineId::from(timelineid_buf);
|
||||
|
||||
// ok now
|
||||
Ok(PostgresNode {
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
name,
|
||||
env: env.clone(),
|
||||
pageserver: Arc::clone(pageserver),
|
||||
is_test: false,
|
||||
timelineid,
|
||||
})
|
||||
}
|
||||
|
||||
// Connect to a page server, get base backup, and untar it to initialize a
|
||||
// new data directory
|
||||
pub fn init_from_page_server(&self) -> Result<()> {
|
||||
let pgdata = self.pgdata();
|
||||
|
||||
println!(
|
||||
"Extracting base backup to create postgres instance: path={} port={}",
|
||||
pgdata.display(),
|
||||
self.address.port()
|
||||
);
|
||||
|
||||
// initialize data directory
|
||||
if self.is_test {
|
||||
fs::remove_dir_all(&pgdata).ok();
|
||||
}
|
||||
|
||||
let sql = format!("basebackup {}", self.timelineid);
|
||||
let mut client = self
|
||||
.pageserver
|
||||
.page_server_psql_client()
|
||||
.with_context(|| "connecting to page server failed")?;
|
||||
|
||||
fs::create_dir_all(&pgdata)
|
||||
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
|
||||
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"could not set permissions in data directory {}",
|
||||
pgdata.display()
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
// FIXME: The compute node should be able to stream the WAL it needs from the WAL safekeepers or archive.
|
||||
// But that's not implemented yet. For now, 'pg_wal' is included in the base backup tarball that
|
||||
// we receive from the Page Server, so we don't need to create the empty 'pg_wal' directory here.
|
||||
//fs::create_dir_all(pgdata.join("pg_wal"))?;
|
||||
|
||||
let mut copyreader = client
|
||||
.copy_out(sql.as_str())
|
||||
.with_context(|| "page server 'basebackup' command failed")?;
|
||||
|
||||
// FIXME: Currently, we slurp the whole tarball into memory, and then extract it,
|
||||
// but we really should do this:
|
||||
//let mut ar = tar::Archive::new(copyreader);
|
||||
let mut buf = vec![];
|
||||
copyreader
|
||||
.read_to_end(&mut buf)
|
||||
.with_context(|| "reading base backup from page server failed")?;
|
||||
let mut ar = tar::Archive::new(buf.as_slice());
|
||||
ar.unpack(&pgdata)
|
||||
.with_context(|| "extracting page backup failed")?;
|
||||
|
||||
// listen for selected port
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
fsync = off\n\
|
||||
max_connections = 100\n\
|
||||
wal_sender_timeout = 0\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n",
|
||||
address = self.address.ip(),
|
||||
port = self.address.port()
|
||||
),
|
||||
);
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeepr or
|
||||
// page server yet. But this will do for now.
|
||||
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n");
|
||||
|
||||
// Connect it to the page server.
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"page_server_connstring = 'host={} port={}'\n\
|
||||
zenith_timeline='{}'\n",
|
||||
self.pageserver.address().ip(),
|
||||
self.pageserver.address().port(),
|
||||
self.timelineid
|
||||
),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pgdata(&self) -> PathBuf {
|
||||
self.env.repo_path.join("pgdatadirs").join(&self.name)
|
||||
}
|
||||
|
||||
pub fn status(&self) -> &str {
|
||||
let timeout = Duration::from_millis(300);
|
||||
let has_pidfile = self.pgdata().join("postmaster.pid").exists();
|
||||
let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
|
||||
|
||||
match (has_pidfile, can_connect) {
|
||||
(true, true) => "running",
|
||||
(false, false) => "stopped",
|
||||
(true, false) => "crashed",
|
||||
(false, true) => "running, no pidfile",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_conf(&self, config: &str, opts: &str) {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata().join(config).to_str().unwrap())
|
||||
.unwrap()
|
||||
.write_all(opts.as_bytes())
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str]) -> Result<()> {
|
||||
let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl");
|
||||
|
||||
let pg_ctl = Command::new(pg_ctl_path)
|
||||
.args(
|
||||
[
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata().to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata().join("log").to_str().unwrap(),
|
||||
],
|
||||
args,
|
||||
]
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.with_context(|| "pg_ctl failed")?;
|
||||
if !pg_ctl.success() {
|
||||
anyhow::bail!("pg_ctl failed");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Announce the node's connection string on stdout, then run `pg_ctl start`.
pub fn start(&self) -> Result<()> {
    let connstr = self.connstr();
    println!("Starting postgres node at '{}'", connstr);

    let args = ["start"];
    self.pg_ctl(&args)
}
|
||||
|
||||
/// Run `pg_ctl restart` for this node.
pub fn restart(&self) -> Result<()> {
    let args = ["restart"];
    self.pg_ctl(&args)
}
|
||||
|
||||
/// Run `pg_ctl -m immediate stop` for this node.
pub fn stop(&self) -> Result<()> {
    let args = ["-m", "immediate", "stop"];
    self.pg_ctl(&args)
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!(
|
||||
"host={} port={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
self.whoami()
|
||||
)
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
fn dump_log_file(&self) {
|
||||
if let Ok(mut file) = File::open(self.env.repo_path.join("pageserver.log")) {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- Dump pageserver.log:\n{}", buffer);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
let result = client.query(sql, &[]);
|
||||
if result.is_err() {
|
||||
self.dump_log_file();
|
||||
}
|
||||
result.unwrap()
|
||||
}
|
||||
|
||||
pub fn open_psql(&self, db: &str) -> Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls).unwrap()
|
||||
}
|
||||
|
||||
pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
|
||||
let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["--ztimelineid", &self.timelineid.to_string()])
|
||||
.args(&["-s", wal_acceptors])
|
||||
.args(&["-h", &self.address.ip().to_string()])
|
||||
.args(&["-p", &self.address.port().to_string()])
|
||||
.arg("-v")
|
||||
.stderr(
|
||||
OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(self.pgdata().join("safekeeper_proxy.log"))
|
||||
.unwrap(),
|
||||
)
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pg_regress(&self) {
|
||||
self.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
let data_dir = zenith_repo_dir();
|
||||
let regress_run_path = data_dir.join("regress");
|
||||
fs::create_dir_all(®ress_run_path).unwrap();
|
||||
fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let _regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGPORT", self.address.port().to_string())
|
||||
.env("PGUSER", self.whoami())
|
||||
.env("PGHOST", self.address.ip().to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
}
|
||||
|
||||
pub fn pg_bench(&self, clients: u32, seconds: u32) {
|
||||
let port = self.address.port().to_string();
|
||||
let clients = clients.to_string();
|
||||
let seconds = seconds.to_string();
|
||||
let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&["-i", "-p", port.as_str(), "postgres"])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench -i");
|
||||
let _pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&[
|
||||
"-p",
|
||||
port.as_str(),
|
||||
"-T",
|
||||
seconds.as_str(),
|
||||
"-P",
|
||||
"1",
|
||||
"-c",
|
||||
clients.as_str(),
|
||||
"-M",
|
||||
"prepared",
|
||||
"postgres",
|
||||
])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench run");
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
if self.is_test {
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
//
|
||||
// Local control plane.
|
||||
//
|
||||
// Can start, configure and stop postgres instances running as local processes.
|
||||
//
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
|
||||
pub mod compute;
|
||||
pub mod local_env;
|
||||
pub mod storage;
|
||||
@@ -1,389 +0,0 @@
|
||||
//
|
||||
// This module is responsible for locating and loading paths in a local setup.
|
||||
//
|
||||
// Now it also provides init method which acts like a stub for proper installation
|
||||
// script which will use local paths.
|
||||
//
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use rand::Rng;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
use anyhow::Result;
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
use pageserver::zenith_repo_dir;
|
||||
use pageserver::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils;
|
||||
|
||||
//
|
||||
// This data structure represents deserialized zenith config, which should be
|
||||
// located in ~/.zenith
|
||||
//
|
||||
// TODO: should we also support ZENITH_CONF env var?
|
||||
//
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct LocalEnv {
|
||||
// Path to the Repository. Here page server and compute nodes will create and store their data.
|
||||
pub repo_path: PathBuf,
|
||||
|
||||
// System identifier, from the PostgreSQL control file
|
||||
pub systemid: u64,
|
||||
|
||||
// Path to postgres distribution. It's expected that "bin", "include",
|
||||
// "lib", "share" from postgres distribution are there. If at some point
|
||||
// in time we will be able to run against vanilla postgres we may split that
|
||||
// to four separate paths and match OS-specific installation layout.
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Path to pageserver binary.
|
||||
pub zenith_distrib_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
// postgres installation
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init() -> Result<()> {
|
||||
// check if config already exists
|
||||
let repo_path = zenith_repo_dir();
|
||||
if repo_path.exists() {
|
||||
anyhow::bail!(
|
||||
"{} already exists. Perhaps already initialized?",
|
||||
repo_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// Now we can run init only from crate directory, so check that current dir is our crate.
|
||||
// Use 'pageserver/Cargo.toml' existence as evidendce.
|
||||
let cargo_path = env::current_dir()?;
|
||||
if !cargo_path.join("pageserver/Cargo.toml").exists() {
|
||||
anyhow::bail!(
|
||||
"Current dirrectory does not look like a zenith repo. \
|
||||
Please, run 'init' from zenith repo root."
|
||||
);
|
||||
}
|
||||
|
||||
// ok, now check that expected binaries are present
|
||||
|
||||
// check postgres
|
||||
let pg_distrib_dir = cargo_path.join("tmp_install");
|
||||
let pg_path = pg_distrib_dir.join("bin/postgres");
|
||||
if !pg_path.exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find postres binary at {}. \
|
||||
Perhaps './pgbuild.sh' is needed to build it first.",
|
||||
pg_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// check pageserver
|
||||
let zenith_distrib_dir = cargo_path.join("target/debug/");
|
||||
let pageserver_path = zenith_distrib_dir.join("pageserver");
|
||||
if !pageserver_path.exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find pageserver binary at {}. Please build it.",
|
||||
pageserver_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// ok, we are good to go
|
||||
let mut conf = LocalEnv {
|
||||
repo_path,
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir,
|
||||
systemid: 0,
|
||||
};
|
||||
init_repo(&mut conf)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
|
||||
let repopath = &local_env.repo_path;
|
||||
fs::create_dir(&repopath)
|
||||
.with_context(|| format!("could not create directory {}", repopath.display()))?;
|
||||
fs::create_dir(repopath.join("pgdatadirs"))?;
|
||||
fs::create_dir(repopath.join("timelines"))?;
|
||||
fs::create_dir(repopath.join("refs"))?;
|
||||
fs::create_dir(repopath.join("refs").join("branches"))?;
|
||||
fs::create_dir(repopath.join("refs").join("tags"))?;
|
||||
println!("created directory structure in {}", repopath.display());
|
||||
|
||||
// Create initial timeline
|
||||
let tli = create_timeline(&local_env, None)?;
|
||||
let timelinedir = repopath.join("timelines").join(tli.to_string());
|
||||
println!("created initial timeline {}", timelinedir.display());
|
||||
|
||||
// Run initdb
|
||||
//
|
||||
// FIXME: we create it temporarily in "tmp" directory, and move it into
|
||||
// the repository. Use "tempdir()" or something? Or just create it directly
|
||||
// in the repo?
|
||||
let initdb_path = local_env.pg_bin_dir().join("initdb");
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", "tmp"])
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap())
|
||||
.env(
|
||||
"DYLD_LIBRARY_PATH",
|
||||
local_env.pg_lib_dir().to_str().unwrap(),
|
||||
)
|
||||
.stdout(Stdio::null())
|
||||
.status()
|
||||
.with_context(|| "failed to execute initdb")?;
|
||||
if !initdb.success() {
|
||||
anyhow::bail!("initdb failed");
|
||||
}
|
||||
println!("initdb succeeded");
|
||||
|
||||
// Read control file to extract the LSN and system id
|
||||
let controlfile =
|
||||
postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?;
|
||||
let systemid = controlfile.system_identifier;
|
||||
let lsn = controlfile.checkPoint;
|
||||
let lsnstr = format!("{:016X}", lsn);
|
||||
|
||||
// Move the initial WAL file
|
||||
fs::rename(
|
||||
"tmp/pg_wal/000000010000000000000001",
|
||||
timelinedir
|
||||
.join("wal")
|
||||
.join("000000010000000000000001.partial"),
|
||||
)?;
|
||||
println!("moved initial WAL file");
|
||||
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all("tmp/pg_wal")?;
|
||||
println!("removed tmp/pg_wal");
|
||||
|
||||
force_crash_recovery(&PathBuf::from("tmp"))?;
|
||||
println!("updated pg_control");
|
||||
|
||||
let target = timelinedir.join("snapshots").join(&lsnstr);
|
||||
fs::rename("tmp", &target)?;
|
||||
println!("moved 'tmp' to {}", target.display());
|
||||
|
||||
// Create 'main' branch to refer to the initial timeline
|
||||
let data = tli.to_string();
|
||||
fs::write(repopath.join("refs").join("branches").join("main"), data)?;
|
||||
println!("created main branch");
|
||||
|
||||
// Also update the system id in the LocalEnv
|
||||
local_env.systemid = systemid;
|
||||
|
||||
// write config
|
||||
let toml = toml::to_string(&local_env)?;
|
||||
fs::write(repopath.join("config"), toml)?;
|
||||
|
||||
println!(
|
||||
"new zenith repository was created in {}",
|
||||
repopath.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// If control file says the cluster was shut down cleanly, modify it, to mark
|
||||
// it as crashed. That forces crash recovery when you start the cluster.
|
||||
//
|
||||
// FIXME:
|
||||
// We currently do this to the initial snapshot in "zenith init". It would
|
||||
// be more natural to do this when the snapshot is restored instead, but we
|
||||
// currently don't have any code to create new snapshots, so it doesn't matter
|
||||
// Or better yet, use a less hacky way of putting the cluster into recovery.
|
||||
// Perhaps create a backup label file in the data directory when it's restored.
|
||||
fn force_crash_recovery(datadir: &Path) -> Result<()> {
|
||||
// Read in the control file
|
||||
let controlfilepath = datadir.to_path_buf().join("global").join("pg_control");
|
||||
let mut controlfile =
|
||||
postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?;
|
||||
|
||||
controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION;
|
||||
|
||||
fs::write(
|
||||
controlfilepath.as_path(),
|
||||
postgres_ffi::encode_pg_control(controlfile),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// check that config file is present
|
||||
pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
toml::from_str(config.as_str()).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
// local env for tests
|
||||
pub fn test_env(testname: &str) -> LocalEnv {
|
||||
fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check");
|
||||
|
||||
let repo_path = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_check/")
|
||||
.join(testname);
|
||||
|
||||
// Remove remnants of old test repo
|
||||
let _ = fs::remove_dir_all(&repo_path);
|
||||
|
||||
let mut local_env = LocalEnv {
|
||||
repo_path,
|
||||
pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
|
||||
zenith_distrib_dir: cargo_bin_dir(),
|
||||
systemid: 0,
|
||||
};
|
||||
init_repo(&mut local_env).expect("could not initialize zenith repository");
|
||||
local_env
|
||||
}
|
||||
|
||||
/// Find the directory where the binaries were put (i.e. target/debug/).
///
/// Takes the path of the running executable, strips the file name, and
/// strips one more component if the remaining path ends in `deps`.
pub fn cargo_bin_dir() -> PathBuf {
    let mut bin_dir = std::env::current_exe().unwrap();

    // Drop the executable's own file name.
    bin_dir.pop();

    // Step out of a trailing `deps` directory, if any.
    let inside_deps = bin_dir.ends_with("deps");
    if inside_deps {
        bin_dir.pop();
    }

    bin_dir
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PointInTime {
|
||||
pub timelineid: ZTimelineId,
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
fn create_timeline(local_env: &LocalEnv, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
// Create initial timeline
|
||||
let mut tli_buf = [0u8; 16];
|
||||
rand::thread_rng().fill(&mut tli_buf);
|
||||
let timelineid = ZTimelineId::from(tli_buf);
|
||||
|
||||
let timelinedir = repopath.join("timelines").join(timelineid.to_string());
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("snapshots"))?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!(
|
||||
"{}@{:X}/{:X}",
|
||||
ancestor.timelineid,
|
||||
ancestor.lsn >> 32,
|
||||
ancestor.lsn & 0xffffffff
|
||||
);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
/// Parse an LSN in the format used in filenames: a plain hexadecimal number.
///
/// For example: 00000000015D3DD8
fn parse_lsn(s: &str) -> std::result::Result<u64, std::num::ParseIntError> {
    const HEX_RADIX: u32 = 16;
    u64::from_str_radix(s, HEX_RADIX)
}
|
||||
|
||||
// Create a new branch in the repository (for the "zenith branch" subcommand)
|
||||
pub fn create_branch(
|
||||
local_env: &LocalEnv,
|
||||
branchname: &str,
|
||||
startpoint: PointInTime,
|
||||
) -> Result<()> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
// create a new timeline for it
|
||||
let newtli = create_timeline(local_env, Some(startpoint))?;
|
||||
let newtimelinedir = repopath.join("timelines").join(newtli.to_string());
|
||||
|
||||
let data = newtli.to_string();
|
||||
fs::write(
|
||||
repopath.join("refs").join("branches").join(branchname),
|
||||
data,
|
||||
)?;
|
||||
|
||||
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
|
||||
// TODO: be smarter and avoid the copying...
|
||||
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?;
|
||||
let copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), ©_opts)?;
|
||||
|
||||
let oldtimelinedir = repopath
|
||||
.join("timelines")
|
||||
.join(startpoint.timelineid.to_string());
|
||||
let mut copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
copy_opts.content_only = true;
|
||||
fs_extra::dir::copy(
|
||||
oldtimelinedir.join("wal"),
|
||||
newtimelinedir.join("wal"),
|
||||
©_opts,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Find the end of valid WAL in a wal directory
|
||||
pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u64> {
|
||||
let repopath = &local_env.repo_path;
|
||||
let waldir = repopath
|
||||
.join("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("wal");
|
||||
|
||||
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true);
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
// Find the latest snapshot for a timeline
|
||||
fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
let snapshotsdir = repopath
|
||||
.join("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots");
|
||||
let paths = fs::read_dir(&snapshotsdir)?;
|
||||
let mut maxsnapshot: u64 = 0;
|
||||
let mut snapshotdir: Option<PathBuf> = None;
|
||||
for path in paths {
|
||||
let path = path?;
|
||||
let filename = path.file_name().to_str().unwrap().to_owned();
|
||||
if let Ok(lsn) = parse_lsn(&filename) {
|
||||
maxsnapshot = std::cmp::max(lsn, maxsnapshot);
|
||||
snapshotdir = Some(path.path());
|
||||
}
|
||||
}
|
||||
if maxsnapshot == 0 {
|
||||
// TODO: check ancestor timeline
|
||||
anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
|
||||
}
|
||||
|
||||
Ok((maxsnapshot, snapshotdir.unwrap()))
|
||||
}
|
||||
@@ -1,380 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
//
|
||||
// Collection of several example deployments useful for tests.
|
||||
//
|
||||
// I'm intendedly modelling storage and compute control planes as a separate entities
|
||||
// as it is closer to the actual setup.
|
||||
//
|
||||
pub struct TestStorageControlPlane {
|
||||
pub wal_acceptors: Vec<WalAcceptorNode>,
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
pub test_done: AtomicBool,
|
||||
pub repopath: PathBuf,
|
||||
}
|
||||
|
||||
impl TestStorageControlPlane {
|
||||
// Peek into the repository, to grab the timeline ID of given branch
|
||||
pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId {
|
||||
let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname);
|
||||
|
||||
ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
|
||||
}
|
||||
|
||||
// postgres <-> page_server
|
||||
//
|
||||
// Initialize a new repository and configure a page server to run in it
|
||||
//
|
||||
pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let pserver = Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
});
|
||||
pserver.start().unwrap();
|
||||
|
||||
TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn one_page_server_no_start(local_env: &LocalEnv) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let pserver = Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
});
|
||||
|
||||
TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath,
|
||||
}
|
||||
}
|
||||
|
||||
// postgres <-> {wal_acceptor1, wal_acceptor2, ...}
|
||||
pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let mut cplane = TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
}),
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath,
|
||||
};
|
||||
cplane.pageserver.start().unwrap();
|
||||
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)),
|
||||
env: local_env.clone(),
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
let _ = wa.stop();
|
||||
}
|
||||
self.test_done.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn is_running(&self) -> bool {
|
||||
self.test_done.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TestStorageControlPlane {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Control routines for pageserver.
|
||||
//
|
||||
// Used in CLI and tests.
|
||||
//
|
||||
pub struct PageServerNode {
|
||||
kill_on_exit: bool,
|
||||
listen_address: Option<SocketAddr>,
|
||||
pub env: LocalEnv,
|
||||
}
|
||||
|
||||
impl PageServerNode {
|
||||
pub fn from_env(env: &LocalEnv) -> PageServerNode {
|
||||
PageServerNode {
|
||||
kill_on_exit: false,
|
||||
listen_address: None, // default
|
||||
env: env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn address(&self) -> SocketAddr {
|
||||
match self.listen_address {
|
||||
Some(addr) => addr,
|
||||
None => "127.0.0.1:64000".parse().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
self.env.repo_path.clone()
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
self.env.repo_path.join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
println!(
|
||||
"Starting pageserver at '{}' in {}",
|
||||
self.address(),
|
||||
self.repo_path().display()
|
||||
);
|
||||
|
||||
let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver"));
|
||||
cmd.args(&["-l", self.address().to_string().as_str()])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.env("ZENITH_REPO_DIR", self.repo_path())
|
||||
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
anyhow::bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the page server to start up. Wait until it is
|
||||
// open for business.
|
||||
for retries in 1..15 {
|
||||
let client = self.page_server_psql_client();
|
||||
if client.is_ok() {
|
||||
break;
|
||||
} else {
|
||||
println!("page server not responding yet, retrying ({})...", retries);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
let pidfile = self.pid_file();
|
||||
let pid = read_pidfile(&pidfile)?;
|
||||
|
||||
let status = Command::new("kill")
|
||||
.arg(&pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
|
||||
// await for pageserver stop
|
||||
for _ in 0..5 {
|
||||
let stream = TcpStream::connect(self.address());
|
||||
if let Err(_e) = stream {
|
||||
return Ok(());
|
||||
}
|
||||
println!("Stopping pageserver on {}", self.address());
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
// ok, we failed to stop pageserver, let's panic
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to stop pageserver with pid {}", pid);
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address().ip(),
|
||||
self.address().port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{}'", sql);
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(
|
||||
&self,
|
||||
) -> std::result::Result<postgres::Client, postgres::Error> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address().ip(),
|
||||
self.address().port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
if self.kill_on_exit {
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Control routines for WalAcceptor.
|
||||
//
|
||||
// Now used only in test setups.
|
||||
//
|
||||
pub struct WalAcceptorNode {
|
||||
listen: SocketAddr,
|
||||
data_dir: PathBuf,
|
||||
env: LocalEnv,
|
||||
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let status = Command::new(self.env.zenith_distrib_dir.join("wal_acceptor"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.args(&["--systemid", &self.env.systemid.to_string()])
|
||||
// Tell page server it can receive WAL from this WAL safekeeper
|
||||
// FIXME: If there are multiple safekeepers, they will all inform
|
||||
// the page server. Only the last "notification" will stay in effect.
|
||||
// So it's pretty random which safekeeper the page server will connect to
|
||||
.args(&["--pageserver", "127.0.0.1:64000"])
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> std::result::Result<(), io::Error> {
|
||||
println!("Stopping wal acceptor on {}", self.listen);
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
let pid = read_pidfile(&pidfile)?;
|
||||
// Ignores any failures when running this command
|
||||
let _status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Handle to a WAL proposer process, identified by its process id.
pub struct WalProposerNode {
    /// Process id passed to `kill` when the node is stopped.
    pub pid: u32,
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
let status = Command::new("kill")
|
||||
.arg(self.pid.to_string())
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a PID file
///
/// The file should contain an unsigned integer, but its contents are returned
/// unparsed as a String, because our callers only want to pass it back into
/// a subcommand.
///
/// On failure the error is reported on stderr and then returned.
fn read_pidfile(pidfile: &Path) -> std::result::Result<String, io::Error> {
    match fs::read_to_string(pidfile) {
        Ok(contents) => Ok(contents),
        Err(err) => {
            eprintln!("failed to read pidfile {:?}: {:?}", pidfile, err);
            Err(err)
        }
    }
}
|
||||
1
integration_tests/.gitignore
vendored
1
integration_tests/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
tmp_check/
|
||||
@@ -9,9 +9,8 @@ edition = "2018"
|
||||
[dependencies]
|
||||
lazy_static = "1.4.0"
|
||||
rand = "0.8.3"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
|
||||
844
integration_tests/tests/control_plane/mod.rs
Normal file
844
integration_tests/tests/control_plane/mod.rs
Normal file
@@ -0,0 +1,844 @@
|
||||
//
|
||||
// Local control plane.
|
||||
//
|
||||
// Can start, configure and stop postgres instances running as local processes.
|
||||
//
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
|
||||
use std::fs::File;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::{
|
||||
io::Write,
|
||||
net::{IpAddr, Ipv4Addr, SocketAddr},
|
||||
};
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use postgres;
|
||||
|
||||
lazy_static! {
    // Postgres binaries; they are there if postgres was built by
    // 'make postgres' here in the repo.
    pub static ref PG_BIN_DIR: PathBuf =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/bin");

    // Postgres libraries of the same installation.
    pub static ref PG_LIB_DIR: PathBuf =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/lib");

    // Directory holding the binaries built by cargo.
    pub static ref BIN_DIR: PathBuf = cargo_bin_dir();

    // Working directory for the tests, inside this crate's directory.
    pub static ref TEST_WORKDIR: PathBuf =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check");
}
|
||||
|
||||
/// Find the directory where the binaries were put (i.e. target/debug/).
///
/// Takes the path of the running executable, strips the file name, and
/// strips one more component if the remaining path ends in `deps`.
///
/// Panics, including the underlying io error in the message, if the path of
/// the current executable cannot be determined.
pub fn cargo_bin_dir() -> PathBuf {
    let mut pathbuf = std::env::current_exe().unwrap();

    pathbuf.pop();
    if pathbuf.ends_with("deps") {
        pathbuf.pop();
    }

    pathbuf
}
|
||||
|
||||
//
|
||||
// I'm intendedly modelling storage and compute control planes as a separate entities
|
||||
// as it is closer to the actual setup.
|
||||
//
|
||||
pub struct StorageControlPlane {
|
||||
pub wal_acceptors: Vec<WalAcceptorNode>,
|
||||
pub page_servers: Vec<PageServerNode>,
|
||||
}
|
||||
|
||||
impl StorageControlPlane {
|
||||
// postgres <-> page_server
|
||||
pub fn one_page_server(froms3: bool) -> StorageControlPlane {
|
||||
let mut cplane = StorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
page_servers: Vec::new(),
|
||||
};
|
||||
|
||||
let pserver = PageServerNode {
|
||||
page_service_addr: "127.0.0.1:65200".parse().unwrap(),
|
||||
data_dir: TEST_WORKDIR.join("pageserver"),
|
||||
};
|
||||
pserver.init();
|
||||
if froms3 {
|
||||
pserver.start_froms3();
|
||||
} else {
|
||||
pserver.start();
|
||||
}
|
||||
|
||||
cplane.page_servers.push(pserver);
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn fault_tolerant(redundancy: usize) -> StorageControlPlane {
|
||||
let mut cplane = StorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
page_servers: Vec::new(),
|
||||
};
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: TEST_WORKDIR.join(format!("wal_acceptor_{}", i)),
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
wa.stop();
|
||||
}
|
||||
}
|
||||
|
||||
// // postgres <-> wal_acceptor x3 <-> page_server
|
||||
// fn local(&mut self) -> StorageControlPlane {
|
||||
// }
|
||||
|
||||
pub fn page_server_addr(&self) -> &SocketAddr {
|
||||
&self.page_servers[0].page_service_addr
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string().to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let addr = &self.page_servers[0].page_service_addr;
|
||||
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
addr.ip(),
|
||||
addr.port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{}'", sql);
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StorageControlPlane {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
/// A page server process managed by the tests.
pub struct PageServerNode {
    /// Address passed to the pageserver's `-l` option.
    page_service_addr: SocketAddr,
    /// Data directory passed to the pageserver's `-D` option; also holds the
    /// pid file.
    data_dir: PathBuf,
}
|
||||
|
||||
impl PageServerNode {
|
||||
// TODO: method to force redo on a specific relation
|
||||
|
||||
// TODO: make wal-redo-postgres workable without data directory?
|
||||
pub fn init(&self) {
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
|
||||
let datadir_path = self.data_dir.join("wal_redo_pgdata");
|
||||
fs::remove_dir_all(datadir_path.to_str().unwrap()).ok();
|
||||
|
||||
let initdb = Command::new(PG_BIN_DIR.join("initdb"))
|
||||
.args(&["-D", datadir_path.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!("Starting pageserver at '{}'", self.page_service_addr);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("pageserver"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.page_service_addr.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.arg("--skip-recovery")
|
||||
.env_clear()
|
||||
.env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to postres-wal-redo binary
|
||||
.status()
|
||||
.expect("failed to start pageserver");
|
||||
|
||||
if !status.success() {
|
||||
panic!("pageserver start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_froms3(&self) {
|
||||
println!("Starting pageserver at '{}'", self.page_service_addr);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("pageserver"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.page_service_addr.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to postres-wal-redo binary
|
||||
.env("S3_ENDPOINT", "https://127.0.0.1:9000")
|
||||
.env("S3_REGION", "us-east-1")
|
||||
.env("S3_ACCESSKEY", "minioadmin")
|
||||
.env("S3_SECRET", "minioadmin")
|
||||
.status()
|
||||
.expect("failed to start pageserver");
|
||||
|
||||
if !status.success() {
|
||||
panic!("pageserver start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
let pidfile = self.data_dir.join("pageserver.pid");
|
||||
let pid = fs::read_to_string(pidfile).unwrap();
|
||||
let status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle to a locally launched `wal_acceptor` process.
pub struct WalAcceptorNode {
    /// Address handed to the acceptor via `-l`.
    listen: SocketAddr,
    /// Directory handed to the acceptor via `-D`; `wal_acceptor.pid` is
    /// looked up in it when stopping.
    data_dir: PathBuf,
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("wal_acceptor"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
if let Ok(pid) = fs::read_to_string(pidfile) {
|
||||
let _status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
//
|
||||
pub struct ComputeControlPlane<'a> {
|
||||
pg_bin_dir: PathBuf,
|
||||
work_dir: PathBuf,
|
||||
last_assigned_port: u16,
|
||||
storage_cplane: &'a StorageControlPlane,
|
||||
nodes: Vec<Arc<PostgresNode>>,
|
||||
}
|
||||
|
||||
impl ComputeControlPlane<'_> {
|
||||
pub fn local(storage_cplane: &StorageControlPlane) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
pg_bin_dir: PG_BIN_DIR.to_path_buf(),
|
||||
work_dir: TEST_WORKDIR.to_path_buf(),
|
||||
last_assigned_port: 65431,
|
||||
storage_cplane: storage_cplane,
|
||||
nodes: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: check port availability and
|
||||
fn get_port(&mut self) -> u16 {
|
||||
let port = self.last_assigned_port + 1;
|
||||
self.last_assigned_port += 1;
|
||||
port
|
||||
}
|
||||
|
||||
pub fn new_vanilla_node<'a>(&mut self) -> &Arc<PostgresNode> {
|
||||
// allocate new node entry with generated port
|
||||
let node_id = self.nodes.len() + 1;
|
||||
let node = PostgresNode {
|
||||
_node_id: node_id,
|
||||
port: self.get_port(),
|
||||
ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
|
||||
pgdata: self.work_dir.join(format!("compute/pg{}", node_id)),
|
||||
pg_bin_dir: self.pg_bin_dir.clone(),
|
||||
};
|
||||
self.nodes.push(Arc::new(node));
|
||||
let node = self.nodes.last().unwrap();
|
||||
|
||||
// initialize data directory
|
||||
fs::remove_dir_all(node.pgdata.to_str().unwrap()).ok();
|
||||
let initdb_path = self.pg_bin_dir.join("initdb");
|
||||
println!("initdb_path: {}", initdb_path.to_str().unwrap());
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", node.pgdata.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
|
||||
// // allow local replication connections
|
||||
// node.append_conf("pg_hba.conf", format!("\
|
||||
// host replication all {}/32 sspi include_realm=1 map=regress\n\
|
||||
// ", node.ip).as_str());
|
||||
|
||||
// listen for selected port
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n\
|
||||
",
|
||||
address = node.ip,
|
||||
port = node.port
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
// Init compute node without files, only datadir structure
|
||||
// use initdb --compute-node flag and GUC 'computenode_mode'
|
||||
// to distinguish the node
|
||||
pub fn new_minimal_node<'a>(&mut self) -> &Arc<PostgresNode> {
|
||||
// allocate new node entry with generated port
|
||||
let node_id = self.nodes.len() + 1;
|
||||
let node = PostgresNode {
|
||||
_node_id: node_id,
|
||||
port: self.get_port(),
|
||||
ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
|
||||
pgdata: self.work_dir.join(format!("compute/pg{}", node_id)),
|
||||
pg_bin_dir: self.pg_bin_dir.clone(),
|
||||
};
|
||||
self.nodes.push(Arc::new(node));
|
||||
let node = self.nodes.last().unwrap();
|
||||
|
||||
// initialize data directory
|
||||
fs::remove_dir_all(node.pgdata.to_str().unwrap()).ok();
|
||||
let initdb_path = self.pg_bin_dir.join("initdb");
|
||||
println!("initdb_path: {}", initdb_path.to_str().unwrap());
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", node.pgdata.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.arg("--compute-node")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
|
||||
// // allow local replication connections
|
||||
// node.append_conf("pg_hba.conf", format!("\
|
||||
// host replication all {}/32 sspi include_realm=1 map=regress\n\
|
||||
// ", node.ip).as_str());
|
||||
|
||||
// listen for selected port
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n\
|
||||
computenode_mode = true\n\
|
||||
",
|
||||
address = node.ip,
|
||||
port = node.port
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_node_wo_data(&mut self) -> Arc<PostgresNode> {
|
||||
let storage_cplane = self.storage_cplane;
|
||||
let node = self.new_minimal_node();
|
||||
|
||||
let pserver = storage_cplane.page_server_addr();
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
page_server_connstring = 'host={} port={}'\n\
|
||||
",
|
||||
pserver.ip(),
|
||||
pserver.port()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node.clone()
|
||||
}
|
||||
|
||||
pub fn new_node(&mut self) -> Arc<PostgresNode> {
|
||||
let storage_cplane = self.storage_cplane;
|
||||
let node = self.new_vanilla_node();
|
||||
|
||||
let pserver = storage_cplane.page_server_addr();
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
page_server_connstring = 'host={} port={}'\n\
|
||||
",
|
||||
pserver.ip(),
|
||||
pserver.port()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node.clone()
|
||||
}
|
||||
|
||||
pub fn new_master_node(&mut self) -> Arc<PostgresNode> {
|
||||
let node = self.new_vanilla_node();
|
||||
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
"synchronous_standby_names = 'safekeeper_proxy'\n\
|
||||
",
|
||||
);
|
||||
node.clone()
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Handle to a spawned `safekeeper_proxy` process, identified only by pid.
pub struct WalProposerNode {
    /// OS process id of the spawned proxy.
    pid: u32,
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
let status = Command::new("kill")
|
||||
.arg(self.pid.to_string())
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// A local Postgres compute node managed through `pg_ctl`.
pub struct PostgresNode {
    /// Sequence number assigned by the compute control plane.
    _node_id: usize,
    /// Port written to `postgresql.conf` and used for client connections.
    pub port: u16,
    /// Address written to `listen_addresses` and used for client connections.
    pub ip: IpAddr,
    /// Data directory of the node.
    pgdata: PathBuf,
    /// Directory containing `pg_ctl`, `pg_resetwal` and friends.
    pg_bin_dir: PathBuf,
}
|
||||
|
||||
impl PostgresNode {
|
||||
pub fn append_conf(&self, config: &str, opts: &str) {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata.join(config).to_str().unwrap())
|
||||
.unwrap()
|
||||
.write_all(opts.as_bytes())
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str], check_ok: bool) {
|
||||
let pg_ctl_path = self.pg_bin_dir.join("pg_ctl");
|
||||
let pg_ctl = Command::new(pg_ctl_path)
|
||||
.args(
|
||||
[
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata.join("log").to_str().unwrap(),
|
||||
],
|
||||
args,
|
||||
]
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute pg_ctl");
|
||||
|
||||
if check_ok && !pg_ctl.success() {
|
||||
panic!("pg_ctl failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self, storage_cplane: &StorageControlPlane) {
|
||||
if storage_cplane.page_servers.len() != 0 {
|
||||
let _res =
|
||||
storage_cplane.page_server_psql(format!("callmemaybe {}", self.connstr()).as_str());
|
||||
}
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"], true);
|
||||
}
|
||||
|
||||
pub fn restart(&self) {
|
||||
self.pg_ctl(&["restart"], true);
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"], true);
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!("host={} port={} user={}", self.ip, self.port, self.whoami())
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.ip,
|
||||
self.port,
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
client.query(sql, &[]).unwrap()
|
||||
}
|
||||
|
||||
pub fn open_psql(&self, db: &str) -> Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.ip,
|
||||
self.port,
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls).unwrap()
|
||||
}
|
||||
|
||||
pub fn get_pgdata(&self) -> Option<&str> {
|
||||
self.pgdata.to_str()
|
||||
}
|
||||
|
||||
// Request from pageserver stub controlfile, respective xlog
|
||||
// and a bunch of files needed to start computenode
|
||||
//
|
||||
// NOTE this "file" request is a crutch.
|
||||
// It asks pageserver to write requested page to the provided filepath
|
||||
// and thus only works locally.
|
||||
// TODO receive pages via some libpq protocol.
|
||||
// The problem I've met is that nonrelfiles are not valid utf8 and cannot be
|
||||
// handled by simple_query(). that expects test.
|
||||
// And reqular query() uses prepared queries.
|
||||
|
||||
// TODO pass sysid as parameter
|
||||
pub fn setup_compute_node(&self, sysid: u64, storage_cplane: &StorageControlPlane) {
|
||||
let mut query;
|
||||
//Request pg_control from pageserver
|
||||
query = format!(
|
||||
"file {}/global/pg_control,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
1664, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
42, //forknum pg_control
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
//Request pg_xact and pg_multixact from pageserver
|
||||
//We need them for initial pageserver startup and authentication
|
||||
//TODO figure out which block number we really need
|
||||
query = format!(
|
||||
"file {}/pg_xact/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
44, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
query = format!(
|
||||
"file {}/pg_multixact/offsets/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
45, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
query = format!(
|
||||
"file {}/pg_multixact/members/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
46, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
//Request a few shared catalogs needed for authentication
|
||||
//Without them we cannot setup connection with pageserver to request further pages
|
||||
let reloids = [1260, 1261, 1262, 2396];
|
||||
for reloid in reloids.iter() {
|
||||
//FIXME request all blocks from file, not just 10
|
||||
for blkno in 0..10 {
|
||||
query = format!(
|
||||
"file {}/global/{},{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
reloid, //suse it as filename
|
||||
sysid as u64, //sysid
|
||||
1664, //tablespace
|
||||
0, //dboid
|
||||
reloid, //reloid
|
||||
0, //forknum
|
||||
blkno, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
}
|
||||
}
|
||||
|
||||
fs::create_dir(format!("{}/base/13006", self.pgdata.to_str().unwrap())).unwrap();
|
||||
fs::create_dir(format!("{}/base/13007", self.pgdata.to_str().unwrap())).unwrap();
|
||||
|
||||
//FIXME figure out what wal file we need to successfully start
|
||||
let walfilepath = format!(
|
||||
"{}/pg_wal/000000010000000000000001",
|
||||
self.pgdata.to_str().unwrap()
|
||||
);
|
||||
fs::copy(
|
||||
"/home/anastasia/zenith/zenith/tmp_check/pgdata/pg_wal/000000010000000000000001",
|
||||
walfilepath,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
println!("before resetwal ");
|
||||
|
||||
let pg_resetwal_path = self.pg_bin_dir.join("pg_resetwal");
|
||||
|
||||
// Now it does nothing, just prints existing content of pg_control.
|
||||
// TODO update values with most recent lsn, xid, oid requested from pageserver
|
||||
let pg_resetwal = Command::new(pg_resetwal_path)
|
||||
.args(&["-D", self.pgdata.to_str().unwrap()])
|
||||
.arg("-n") //dry run
|
||||
//.arg("-f")
|
||||
//.args(&["--next-transaction-id", "100500"])
|
||||
//.args(&["--next-oid", "17000"])
|
||||
//.args(&["--next-transaction-id", "100500"])
|
||||
.status()
|
||||
.expect("failed to execute pg_resetwal");
|
||||
|
||||
if !pg_resetwal.success() {
|
||||
panic!("pg_resetwal failed");
|
||||
}
|
||||
|
||||
println!("setup done");
|
||||
}
|
||||
|
||||
pub fn start_proxy(&self, wal_acceptors: String) -> WalProposerNode {
|
||||
let proxy_path = PG_BIN_DIR.join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["-s", &wal_acceptors])
|
||||
.args(&["-h", &self.ip.to_string()])
|
||||
.args(&["-p", &self.port.to_string()])
|
||||
.arg("-v")
|
||||
.stderr(File::create(TEST_WORKDIR.join("safepkeeper_proxy.log")).unwrap())
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_to_s3(&self) {
|
||||
println!("Push to s3 node at '{}'", self.pgdata.to_str().unwrap());
|
||||
|
||||
let zenith_push_path = self.pg_bin_dir.join("zenith_push");
|
||||
println!("zenith_push_path: {}", zenith_push_path.to_str().unwrap());
|
||||
|
||||
let status = Command::new(zenith_push_path)
|
||||
.args(&["-D", self.pgdata.to_str().unwrap()])
|
||||
.env_clear()
|
||||
.env("S3_ENDPOINT", "https://127.0.0.1:9000")
|
||||
.env("S3_REGION", "us-east-1")
|
||||
.env("S3_ACCESSKEY", "minioadmin")
|
||||
.env("S3_SECRET", "minioadmin")
|
||||
// .env("S3_BUCKET", "zenith-testbucket")
|
||||
.status()
|
||||
.expect("failed to push node to s3");
|
||||
|
||||
if !status.success() {
|
||||
panic!("zenith_push failed");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO
|
||||
pub fn pg_bench() {}
|
||||
pub fn pg_regress() {}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.pgdata.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn regress_check(pg: &PostgresNode) {
|
||||
pg.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
|
||||
let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress");
|
||||
fs::create_dir_all(regress_run_path.clone()).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let _regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", PG_BIN_DIR.to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.env("PGPORT", pg.port.to_string())
|
||||
.env("PGUSER", pg.whoami())
|
||||
.env("PGHOST", pg.ip.to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
}
|
||||
@@ -1,11 +1,7 @@
|
||||
// test node resettlement to an empty datadir
|
||||
|
||||
// TODO
|
||||
/*
|
||||
#[test]
|
||||
fn test_resettlement() {}
|
||||
|
||||
// test seq scan of everything after restart
|
||||
#[test]
|
||||
fn test_cold_seqscan() {}
|
||||
*/
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
// TODO
|
||||
/*
|
||||
#[test]
|
||||
fn test_actions() {}
|
||||
|
||||
#[test]
|
||||
fn test_regress() {}
|
||||
*/
|
||||
|
||||
@@ -1,24 +1,29 @@
|
||||
// mod control_plane;
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::PointInTime;
|
||||
use control_plane::storage::TestStorageControlPlane;
|
||||
#[allow(dead_code)]
|
||||
mod control_plane;
|
||||
use std::thread::sleep;
|
||||
use std::time::Duration;
|
||||
|
||||
use control_plane::ComputeControlPlane;
|
||||
use control_plane::StorageControlPlane;
|
||||
|
||||
// XXX: force all redo at the end
|
||||
// -- restart + seqscan won't read deleted stuff
|
||||
// -- pageserver api endpoint to check all rels
|
||||
#[test]
|
||||
fn test_redo_cases() {
|
||||
let local_env = local_env::test_env("test_redo_cases");
|
||||
|
||||
//Handcrafted cases with wal records that are (were) problematic for redo.
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_redo_cases() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -27,7 +32,7 @@ fn test_redo_cases() {
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
@@ -35,9 +40,9 @@ fn test_redo_cases() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
assert_eq!(count, 5050);
|
||||
|
||||
// check 'create table as'
|
||||
//check 'create table as'
|
||||
node.safe_psql("postgres", "CREATE TABLE t2 AS SELECT * FROM t");
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
@@ -45,72 +50,45 @@ fn test_redo_cases() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
assert_eq!(count, 5050);
|
||||
}
|
||||
|
||||
// Runs pg_regress on a compute node
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_regress() {
|
||||
let local_env = local_env::test_env("test_regress");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
node.pg_regress();
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
control_plane::regress_check(&node);
|
||||
}
|
||||
|
||||
// Runs pg_bench on a compute node
|
||||
// Run two postgres instances on one pageserver
|
||||
#[test]
|
||||
fn pgbench() {
|
||||
let local_env = local_env::test_env("pgbench");
|
||||
|
||||
#[ignore]
|
||||
fn test_pageserver_multitenancy() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
// Allocate postgres instance, but don't start
|
||||
let node1 = compute_cplane.new_node();
|
||||
let node2 = compute_cplane.new_node();
|
||||
node1.start(&storage_cplane);
|
||||
node2.start(&storage_cplane);
|
||||
|
||||
node.pg_bench(10, 100);
|
||||
}
|
||||
|
||||
// Run two postgres instances on one pageserver, on different timelines
|
||||
#[test]
|
||||
fn test_pageserver_two_timelines() {
|
||||
let local_env = local_env::test_env("test_pageserver_two_timelines");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
|
||||
// Create new branch at the end of 'main'
|
||||
let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
|
||||
local_env::create_branch(
|
||||
&local_env,
|
||||
"experimental",
|
||||
PointInTime {
|
||||
timelineid: maintli,
|
||||
lsn: startpoint,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
let experimentaltli = storage_cplane.get_branch_timeline("experimental");
|
||||
|
||||
// Launch postgres instances on both branches
|
||||
let node1 = compute_cplane.new_test_node(maintli);
|
||||
let node2 = compute_cplane.new_test_node(experimentaltli);
|
||||
node1.start().unwrap();
|
||||
node2.start().unwrap();
|
||||
// XXX: add some extension func to postgres to check walsender conn
|
||||
// XXX: or better just drop that
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
// check node1
|
||||
node1.safe_psql(
|
||||
@@ -119,7 +97,7 @@ fn test_pageserver_two_timelines() {
|
||||
);
|
||||
node1.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
|
||||
);
|
||||
let count: i64 = node1
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
@@ -127,7 +105,7 @@ fn test_pageserver_two_timelines() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
assert_eq!(count, 5050);
|
||||
|
||||
// check node2
|
||||
node2.safe_psql(
|
||||
@@ -136,7 +114,7 @@ fn test_pageserver_two_timelines() {
|
||||
);
|
||||
node2.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(100000,200000), 'payload'",
|
||||
"INSERT INTO t SELECT generate_series(100,200), 'payload'",
|
||||
);
|
||||
let count: i64 = node2
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
@@ -144,5 +122,89 @@ fn test_pageserver_two_timelines() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 15000150000);
|
||||
assert_eq!(count, 15150);
|
||||
}
|
||||
|
||||
#[test]
#[ignore]
// Start pageserver using an s3 base image.
//
// Requires a working minio with this hardcoded setup:
// .env("S3_ENDPOINT", "https://127.0.0.1:9000")
// .env("S3_REGION", "us-east-1")
// .env("S3_ACCESSKEY", "minioadmin")
// .env("S3_SECRET", "minioadmin")
// .env("S3_BUCKET", "zenith-testbucket")
// TODO use env variables in test
fn test_pageserver_recovery() {
    // TODO this is a hardcoded sysid for one particular cluster. Get it somewhere
    const SYSID: u64 = 6947041219207877724;

    // This test expects that the image is already uploaded to s3.
    // To upload it, use zenith_push before the test (see node.push_to_s3() for details)
    let storage = StorageControlPlane::one_page_server(true);
    let mut compute = ComputeControlPlane::local(&storage);

    // Wait while the daemon uploads pages from s3
    sleep(Duration::from_secs(15));

    let restored = compute.new_node_wo_data();
    restored.setup_compute_node(SYSID, &storage);
    restored.start(&storage);

    let relnames = restored.safe_psql("postgres", "SELECT relname from pg_class;");

    assert_eq!(relnames.len(), 395);
}
|
||||
|
||||
#[test]
#[ignore]
// Scenario for a future test. Not implemented yet
fn test_pageserver_node_switch() {
    // Create pageserver
    let storage_cplane = StorageControlPlane::one_page_server(false);
    let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);

    // Create regular node
    let node = compute_cplane.new_node();
    node.start(&storage_cplane);

    node.safe_psql(
        "postgres",
        "CREATE TABLE t(key int primary key, value text)",
    );
    node.safe_psql(
        "postgres",
        "INSERT INTO t SELECT generate_series(1,100), 'payload'",
    );
    let count: i64 = node
        .safe_psql("postgres", "SELECT sum(key) FROM t")
        .first()
        .unwrap()
        .get(0);
    println!("sum = {}", count);
    assert_eq!(count, 5050);

    // Push all node files to s3
    // TODO upload them directly to pageserver
    node.push_to_s3();
    // Upload data from s3 to pageserver
    // storage_cplane.upload_from_s3() // Not implemented yet

    // Shut down the node
    node.stop();

    // Create new node without files
    let node_restored = compute_cplane.new_node_wo_data();

    // Setup minimal set of files needed to start node and setup pageserver connection
    // TODO 6947041219207877724 is a hardcoded sysid. Get it from node
    node_restored.setup_compute_node(6947041219207877724, &storage_cplane);

    // Start compute node without files
    node_restored.start(&storage_cplane);

    // Ensure that it has the table created on the initial node.
    // The table holds the 100 rows inserted above; 5050 is the sum of the
    // keys, not the row count, so the previous `rows.len() == 5050` check
    // could never pass.
    let rows = node_restored.safe_psql("postgres", "SELECT key from t;");
    assert_eq!(rows.len(), 100);
}
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
// Restart acceptors one by one while compute is under the load.
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::PointInTime;
|
||||
use control_plane::storage::TestStorageControlPlane;
|
||||
use pageserver::ZTimelineId;
|
||||
#[allow(dead_code)]
|
||||
mod control_plane;
|
||||
use control_plane::ComputeControlPlane;
|
||||
use control_plane::StorageControlPlane;
|
||||
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
@@ -12,20 +11,18 @@ use std::{thread, time};
|
||||
|
||||
#[test]
|
||||
fn test_acceptors_normal_work() {
|
||||
let local_env = local_env::test_env("test_acceptors_normal_work");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -46,97 +43,24 @@ fn test_acceptors_normal_work() {
|
||||
// check wal files equality
|
||||
}
|
||||
|
||||
// Exercise one page server backed by several WAL safekeepers while a separate
// compute node runs against each of several timelines (branches).
#[test]
fn test_many_timelines() {
    const REDUNDANCY: usize = 3;
    const N_TIMELINES: usize = 5;

    // Fresh repository plus the storage side: safekeepers and page server.
    let local_env = local_env::test_env("test_many_timelines");
    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();

    // The main branch goes first; every additional branch is forked from it
    // at the end of its WAL as observed right now.
    let maintli = storage_cplane.get_branch_timeline("main");
    let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
    let mut timelines: Vec<ZTimelineId> = vec![maintli];
    for i in 1..N_TIMELINES {
        let branchname = format!("experimental{}", i);
        let fork_point = PointInTime {
            timelineid: maintli,
            lsn: startpoint,
        };
        local_env::create_branch(&local_env, &branchname, fork_point).unwrap();
        timelines.push(storage_cplane.get_branch_timeline(&branchname));
    }

    // Start postgres (and its proxy) on each timeline, keeping a handle to
    // every node for the checks below.
    let nodes: Vec<_> = timelines
        .into_iter()
        .map(|tli| {
            let node = compute_cplane.new_test_node(tli);
            let handle = node.clone();
            node.start().unwrap();
            node.start_proxy(&wal_acceptors);
            handle
        })
        .collect();

    // Create the schema everywhere first ...
    nodes.iter().for_each(|node| {
        node.safe_psql(
            "postgres",
            "CREATE TABLE t(key int primary key, value text)",
        );
    });

    // ... then load the data ...
    nodes.iter().for_each(|node| {
        node.safe_psql(
            "postgres",
            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
        );
    });

    // ... and finally verify it: sum(1..=100000) == 5000050000.
    for node in nodes.iter() {
        let rows = node.safe_psql("postgres", "SELECT sum(key) FROM t");
        let count: i64 = rows.first().unwrap().get(0);
        println!("sum = {}", count);
        assert_eq!(count, 5000050000);
    }
}
|
||||
|
||||
// Majority is always alive
|
||||
#[test]
|
||||
fn test_acceptors_restarts() {
|
||||
let local_env = local_env::test_env("test_acceptors_restarts");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
const FAULT_PROBABILITY: f32 = 0.01;
|
||||
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
let mut failed_node: Option<usize> = None;
|
||||
|
||||
// check basic work with table
|
||||
@@ -156,7 +80,7 @@ fn test_acceptors_restarts() {
|
||||
} else {
|
||||
let node: usize = rng.gen_range(0..REDUNDANCY);
|
||||
failed_node = Some(node);
|
||||
storage_cplane.wal_acceptors[node].stop().unwrap();
|
||||
storage_cplane.wal_acceptors[node].stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -169,7 +93,7 @@ fn test_acceptors_restarts() {
|
||||
assert_eq!(count, 500500);
|
||||
}
|
||||
|
||||
fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
|
||||
fn start_acceptor(cplane: &Arc<StorageControlPlane>, no: usize) {
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
thread::sleep(time::Duration::from_secs(1));
|
||||
@@ -181,23 +105,20 @@ fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
|
||||
// them again and check that nothing was lost. Repeat.
|
||||
// N_CRASHES env var
|
||||
#[test]
|
||||
fn test_acceptors_unavailability() {
|
||||
let local_env = local_env::test_env("test_acceptors_unavailability");
|
||||
|
||||
fn test_acceptors_unavalability() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 2;
|
||||
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -208,7 +129,7 @@ fn test_acceptors_unavailability() {
|
||||
psql.execute("INSERT INTO t values (1, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
storage_cplane.wal_acceptors[0].stop().unwrap();
|
||||
storage_cplane.wal_acceptors[0].stop();
|
||||
let cp = Arc::new(storage_cplane);
|
||||
start_acceptor(&cp, 0);
|
||||
let now = SystemTime::now();
|
||||
@@ -218,7 +139,7 @@ fn test_acceptors_unavailability() {
|
||||
psql.execute("INSERT INTO t values (3, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
cp.wal_acceptors[1].stop().unwrap();
|
||||
cp.wal_acceptors[1].stop();
|
||||
start_acceptor(&cp, 1);
|
||||
psql.execute("INSERT INTO t values (4, 'payload')", &[])
|
||||
.unwrap();
|
||||
@@ -236,16 +157,16 @@ fn test_acceptors_unavailability() {
|
||||
assert_eq!(count, 15);
|
||||
}
|
||||
|
||||
fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
|
||||
fn simulate_failures(cplane: &Arc<StorageControlPlane>) {
|
||||
let mut rng = rand::thread_rng();
|
||||
let n_acceptors = cplane.wal_acceptors.len();
|
||||
let failure_period = time::Duration::from_secs(1);
|
||||
while cplane.is_running() {
|
||||
loop {
|
||||
thread::sleep(failure_period);
|
||||
let mask: u32 = rng.gen_range(0..(1 << n_acceptors));
|
||||
for i in 0..n_acceptors {
|
||||
if (mask & (1 << i)) != 0 {
|
||||
cplane.wal_acceptors[i].stop().unwrap();
|
||||
cplane.wal_acceptors[i].stop();
|
||||
}
|
||||
}
|
||||
thread::sleep(failure_period);
|
||||
@@ -260,34 +181,29 @@ fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
|
||||
// Race condition test
|
||||
#[test]
|
||||
fn test_race_conditions() {
|
||||
let local_env = local_env::test_env("test_race_conditions");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
|
||||
let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(
|
||||
&local_env, REDUNDANCY,
|
||||
));
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
|
||||
let cp = storage_cplane.clone();
|
||||
let failures_thread = thread::spawn(move || {
|
||||
simulate_failures(cp);
|
||||
let cplane = Arc::new(storage_cplane);
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
simulate_failures(&cp);
|
||||
});
|
||||
|
||||
let mut psql = node.open_psql("postgres");
|
||||
@@ -302,7 +218,5 @@ fn test_race_conditions() {
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 500500);
|
||||
|
||||
storage_cplane.stop();
|
||||
failures_thread.join().unwrap();
|
||||
cplane.stop();
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3.13"
|
||||
lazy_static = "1.4.0"
|
||||
slog-stdlog = "4.1.0"
|
||||
@@ -25,21 +26,11 @@ clap = "2.33.0"
|
||||
termion = "1.5.6"
|
||||
tui = "0.14.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
rocksdb = "0.16.0"
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres-protocol = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
walkdir = "2"
|
||||
thiserror = "1.0"
|
||||
hex = "0.4.3"
|
||||
tar = "0.4.33"
|
||||
parse_duration = "*"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use tar::Builder;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::ZTimelineId;
|
||||
|
||||
pub fn send_snapshot_tarball(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
snapshotlsn: u64,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut ar = Builder::new(write);
|
||||
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn);
|
||||
let walpath = format!("timelines/{}/wal", timelineid);
|
||||
|
||||
debug!("sending tarball of snapshot in {}", snappath);
|
||||
//ar.append_dir_all("", &snappath)?;
|
||||
|
||||
for entry in WalkDir::new(&snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
// Shared catalogs are exempt
|
||||
if relpath.starts_with("global/") {
|
||||
trace!("sending shared catalog {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
trace!("sending {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else {
|
||||
trace!("not sending {}", relpath.display());
|
||||
// FIXME: send all files for now
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: also send all the WAL
|
||||
for entry in std::fs::read_dir(&walpath)? {
|
||||
let entry = entry?;
|
||||
let fullpath = &entry.path();
|
||||
let relpath = fullpath.strip_prefix(&walpath).unwrap();
|
||||
|
||||
if !entry.path().is_file() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let archive_fname = relpath.to_str().unwrap();
|
||||
let archive_fname = archive_fname
|
||||
.strip_suffix(".partial")
|
||||
.unwrap_or(&archive_fname);
|
||||
let archive_path = "pg_wal/".to_owned() + archive_fname;
|
||||
ar.append_path_with_name(fullpath, archive_path)?;
|
||||
}
|
||||
|
||||
ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
/// Error describing why a path or file name could not be interpreted as a
/// relation data file.
#[derive(Debug)]
struct FilePathError {
    /// Human-readable description of what was wrong.
    msg: String,
}

impl FilePathError {
    /// Creates an error whose description is `msg`.
    fn new(msg: &str) -> FilePathError {
        FilePathError {
            msg: msg.to_string(),
        }
    }
}

impl From<core::num::ParseIntError> for FilePathError {
    /// Wraps an integer parse failure, so `?` can be used on numeric parts of
    /// a file name. The description is "invalid filename: " followed by the
    /// parse error's own text.
    fn from(e: core::num::ParseIntError) -> Self {
        FilePathError {
            msg: format!("invalid filename: {}", e),
        }
    }
}

impl fmt::Display for FilePathError {
    /// Prints the stored description.
    ///
    /// Printing a fixed string here would throw away the only detail the
    /// error carries (which check failed, or the wrapped parse error).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.msg)
    }
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(fname)
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
}
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
/*
|
||||
* Relation data files can be in one of the following directories:
|
||||
*
|
||||
* global/
|
||||
* shared relations
|
||||
*
|
||||
* base/<db oid>/
|
||||
* regular relations, default tablespace
|
||||
*
|
||||
* pg_tblspc/<tblspc oid>/<tblspc version>/
|
||||
* within a non-default tablespace (the name of the directory
|
||||
* depends on version)
|
||||
*
|
||||
* And the relation data files themselves have a filename like:
|
||||
*
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split('/');
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let _dbnode = u32::from_str_radix(dbnode_str, 10)?;
|
||||
let fname = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
if s.next().is_some() {
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
};
|
||||
|
||||
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
Err(FilePathError::new("tablespaces not supported"))
|
||||
} else {
|
||||
Err(FilePathError::new("invalid relation data file name"))
|
||||
}
|
||||
}
|
||||
|
||||
fn is_rel_file_path(path: &str) -> bool {
|
||||
parse_rel_file_path(path).is_ok()
|
||||
}
|
||||
43
pageserver/src/bin/cli/main.rs
Normal file
43
pageserver/src/bin/cli/main.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings};
|
||||
|
||||
pub mod pg;
|
||||
pub mod snapshot;
|
||||
pub mod storage;
|
||||
mod subcommand;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli_commands = subcommand::ClapCommands {
|
||||
commands: vec![
|
||||
Box::new(pg::PgCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("pg"),
|
||||
}),
|
||||
Box::new(storage::StorageCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("storage"),
|
||||
}),
|
||||
Box::new(snapshot::SnapshotCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("snapshot"),
|
||||
}),
|
||||
],
|
||||
};
|
||||
|
||||
let matches = App::new("zenith")
|
||||
.about("Zenith CLI")
|
||||
.version("1.0")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommands(cli_commands.generate())
|
||||
.get_matches();
|
||||
|
||||
if let Some(subcommand) = matches.subcommand_name() {
|
||||
println!("'git {}' was used", subcommand);
|
||||
}
|
||||
|
||||
match matches.subcommand() {
|
||||
("pg", Some(sub_args)) => cli_commands.commands[0].run(sub_args.clone())?,
|
||||
("storage", Some(sub_args)) => cli_commands.commands[1].run(sub_args.clone())?,
|
||||
("snapshot", Some(sub_args)) => cli_commands.commands[2].run(sub_args.clone())?,
|
||||
("", None) => println!("No subcommand"),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
105
pageserver/src/bin/cli/pg.rs
Normal file
105
pageserver/src/bin/cli/pg.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct PgCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for PgCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith compute nodes")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list").about("List existing compute nodes"))
|
||||
.subcommand(
|
||||
App::new("create")
|
||||
.about(
|
||||
"Create (init) new data directory using given storage and start postgres",
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("storage")
|
||||
.short("s")
|
||||
.long("storage")
|
||||
.takes_value(true)
|
||||
.help("Name of the storage node to use"),
|
||||
)
|
||||
//TODO should it be just name of uploaded snapshot or some path?
|
||||
.arg(
|
||||
Arg::with_name("snapshot")
|
||||
.long("snapshot")
|
||||
.takes_value(true)
|
||||
.help("Name of the snapshot to use"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("nostart")
|
||||
.long("no-start")
|
||||
.takes_value(false)
|
||||
.help("Don't start postgres on the created node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("destroy")
|
||||
.about("Stop postgres and destroy node's data directory")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("start")
|
||||
.about("Start postgres on the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("replica")
|
||||
.long("replica")
|
||||
.takes_value(false)
|
||||
.help("Start the compute node as replica"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("stop")
|
||||
.about("Stop postgres on the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("show")
|
||||
.about("Show info about the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run PgCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
27
pageserver/src/bin/cli/snapshot.rs
Normal file
27
pageserver/src/bin/cli/snapshot.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct SnapshotCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for SnapshotCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith snapshots")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("create").arg(Arg::with_name("pgdata").required(true)))
|
||||
.subcommand(App::new("destroy"))
|
||||
.subcommand(App::new("start"))
|
||||
.subcommand(App::new("stop"))
|
||||
.subcommand(App::new("show"))
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run SnapshotCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
25
pageserver/src/bin/cli/storage.rs
Normal file
25
pageserver/src/bin/cli/storage.rs
Normal file
@@ -0,0 +1,25 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct StorageCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for StorageCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith storage nodes")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("attach"))
|
||||
.subcommand(App::new("detach"))
|
||||
.subcommand(App::new("show"))
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run StorageCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
29
pageserver/src/bin/cli/subcommand.rs
Normal file
29
pageserver/src/bin/cli/subcommand.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use anyhow::Result;
|
||||
|
||||
/// All subcommands need to implement this interface.
|
||||
pub trait SubCommand {
|
||||
/// Generates the cli-config that Clap requires for the subcommand.
|
||||
fn gen_clap_command(&self) -> clap::App;
|
||||
|
||||
/// Runs the body of the subcommand.
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()>;
|
||||
}
|
||||
|
||||
/// A struct which holds a vector of heap-allocated `Box`es of trait objects all of which must
|
||||
/// implement the `SubCommand` trait, but other than that, can be of any type.
|
||||
pub struct ClapCommands {
|
||||
pub commands: Vec<Box<dyn SubCommand>>,
|
||||
}
|
||||
|
||||
impl ClapCommands {
|
||||
/// Generates a vector of `clap::Apps` that can be passed into clap's `.subcommands()` method in
|
||||
/// order to generate the full CLI.
|
||||
pub fn generate(&self) -> Vec<clap::App> {
|
||||
let mut v: Vec<clap::App> = Vec::new();
|
||||
|
||||
for command in self.commands.iter() {
|
||||
v.push(command.gen_clap_command());
|
||||
}
|
||||
v
|
||||
}
|
||||
}
|
||||
@@ -3,71 +3,73 @@
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use parse_duration::parse;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{fs::File, fs::OpenOptions, str::FromStr};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
use slog_scope;
|
||||
use slog_stdlog;
|
||||
|
||||
use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf};
|
||||
use pageserver::page_service;
|
||||
use pageserver::restore_s3;
|
||||
use pageserver::tui;
|
||||
use pageserver::walreceiver;
|
||||
use pageserver::PageServerConf;
|
||||
|
||||
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
const DEFAULT_GC_PERIOD_SEC: u64 = 10;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
fn main() -> Result<(), io::Error> {
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("interactive")
|
||||
.short("i")
|
||||
.long("interactive")
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_horizon")
|
||||
.long("gc_horizon")
|
||||
.takes_value(true)
|
||||
.help("Distance from current LSN to perform all wal records cleanup"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_period")
|
||||
.long("gc_period")
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(Arg::with_name("datadir")
|
||||
.short("D")
|
||||
.long("dir")
|
||||
.takes_value(true)
|
||||
.help("Path to the page server data directory"))
|
||||
.arg(Arg::with_name("wal_producer")
|
||||
.short("w")
|
||||
.long("wal-producer")
|
||||
.takes_value(true)
|
||||
.help("connect to the WAL sender (postgres or wal_acceptor) on connstr (default: 'host=127.0.0.1 port=65432 user=zenith')"))
|
||||
.arg(Arg::with_name("listen")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"))
|
||||
.arg(Arg::with_name("interactive")
|
||||
.short("i")
|
||||
.long("interactive")
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"))
|
||||
.arg(Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"))
|
||||
.arg(Arg::with_name("skip_recovery")
|
||||
.long("skip-recovery")
|
||||
.takes_value(false)
|
||||
.help("Skip S3 recovery procedy and start empty"))
|
||||
.get_matches();
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
data_dir: PathBuf::from("./"),
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
|
||||
wal_producer_connstr: None,
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
skip_recovery: false,
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
conf.data_dir = PathBuf::from(dir);
|
||||
}
|
||||
|
||||
if arg_matches.is_present("daemonize") {
|
||||
conf.daemonize = true;
|
||||
}
|
||||
@@ -77,29 +79,31 @@ fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
if conf.daemonize && conf.interactive {
|
||||
eprintln!("--daemonize is not allowed with --interactive: choose one");
|
||||
exit(1);
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"--daemonize is not allowed with --interactive: choose one",
|
||||
));
|
||||
}
|
||||
|
||||
if arg_matches.is_present("skip_recovery") {
|
||||
conf.skip_recovery = true;
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("wal_producer") {
|
||||
conf.wal_producer_connstr = Some(String::from_str(addr).unwrap());
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("listen") {
|
||||
conf.listen_addr = addr.parse()?;
|
||||
conf.listen_addr = addr.parse().unwrap();
|
||||
}
|
||||
|
||||
if let Some(horizon) = arg_matches.value_of("gc_horizon") {
|
||||
conf.gc_horizon = horizon.parse()?;
|
||||
}
|
||||
|
||||
if let Some(period) = arg_matches.value_of("gc_period") {
|
||||
conf.gc_period = parse(period)?;
|
||||
}
|
||||
|
||||
start_pageserver(&conf)
|
||||
start_pageserver(conf)
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> {
|
||||
// Initialize logger
|
||||
let _scope_guard = init_logging(&conf)?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
let _scope_guard = init_logging(&conf);
|
||||
let _log_guard = slog_stdlog::init().unwrap();
|
||||
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
@@ -123,25 +127,22 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
let repodir = PathBuf::from(zenith_repo_dir());
|
||||
|
||||
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let log_filename = repodir.join("pageserver.log");
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
.open(conf.data_dir.join("pageserver.log"))
|
||||
.unwrap();
|
||||
let stderr = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
.open(conf.data_dir.join("pageserver.log"))
|
||||
.unwrap();
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file(repodir.join("pageserver.pid"))
|
||||
.working_directory(repodir)
|
||||
.pid_file(conf.data_dir.join("pageserver.pid"))
|
||||
.working_directory(conf.data_dir.clone())
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
@@ -149,44 +150,65 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(e) => error!("Error, {}", e),
|
||||
}
|
||||
} else {
|
||||
// change into the repository directory. In daemon mode, Daemonize
|
||||
// does this for us.
|
||||
let repodir = zenith_repo_dir();
|
||||
std::env::set_current_dir(&repodir)?;
|
||||
info!("Changed current directory to repository in {:?}", &repodir);
|
||||
}
|
||||
|
||||
let mut threads = Vec::new();
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
info!("starting...");
|
||||
|
||||
// Before opening up for connections, restore the latest base backup from S3.
|
||||
// (We don't persist anything to local disk at the moment, so we need to do
|
||||
// this at every startup)
|
||||
// TODO move it to a separate function
|
||||
if !conf.skip_recovery {
|
||||
restore_s3::restore_main(&conf);
|
||||
}
|
||||
|
||||
// Create directory for wal-redo datadirs
|
||||
match fs::create_dir("wal-redo") {
|
||||
match fs::create_dir(conf.data_dir.join("wal-redo")) {
|
||||
Ok(_) => {}
|
||||
Err(e) => match e.kind() {
|
||||
io::ErrorKind::AlreadyExists => {}
|
||||
_ => {
|
||||
anyhow::bail!("Failed to create wal-redo data directory: {}", e);
|
||||
panic!("Failed to create wal-redo data directory: {}", e);
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
// Launch the WAL receiver thread if pageserver was started with --wal-producer
|
||||
// option. It will try to connect to the WAL safekeeper, and stream the WAL. If
|
||||
// the connection is lost, it will reconnect on its own. We just fire and forget
|
||||
// it here.
|
||||
//
|
||||
// All other wal receivers are started on demand by "callmemaybe" command
|
||||
// sent to pageserver.
|
||||
let conf_copy = conf.clone();
|
||||
if let Some(wal_producer) = conf.wal_producer_connstr {
|
||||
let conf = conf_copy.clone();
|
||||
let walreceiver_thread = thread::Builder::new()
|
||||
.name("static WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
walreceiver::thread_main(conf, &wal_producer);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(walreceiver_thread);
|
||||
}
|
||||
|
||||
// GetPage@LSN requests are served by another thread. (It uses async I/O,
|
||||
// but the code in page_service sets up its own thread pool for that)
|
||||
let conf_copy = conf.clone();
|
||||
let conf = conf_copy.clone();
|
||||
let page_server_thread = thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
page_service::thread_main(&conf_copy);
|
||||
page_service::thread_main(conf);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(page_server_thread);
|
||||
|
||||
if let Some(tui_thread) = tui_thread {
|
||||
if tui_thread.is_some() {
|
||||
// The TUI thread exits when the user asks to Quit.
|
||||
tui_thread.join().unwrap();
|
||||
tui_thread.unwrap().join().unwrap();
|
||||
} else {
|
||||
// In non-interactive mode, wait forever.
|
||||
for t in threads {
|
||||
@@ -196,31 +218,23 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard {
|
||||
if conf.interactive {
|
||||
Ok(tui::init_logging())
|
||||
tui::init_logging()
|
||||
} else if conf.daemonize {
|
||||
let log = zenith_repo_dir().join("pageserver.log");
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log)
|
||||
.map_err(|err| {
|
||||
// We failed to initialize logging, so we can't log this message with error!
|
||||
eprintln!("Could not create log file {:?}: {}", log, err);
|
||||
err
|
||||
})?;
|
||||
let log = conf.data_dir.join("pageserver.log");
|
||||
let log_file = File::create(log).unwrap_or_else(|_| panic!("Could not create log file"));
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
if record.level().is_at_least(slog::Level::Debug) {
|
||||
if record.level().is_at_least(slog::Level::Info) {
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
slog_scope::set_global_logger(logger)
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build().fuse();
|
||||
@@ -234,10 +248,10 @@ fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard,
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
slog_scope::set_global_logger(logger)
|
||||
}
|
||||
}
|
||||
|
||||
218
pageserver/src/controlfile.rs
Normal file
218
pageserver/src/controlfile.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use log::*;
|
||||
|
||||
/// 64-bit WAL record pointer, named after PostgreSQL's `XLogRecPtr`.
type XLogRecPtr = u64;

/// Body of CheckPoint XLOG records.
///
/// This is declared here because we keep a copy of the latest one in
/// pg_control for possible disaster recovery. Changing this struct requires
/// a PG_CONTROL_VERSION bump.
#[repr(C)]
#[derive(Debug, Clone)]
pub struct CheckPoint {
    /// Next RecPtr available when we began to create CheckPoint
    /// (i.e. REDO start point).
    pub redo: XLogRecPtr,
    /// Current TLI.
    pub ThisTimeLineID: u32,
    /// Previous TLI, if this record begins a new timeline
    /// (equals `ThisTimeLineID` otherwise).
    pub PrevTimeLineID: u32,
    /// Current full_page_writes.
    pub fullPageWrites: bool,
    /// Next free transaction ID.
    pub nextXid: u64,
    /// Next free OID.
    pub nextOid: u32,
    /// Next free MultiXactId.
    pub nextMulti: u32,
    /// Next free MultiXact offset.
    pub nextMultiOffset: u32,
    /// Cluster-wide minimum datfrozenxid.
    pub oldestXid: u32,
    /// Database with minimum datfrozenxid.
    pub oldestXidDB: u32,
    /// Cluster-wide minimum datminmxid.
    pub oldestMulti: u32,
    /// Database with minimum datminmxid.
    pub oldestMultiDB: u32,
    /// Time stamp of checkpoint.
    pub time: u64,
    /// Oldest Xid with valid commit timestamp.
    pub oldestCommitTsXid: u32,
    /// Newest Xid with valid commit timestamp.
    pub newestCommitTsXid: u32,

    /// Oldest XID still running.
    ///
    /// This is only needed to initialize hot standby mode from an online
    /// checkpoint, so we only bother calculating this for online checkpoints
    /// and only when wal_level is replica. Otherwise it's set to
    /// InvalidTransactionId.
    pub oldestActiveXid: u32,
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ControlFileDataZenith {
|
||||
pub system_identifier: u64,
|
||||
pg_control_version: u32, /* PG_CONTROL_VERSION */
|
||||
catalog_version_no: u32, /* see catversion.h */
|
||||
|
||||
state: i32, /* see enum above */
|
||||
time: i64, /* time stamp of last pg_control update */
|
||||
pub checkPoint: XLogRecPtr,
|
||||
checkPointCopy: CheckPoint, /* copy of last check point record */
|
||||
unloggedLSN: XLogRecPtr, /* current fake LSN value, for unlogged rels */
|
||||
minRecoveryPoint: XLogRecPtr,
|
||||
minRecoveryPointTLI: u32,
|
||||
backupStartPoint: XLogRecPtr,
|
||||
backupEndPoint: XLogRecPtr,
|
||||
backupEndRequired: bool,
|
||||
}
|
||||
|
||||
impl ControlFileDataZenith {
|
||||
pub fn new() -> ControlFileDataZenith {
|
||||
ControlFileDataZenith {
|
||||
system_identifier: 0,
|
||||
pg_control_version: 0,
|
||||
catalog_version_no: 0,
|
||||
state: 0,
|
||||
time: 0,
|
||||
checkPoint: 0,
|
||||
checkPointCopy: {
|
||||
CheckPoint {
|
||||
redo: 0,
|
||||
ThisTimeLineID: 0,
|
||||
PrevTimeLineID: 0,
|
||||
fullPageWrites: false,
|
||||
nextXid: 0,
|
||||
nextOid: 0,
|
||||
nextMulti: 0,
|
||||
nextMultiOffset: 0,
|
||||
oldestXid: 0,
|
||||
oldestXidDB: 0,
|
||||
oldestMulti: 0,
|
||||
oldestMultiDB: 0,
|
||||
time: 0,
|
||||
oldestCommitTsXid: 0,
|
||||
newestCommitTsXid: 0,
|
||||
oldestActiveXid: 0,
|
||||
}
|
||||
},
|
||||
unloggedLSN: 0,
|
||||
minRecoveryPoint: 0,
|
||||
minRecoveryPointTLI: 0,
|
||||
backupStartPoint: 0,
|
||||
backupEndPoint: 0,
|
||||
backupEndRequired: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_pg_control(mut buf: Bytes) -> ControlFileDataZenith {
|
||||
info!("decode pg_control");
|
||||
|
||||
let controlfile: ControlFileDataZenith = ControlFileDataZenith {
|
||||
system_identifier: buf.get_u64_le(),
|
||||
pg_control_version: buf.get_u32_le(),
|
||||
catalog_version_no: buf.get_u32_le(),
|
||||
state: buf.get_i32_le(),
|
||||
time: {
|
||||
buf.advance(4);
|
||||
buf.get_i64_le()
|
||||
},
|
||||
checkPoint: buf.get_u64_le(),
|
||||
checkPointCopy: {
|
||||
CheckPoint {
|
||||
redo: buf.get_u64_le(),
|
||||
ThisTimeLineID: buf.get_u32_le(),
|
||||
PrevTimeLineID: buf.get_u32_le(),
|
||||
fullPageWrites: buf.get_u8() != 0,
|
||||
nextXid: {
|
||||
buf.advance(7);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
nextOid: buf.get_u32_le(),
|
||||
nextMulti: buf.get_u32_le(),
|
||||
nextMultiOffset: buf.get_u32_le(),
|
||||
oldestXid: buf.get_u32_le(),
|
||||
oldestXidDB: buf.get_u32_le(),
|
||||
oldestMulti: buf.get_u32_le(),
|
||||
oldestMultiDB: buf.get_u32_le(),
|
||||
time: {
|
||||
buf.advance(4);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
oldestCommitTsXid: buf.get_u32_le(),
|
||||
newestCommitTsXid: buf.get_u32_le(),
|
||||
oldestActiveXid: buf.get_u32_le(),
|
||||
}
|
||||
},
|
||||
unloggedLSN: buf.get_u64_le(),
|
||||
minRecoveryPoint: buf.get_u64_le(),
|
||||
minRecoveryPointTLI: buf.get_u32_le(),
|
||||
backupStartPoint: {
|
||||
buf.advance(4);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
backupEndPoint: buf.get_u64_le(),
|
||||
backupEndRequired: buf.get_u8() != 0,
|
||||
};
|
||||
|
||||
return controlfile;
|
||||
}
|
||||
|
||||
pub fn parse_controlfile(b: Bytes) {
|
||||
let controlfile = decode_pg_control(b);
|
||||
|
||||
info!(
|
||||
"controlfile {:X}/{:X}",
|
||||
controlfile.checkPoint >> 32,
|
||||
controlfile.checkPoint
|
||||
);
|
||||
info!("controlfile {:?}", controlfile);
|
||||
}
|
||||
|
||||
/// Capacity of the mapping table in a relation map file.
const MAX_MAPPINGS: usize = 62;

/// One entry of the relation map.
#[derive(Debug)]
struct RelMapping {
    /// OID of a catalog.
    mapoid: u32,
    /// Its filenode number.
    mapfilenode: u32,
}

/// Raw contents of a relation map file.
///
/// The mapping table is kept as undecoded bytes: `MAX_MAPPINGS` slots of
/// 8 bytes each.
#[derive(Debug)]
pub struct RelMapFile {
    /// Always RELMAPPER_FILEMAGIC.
    magic: i32,
    /// Number of valid RelMapping entries.
    num_mappings: i32,
    mappings: [u8; MAX_MAPPINGS * 8],
    /// CRC of all above.
    crc: u32,
    /// To make the struct size be 512 exactly.
    pad: i32,
}
|
||||
|
||||
pub fn decode_filemapping(mut buf: Bytes) -> RelMapFile {
|
||||
info!("decode filemap");
|
||||
|
||||
let file: RelMapFile = RelMapFile {
|
||||
magic: buf.get_i32_le(), /* always RELMAPPER_FILEMAGIC */
|
||||
num_mappings: buf.get_i32_le(), /* number of valid RelMapping entries */
|
||||
mappings: {
|
||||
let mut arr = [0 as u8; MAX_MAPPINGS * 8];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
arr
|
||||
},
|
||||
crc: buf.get_u32_le(), /* CRC of all above */
|
||||
pad: buf.get_i32_le(),
|
||||
};
|
||||
|
||||
info!("decode filemap {:?}", file);
|
||||
file
|
||||
}
|
||||
|
||||
pub fn write_buf_to_file(filepath: String, buf: Bytes, blkno: u32) {
|
||||
info!("write_buf_to_file {}", filepath.clone());
|
||||
|
||||
let mut buffer = File::create(filepath.clone()).unwrap();
|
||||
buffer.seek(SeekFrom::Start(8192 * blkno as u64)).unwrap();
|
||||
|
||||
buffer.write_all(&buf).unwrap();
|
||||
|
||||
info!("DONE write_buf_to_file {}", filepath);
|
||||
}
|
||||
@@ -1,14 +1,12 @@
|
||||
use std::fmt;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod controlfile;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
#[allow(dead_code)]
|
||||
pub mod pg_constants;
|
||||
pub mod restore_local_repo;
|
||||
pub mod restore_s3;
|
||||
pub mod tui;
|
||||
pub mod tui_event;
|
||||
mod tui_logger;
|
||||
@@ -16,80 +14,13 @@ pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PageServerConf {
|
||||
pub data_dir: PathBuf,
|
||||
pub daemonize: bool,
|
||||
pub interactive: bool,
|
||||
pub wal_producer_connstr: Option<String>,
|
||||
pub listen_addr: SocketAddr,
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
}
|
||||
|
||||
/// Zenith Timeline ID is a 128-bit random ID.
|
||||
///
|
||||
/// Zenith timeline IDs are different from PostgreSQL timeline
|
||||
/// IDs. They serve a similar purpose though: they differentiate
|
||||
/// between different "histories" of the same cluster. However,
|
||||
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
||||
/// 32-bits wide, and they must be in ascending order in any given
|
||||
/// timeline history. Those limitations mean that we cannot generate a
|
||||
/// new PostgreSQL timeline ID by just generating a random number. And
|
||||
/// that in turn is problematic for the "pull/push" workflow, where you
|
||||
/// have a local copy of a zenith repository, and you periodically sync
|
||||
/// the local changes with a remote server. When you work "detached"
|
||||
/// from the remote server, you cannot create a PostgreSQL timeline ID
|
||||
/// that's guaranteed to be different from all existing timelines in
|
||||
/// the remote server. For example, if two people are having a clone of
|
||||
/// the repository on their laptops, and they both create a new branch
|
||||
/// with different name. What timeline ID would they assign to their
|
||||
/// branches? If they pick the same one, and later try to push the
|
||||
/// branches to the same remote server, they will get mixed up.
|
||||
///
|
||||
/// To avoid those issues, Zenith has its own concept of timelines that
|
||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||
/// is usually printed out as a hex string.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct ZTimelineId([u8; 16]);
|
||||
|
||||
impl FromStr for ZTimelineId {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<ZTimelineId, Self::Err> {
|
||||
let timelineid = hex::decode(s)?;
|
||||
|
||||
let mut buf: [u8; 16] = [0u8; 16];
|
||||
buf.copy_from_slice(timelineid.as_slice());
|
||||
Ok(ZTimelineId(buf))
|
||||
}
|
||||
}
|
||||
|
||||
impl ZTimelineId {
|
||||
pub fn from(b: [u8; 16]) -> ZTimelineId {
|
||||
ZTimelineId(b)
|
||||
}
|
||||
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZTimelineId {
|
||||
let mut arr = [0u8; 16];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
ZTimelineId::from(arr)
|
||||
}
|
||||
|
||||
pub fn as_arr(&self) -> [u8; 16] {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ZTimelineId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(&hex::encode(self.0))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn zenith_repo_dir() -> PathBuf {
|
||||
// Find repository path
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
pub skip_recovery: bool,
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,52 +7,43 @@
|
||||
// *status* -- show actual info about this pageserver,
|
||||
// *pagestream* -- enter mode where smgr and pageserver talk with their
|
||||
// custom protocol.
|
||||
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
|
||||
// *callmemaybe $url* -- ask pageserver to start walreceiver on $url
|
||||
//
|
||||
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::io;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio::runtime;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::task;
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::page_cache;
|
||||
use crate::restore_local_repo;
|
||||
use crate::walreceiver;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
|
||||
use crate::controlfile;
|
||||
|
||||
type Result<T> = std::result::Result<T, io::Error>;
|
||||
|
||||
#[derive(Debug)]
|
||||
enum FeMessage {
|
||||
StartupMessage(FeStartupMessage),
|
||||
Query(FeQueryMessage), // Simple query
|
||||
Parse(FeParseMessage), // Extended query protocol
|
||||
Describe(FeDescribeMessage),
|
||||
Bind(FeBindMessage),
|
||||
Execute(FeExecuteMessage),
|
||||
Close(FeCloseMessage),
|
||||
Sync,
|
||||
Query(FeQueryMessage),
|
||||
Terminate,
|
||||
|
||||
//
|
||||
// All that messages are actually CopyData from libpq point of view.
|
||||
//
|
||||
ZenithExistsRequest(ZenithRequest),
|
||||
ZenithTruncRequest(ZenithRequest),
|
||||
ZenithUnlinkRequest(ZenithRequest),
|
||||
ZenithNblocksRequest(ZenithRequest),
|
||||
ZenithReadRequest(ZenithRequest),
|
||||
ZenithCreateRequest(ZenithRequest),
|
||||
ZenithExtendRequest(ZenithRequest),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -60,14 +51,8 @@ enum BeMessage {
|
||||
AuthenticationOk,
|
||||
ReadyForQuery,
|
||||
RowDescription,
|
||||
ParseComplete,
|
||||
ParameterDescription,
|
||||
NoData,
|
||||
BindComplete,
|
||||
CloseComplete,
|
||||
DataRow,
|
||||
CommandComplete,
|
||||
ControlFile,
|
||||
|
||||
//
|
||||
// All that messages are actually CopyData from libpq point of view.
|
||||
@@ -161,171 +146,6 @@ struct FeQueryMessage {
|
||||
body: Bytes,
|
||||
}
|
||||
|
||||
// We only support the simple case of Parse on unnamed prepared statement and
|
||||
// no params
|
||||
#[derive(Debug)]
|
||||
struct FeParseMessage {
|
||||
query_string: Bytes,
|
||||
}
|
||||
|
||||
fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes> {
|
||||
let mut result = BytesMut::new();
|
||||
|
||||
loop {
|
||||
if !buf.has_remaining() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"no null-terminator in string",
|
||||
));
|
||||
}
|
||||
|
||||
let byte = buf.get_u8();
|
||||
|
||||
if byte == 0 {
|
||||
break;
|
||||
}
|
||||
result.put_u8(byte);
|
||||
}
|
||||
Ok(result.freeze())
|
||||
}
|
||||
|
||||
impl FeParseMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
let query_string = read_null_terminated(&mut buf)?;
|
||||
let nparams = buf.get_i16();
|
||||
|
||||
// FIXME: the rust-postgres driver uses a named prepared statement
|
||||
// for copy_out(). We're not prepared to handle that correctly. For
|
||||
// now, just ignore the statement name, assuming that the client never
|
||||
// uses more than one prepared statement at a time.
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented in Parse",
|
||||
));
|
||||
}
|
||||
*/
|
||||
|
||||
if nparams != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"query params not implemented",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Parse(FeParseMessage { query_string }))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FeDescribeMessage {
|
||||
kind: u8, // 'S' to describe a prepared statement; or 'P' to describe a portal.
|
||||
// we only support unnamed prepared stmt or portal
|
||||
}
|
||||
|
||||
impl FeDescribeMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
let kind = buf.get_u8();
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented in Describe",
|
||||
));
|
||||
}
|
||||
*/
|
||||
|
||||
if kind != b'S' {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"only prepared statmement Describe is implemented",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Describe(FeDescribeMessage { kind }))
|
||||
}
|
||||
}
|
||||
|
||||
// we only support unnamed prepared stmt or portal
|
||||
#[derive(Debug)]
|
||||
struct FeExecuteMessage {
|
||||
/// max # of rows
|
||||
maxrows: i32,
|
||||
}
|
||||
|
||||
impl FeExecuteMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let maxrows = buf.get_i32();
|
||||
|
||||
if !portal_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named portals not implemented",
|
||||
));
|
||||
}
|
||||
|
||||
if maxrows != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"row limit in Execute message not supported",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
|
||||
}
|
||||
}
|
||||
|
||||
// we only support unnamed prepared stmt and portal
|
||||
#[derive(Debug)]
|
||||
struct FeBindMessage {}
|
||||
|
||||
impl FeBindMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
if !portal_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named portals not implemented",
|
||||
));
|
||||
}
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented",
|
||||
));
|
||||
}
|
||||
*/
|
||||
|
||||
Ok(FeMessage::Bind(FeBindMessage {}))
|
||||
}
|
||||
}
|
||||
|
||||
// we only support unnamed prepared stmt and portal
|
||||
#[derive(Debug)]
|
||||
struct FeCloseMessage {}
|
||||
|
||||
impl FeCloseMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
let _kind = buf.get_u8();
|
||||
let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
// FIXME: we do nothing with Close
|
||||
|
||||
Ok(FeMessage::Close(FeCloseMessage {}))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeMessage {
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
|
||||
if buf.len() < 5 {
|
||||
@@ -354,16 +174,10 @@ impl FeMessage {
|
||||
let mut body = buf.split_to(total_len);
|
||||
body.advance(5);
|
||||
|
||||
let mut body = body.freeze();
|
||||
|
||||
match tag {
|
||||
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))),
|
||||
b'P' => Ok(Some(FeParseMessage::parse(body)?)),
|
||||
b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
|
||||
b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
|
||||
b'B' => Ok(Some(FeBindMessage::parse(body)?)),
|
||||
b'C' => Ok(Some(FeCloseMessage::parse(body)?)),
|
||||
b'S' => Ok(Some(FeMessage::Sync)),
|
||||
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage {
|
||||
body: body.freeze(),
|
||||
}))),
|
||||
b'X' => Ok(Some(FeMessage::Terminate)),
|
||||
b'd' => {
|
||||
let smgr_tag = body.get_u8();
|
||||
@@ -380,8 +194,12 @@ impl FeMessage {
|
||||
// serialization.
|
||||
match smgr_tag {
|
||||
0 => Ok(Some(FeMessage::ZenithExistsRequest(zreq))),
|
||||
1 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))),
|
||||
2 => Ok(Some(FeMessage::ZenithReadRequest(zreq))),
|
||||
1 => Ok(Some(FeMessage::ZenithTruncRequest(zreq))),
|
||||
2 => Ok(Some(FeMessage::ZenithUnlinkRequest(zreq))),
|
||||
3 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))),
|
||||
4 => Ok(Some(FeMessage::ZenithReadRequest(zreq))),
|
||||
5 => Ok(Some(FeMessage::ZenithCreateRequest(zreq))),
|
||||
6 => Ok(Some(FeMessage::ZenithExtendRequest(zreq))),
|
||||
_ => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("unknown smgr message tag: {},'{:?}'", smgr_tag, buf),
|
||||
@@ -398,34 +216,26 @@ impl FeMessage {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub fn thread_main(conf: &PageServerConf) {
|
||||
pub fn thread_main(conf: PageServerConf) {
|
||||
// Create a new thread pool
|
||||
//
|
||||
// FIXME: It would be nice to keep this single-threaded for debugging purposes,
|
||||
// but that currently leads to a deadlock: if a GetPage@LSN request arrives
|
||||
// for an LSN that hasn't been received yet, the thread gets stuck waiting for
|
||||
// the WAL to arrive. If the WAL receiver hasn't been launched yet, i.e
|
||||
// we haven't received a "callmemaybe" request yet to tell us where to get the
|
||||
// WAL, we will not have a thread available to process the "callmemaybe"
|
||||
// request when it does arrive. Using a thread pool alleviates the problem so
|
||||
// that it doesn't happen in the tests anymore, but in principle it could still
|
||||
// happen if we receive enough GetPage@LSN requests to consume all of the
|
||||
// available threads.
|
||||
//let runtime = runtime::Builder::new_current_thread().enable_all().build().unwrap();
|
||||
let runtime = runtime::Runtime::new().unwrap();
|
||||
// FIXME: keep it single-threaded for now, make it easier to debug with gdb,
|
||||
// and we're not concerned with performance yet.
|
||||
//let runtime = runtime::Runtime::new().unwrap();
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
info!("Starting page server on {}", conf.listen_addr);
|
||||
|
||||
let runtime_ref = Arc::new(runtime);
|
||||
|
||||
runtime_ref.block_on(async {
|
||||
runtime.block_on(async {
|
||||
let listener = TcpListener::bind(conf.listen_addr).await.unwrap();
|
||||
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().await.unwrap();
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
let mut conn_handler = Connection::new(conf.clone(), socket, &runtime_ref);
|
||||
let mut conn_handler = Connection::new(conf.clone(), socket);
|
||||
|
||||
task::spawn(async move {
|
||||
if let Err(err) = conn_handler.run().await {
|
||||
@@ -442,17 +252,15 @@ struct Connection {
|
||||
buffer: BytesMut,
|
||||
init_done: bool,
|
||||
conf: PageServerConf,
|
||||
runtime: Arc<Runtime>,
|
||||
}
|
||||
|
||||
impl Connection {
|
||||
pub fn new(conf: PageServerConf, socket: TcpStream, runtime: &Arc<Runtime>) -> Connection {
|
||||
pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection {
|
||||
Connection {
|
||||
stream: BufWriter::new(socket),
|
||||
buffer: BytesMut::with_capacity(10 * 1024),
|
||||
init_done: false,
|
||||
conf,
|
||||
runtime: Arc::clone(runtime),
|
||||
conf: conf,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -500,36 +308,9 @@ impl Connection {
|
||||
self.stream.write_u8(b'I').await?;
|
||||
}
|
||||
|
||||
BeMessage::ParseComplete => {
|
||||
self.stream.write_u8(b'1').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::BindComplete => {
|
||||
self.stream.write_u8(b'2').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::CloseComplete => {
|
||||
self.stream.write_u8(b'3').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::NoData => {
|
||||
self.stream.write_u8(b'n').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::ParameterDescription => {
|
||||
self.stream.write_u8(b't').await?;
|
||||
self.stream.write_i32(6).await?;
|
||||
// we don't support params, so always 0
|
||||
self.stream.write_i16(0).await?;
|
||||
}
|
||||
|
||||
BeMessage::RowDescription => {
|
||||
// XXX
|
||||
let b = Bytes::from("data\0");
|
||||
let mut b = Bytes::from("data\0");
|
||||
|
||||
self.stream.write_u8(b'T').await?;
|
||||
self.stream
|
||||
@@ -537,7 +318,7 @@ impl Connection {
|
||||
.await?;
|
||||
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_all(&b).await?;
|
||||
self.stream.write_buf(&mut b).await?;
|
||||
self.stream.write_i32(0).await?; /* table oid */
|
||||
self.stream.write_i16(0).await?; /* attnum */
|
||||
self.stream.write_i32(25).await?; /* TEXTOID */
|
||||
@@ -549,34 +330,22 @@ impl Connection {
|
||||
// XXX: accept some text data
|
||||
BeMessage::DataRow => {
|
||||
// XXX
|
||||
let b = Bytes::from("hello world");
|
||||
let mut b = Bytes::from("hello world");
|
||||
|
||||
self.stream.write_u8(b'D').await?;
|
||||
self.stream.write_i32(4 + 2 + 4 + b.len() as i32).await?;
|
||||
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_i32(b.len() as i32).await?;
|
||||
self.stream.write_all(&b).await?;
|
||||
}
|
||||
|
||||
BeMessage::ControlFile => {
|
||||
// TODO pass checkpoint and xid info in this message
|
||||
let b = Bytes::from("hello pg_control");
|
||||
|
||||
self.stream.write_u8(b'D').await?;
|
||||
self.stream.write_i32(4 + 2 + 4 + b.len() as i32).await?;
|
||||
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_i32(b.len() as i32).await?;
|
||||
self.stream.write_all(&b).await?;
|
||||
self.stream.write_buf(&mut b).await?;
|
||||
}
|
||||
|
||||
BeMessage::CommandComplete => {
|
||||
let b = Bytes::from("SELECT 1\0");
|
||||
let mut b = Bytes::from("SELECT 1\0");
|
||||
|
||||
self.stream.write_u8(b'C').await?;
|
||||
self.stream.write_i32(4 + b.len() as i32).await?;
|
||||
self.stream.write_all(&b).await?;
|
||||
self.stream.write_buf(&mut b).await?;
|
||||
}
|
||||
|
||||
BeMessage::ZenithStatusResponse(resp) => {
|
||||
@@ -603,7 +372,7 @@ impl Connection {
|
||||
self.stream.write_u8(102).await?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8).await?;
|
||||
self.stream.write_u32(resp.n_blocks).await?;
|
||||
self.stream.write_all(&resp.page.clone()).await?;
|
||||
self.stream.write_buf(&mut resp.page.clone()).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -616,18 +385,15 @@ impl Connection {
|
||||
}
|
||||
|
||||
async fn run(&mut self) -> Result<()> {
|
||||
let mut unnamed_query_string = Bytes::new();
|
||||
loop {
|
||||
let msg = self.read_message().await?;
|
||||
trace!("got message {:?}", msg);
|
||||
match msg {
|
||||
match self.read_message().await? {
|
||||
Some(FeMessage::StartupMessage(m)) => {
|
||||
trace!("got message {:?}", m);
|
||||
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
|
||||
let b = Bytes::from("N");
|
||||
self.stream.write_all(&b).await?;
|
||||
let mut b = Bytes::from("N");
|
||||
self.stream.write_buf(&mut b).await?;
|
||||
self.stream.flush().await?;
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
@@ -640,28 +406,7 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
Some(FeMessage::Query(m)) => {
|
||||
self.process_query(m.body).await?;
|
||||
}
|
||||
Some(FeMessage::Parse(m)) => {
|
||||
unnamed_query_string = m.query_string;
|
||||
self.write_message(&BeMessage::ParseComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Describe(_)) => {
|
||||
self.write_message_noflush(&BeMessage::ParameterDescription)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::NoData).await?;
|
||||
}
|
||||
Some(FeMessage::Bind(_)) => {
|
||||
self.write_message(&BeMessage::BindComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Close(_)) => {
|
||||
self.write_message(&BeMessage::CloseComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Execute(_)) => {
|
||||
self.process_query(unnamed_query_string.clone()).await?;
|
||||
}
|
||||
Some(FeMessage::Sync) => {
|
||||
self.write_message(&BeMessage::ReadyForQuery).await?;
|
||||
self.process_query(&m).await?;
|
||||
}
|
||||
Some(FeMessage::Terminate) => {
|
||||
break;
|
||||
@@ -670,8 +415,7 @@ impl Connection {
|
||||
info!("connection closed");
|
||||
break;
|
||||
}
|
||||
x => {
|
||||
error!("unexpected message type : {:?}", x);
|
||||
_ => {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "unexpected message"));
|
||||
}
|
||||
}
|
||||
@@ -680,62 +424,87 @@ impl Connection {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn process_query(&mut self, query_string: Bytes) -> Result<()> {
|
||||
debug!("process query {:?}", query_string);
|
||||
async fn process_query(&mut self, q: &FeQueryMessage) -> Result<()> {
|
||||
trace!("got query {:?}", q.body);
|
||||
|
||||
// remove null terminator, if any
|
||||
let mut query_string = query_string.clone();
|
||||
if query_string.last() == Some(&0) {
|
||||
query_string.truncate(query_string.len() - 1);
|
||||
}
|
||||
|
||||
if query_string.starts_with(b"controlfile") {
|
||||
self.handle_controlfile().await
|
||||
} else if query_string.starts_with(b"pagestream ") {
|
||||
let (_l, r) = query_string.split_at("pagestream ".len());
|
||||
let timelineid_str = String::from_utf8(r.to_vec()).unwrap();
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
|
||||
|
||||
self.handle_pagerequests(timelineid).await
|
||||
} else if query_string.starts_with(b"basebackup ") {
|
||||
let (_l, r) = query_string.split_at("basebackup ".len());
|
||||
if q.body.starts_with(b"file") {
|
||||
let (_l, r) = q.body.split_at("file ".len());
|
||||
//TODO parse it correctly
|
||||
let r = r.to_vec();
|
||||
let timelineid_str = String::from(String::from_utf8(r).unwrap().trim_end());
|
||||
info!("got basebackup command: \"{}\"", timelineid_str);
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
|
||||
let str = String::from_utf8(r).unwrap().to_string();
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(timelineid).await?;
|
||||
let mut split = str.split(',');
|
||||
let mut s;
|
||||
|
||||
let filepath = split.next().unwrap();
|
||||
let sysid = {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u64>().unwrap()
|
||||
};
|
||||
|
||||
let buf_tag = page_cache::BufferTag {
|
||||
spcnode: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u32>().unwrap()
|
||||
},
|
||||
dbnode: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u32>().unwrap()
|
||||
},
|
||||
relnode: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u32>().unwrap()
|
||||
},
|
||||
forknum: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u8>().unwrap()
|
||||
},
|
||||
blknum: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u32>().unwrap()
|
||||
},
|
||||
};
|
||||
|
||||
//TODO PARSE LSN
|
||||
//let lsn = { s = split.next().unwrap(); s.parse::<u64>().unwrap()};
|
||||
let lsn: u64 = 0;
|
||||
info!(
|
||||
"process file query sysid {} -- {:?} lsn {}",
|
||||
sysid, buf_tag, lsn
|
||||
);
|
||||
|
||||
self.handle_file(filepath.to_string(), sysid, buf_tag, lsn.into())
|
||||
.await
|
||||
} else if q.body.starts_with(b"pagestream ") {
|
||||
let (_l, r) = q.body.split_at("pagestream ".len());
|
||||
let mut r = r.to_vec();
|
||||
r.pop();
|
||||
let sysid = String::from_utf8(r).unwrap().trim().to_string();
|
||||
let sysid: u64 = sysid.parse().unwrap(); // XXX
|
||||
|
||||
self.handle_pagerequests(sysid).await
|
||||
} else if q.body.starts_with(b"callmemaybe ") {
|
||||
let (_l, r) = q.body.split_at("callmemaybe ".len());
|
||||
let mut r = r.to_vec();
|
||||
r.pop();
|
||||
let connstr = String::from_utf8(r).unwrap().trim().to_string();
|
||||
|
||||
let conf_copy = self.conf.clone();
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
walreceiver::thread_main(conf_copy, &connstr);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
// generic ack:
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::DataRow).await?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
} else if query_string.starts_with(b"callmemaybe ") {
|
||||
let query_str = String::from_utf8(query_string.to_vec())
|
||||
.unwrap()
|
||||
.to_string();
|
||||
|
||||
// callmemaybe <zenith timelineid as hex string> <connstr>
|
||||
let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap();
|
||||
let caps = re.captures(&query_str);
|
||||
let caps = caps.unwrap();
|
||||
|
||||
let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||
let connstr: String = String::from(caps.get(2).unwrap().as_str());
|
||||
|
||||
// Check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid)));
|
||||
}
|
||||
|
||||
walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr);
|
||||
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
} else if query_string.starts_with(b"status") {
|
||||
} else if q.body.starts_with(b"status") {
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::DataRow).await?;
|
||||
@@ -752,25 +521,35 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_controlfile(&mut self) -> Result<()> {
|
||||
async fn handle_file(
|
||||
&mut self,
|
||||
filepath: String,
|
||||
sysid: u64,
|
||||
buf_tag: page_cache::BufferTag,
|
||||
lsn: u64,
|
||||
) -> Result<()> {
|
||||
let pcache = page_cache::get_pagecache(self.conf.clone(), sysid);
|
||||
|
||||
match pcache.get_page_at_lsn(buf_tag, lsn) {
|
||||
Ok(p) => {
|
||||
info!("info succeeded get_page_at_lsn: {}", lsn);
|
||||
|
||||
controlfile::write_buf_to_file(filepath, p, buf_tag.blknum);
|
||||
}
|
||||
Err(e) => {
|
||||
info!("page not found and it's ok. get_page_at_lsn: {}", e);
|
||||
}
|
||||
};
|
||||
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::ControlFile).await?;
|
||||
self.write_message_noflush(&BeMessage::DataRow).await?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
}
|
||||
|
||||
async fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
// Check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested pagestream on timeline {} which does not exist in page server", timelineid)));
|
||||
}
|
||||
let pcache = pcache.unwrap();
|
||||
|
||||
async fn handle_pagerequests(&mut self, sysid: u64) -> Result<()> {
|
||||
/* switch client to COPYBOTH */
|
||||
self.stream.write_u8(b'W').await?;
|
||||
self.stream.write_i32(4 + 1 + 2).await?;
|
||||
@@ -778,11 +557,13 @@ impl Connection {
|
||||
self.stream.write_i16(0).await?; /* numAttributes */
|
||||
self.stream.flush().await?;
|
||||
|
||||
let pcache = page_cache::get_pagecache(self.conf.clone(), sysid);
|
||||
|
||||
loop {
|
||||
let message = self.read_message().await?;
|
||||
|
||||
if let Some(m) = &message {
|
||||
trace!("query({:?}): {:?}", timelineid, m);
|
||||
info!("query({}): {:?}", sysid, m);
|
||||
};
|
||||
|
||||
if message.is_none() {
|
||||
@@ -799,7 +580,7 @@ impl Connection {
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
let exist = pcache.relsize_exist(&tag, req.lsn).await.unwrap_or(false);
|
||||
let exist = pcache.relsize_exist(&tag);
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: exist,
|
||||
@@ -807,6 +588,20 @@ impl Connection {
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithTruncRequest(_)) => {
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithUnlinkRequest(_)) => {
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithNblocksRequest(req)) => {
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: req.spcnode,
|
||||
@@ -815,31 +610,45 @@ impl Connection {
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
let n_blocks = pcache.relsize_get(&tag, req.lsn).await.unwrap_or(0);
|
||||
let n_blocks = pcache.relsize_get(&tag);
|
||||
|
||||
trace!("ZenithNblocksRequest {:?} = {}", tag, n_blocks);
|
||||
self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks,
|
||||
n_blocks: n_blocks,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithReadRequest(req)) => {
|
||||
let buf_tag = page_cache::BufferTag {
|
||||
rel: page_cache::RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
},
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
blknum: req.blkno,
|
||||
};
|
||||
|
||||
let msg = match pcache.get_page_at_lsn(buf_tag, req.lsn).await {
|
||||
Ok(p) => BeMessage::ZenithReadResponse(ZenithReadResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
page: p,
|
||||
}),
|
||||
let msg = match pcache.get_page_at_lsn(buf_tag, req.lsn) {
|
||||
Ok(p) => {
|
||||
let mut b = BytesMut::with_capacity(8192);
|
||||
|
||||
trace!("ZenithReadResponse get_page_at_lsn succeed");
|
||||
if p.len() < 8192 {
|
||||
//add padding
|
||||
trace!("ZenithReadResponse add padding");
|
||||
let padding: [u8; 8192 - 512] = [0; 8192 - 512];
|
||||
b.extend_from_slice(&p);
|
||||
b.extend_from_slice(&padding);
|
||||
} else {
|
||||
b.extend_from_slice(&p);
|
||||
}
|
||||
|
||||
BeMessage::ZenithReadResponse(ZenithReadResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
page: b.freeze(),
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
const ZERO_PAGE: [u8; 8192] = [0; 8192];
|
||||
error!("get_page_at_lsn: {}", e);
|
||||
@@ -853,102 +662,43 @@ impl Connection {
|
||||
|
||||
self.write_message(&msg).await?
|
||||
}
|
||||
Some(FeMessage::ZenithCreateRequest(req)) => {
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
trace!("ZenithCreateRequest {:?}", tag);
|
||||
|
||||
pcache.relsize_inc(&tag, None);
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithExtendRequest(req)) => {
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
trace!("ZenithExtendRequest {:?} to {}", tag, req.blkno);
|
||||
|
||||
pcache.relsize_inc(&tag, Some(req.blkno));
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
// check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested basebackup on timeline {} which does not exist in page server", timelineid)));
|
||||
}
|
||||
|
||||
/* switch client to COPYOUT */
|
||||
let stream = &mut self.stream;
|
||||
stream.write_u8(b'H').await?;
|
||||
stream.write_i32(4 + 1 + 2).await?;
|
||||
stream.write_u8(0).await?; /* copy_is_binary */
|
||||
stream.write_i16(0).await?; /* numAttributes */
|
||||
stream.flush().await?;
|
||||
info!("sent CopyOut");
|
||||
|
||||
/* Send a tarball of the latest snapshot on the timeline */
|
||||
|
||||
// find latest snapshot
|
||||
let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
|
||||
|
||||
// Stream it
|
||||
let (s, mut r) = mpsc::channel(5);
|
||||
|
||||
let f_tar = task::spawn_blocking(move || {
|
||||
basebackup::send_snapshot_tarball(&mut CopyDataSink(s), timelineid, snapshotlsn)?;
|
||||
Ok(())
|
||||
});
|
||||
let f_tar2 = async {
|
||||
let joinres = f_tar.await;
|
||||
|
||||
if let Err(joinreserr) = joinres {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, joinreserr));
|
||||
}
|
||||
joinres.unwrap()
|
||||
};
|
||||
|
||||
let f_pump = async move {
|
||||
loop {
|
||||
let buf = r.recv().await;
|
||||
if buf.is_none() {
|
||||
break;
|
||||
}
|
||||
let buf = buf.unwrap();
|
||||
|
||||
// CopyData
|
||||
stream.write_u8(b'd').await?;
|
||||
stream.write_u32((4 + buf.len()) as u32).await?;
|
||||
stream.write_all(&buf).await?;
|
||||
trace!("CopyData sent for {} bytes!", buf.len());
|
||||
|
||||
// FIXME: flush isn't really required, but makes it easier
|
||||
// to view in wireshark
|
||||
stream.flush().await?;
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
|
||||
tokio::try_join!(f_tar2, f_pump)?;
|
||||
|
||||
// CopyDone
|
||||
self.stream.write_u8(b'c').await?;
|
||||
self.stream.write_u32(4).await?;
|
||||
self.stream.flush().await?;
|
||||
debug!("CopyDone sent!");
|
||||
|
||||
// FIXME: I'm getting an error from the tokio copyout driver without this.
|
||||
// I think it happens when the CommandComplete, CloseComplete and ReadyForQuery
|
||||
// are sent in the same TCP packet as the CopyDone. I don't understand why.
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct CopyDataSink(mpsc::Sender<Bytes>);
|
||||
|
||||
impl std::io::Write for CopyDataSink {
|
||||
fn write(&mut self, data: &[u8]) -> std::result::Result<usize, std::io::Error> {
|
||||
let buf = Bytes::copy_from_slice(data);
|
||||
|
||||
if let Err(e) = self.0.blocking_send(buf) {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, e));
|
||||
}
|
||||
|
||||
Ok(data.len())
|
||||
}
|
||||
fn flush(&mut self) -> std::result::Result<(), std::io::Error> {
|
||||
// no-op
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,57 +9,3 @@ pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
|
||||
pub const PG_XACT_FORKNUM: u32 = 44;
|
||||
pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
|
||||
pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;
|
||||
|
||||
//
|
||||
// constants from clog.h
|
||||
//
|
||||
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
|
||||
pub const CLOG_BITS_PER_XACT: u8 = 2;
|
||||
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
|
||||
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
|
||||
pub const CLOG_ZEROPAGE: u8 = 0x00;
|
||||
pub const CLOG_TRUNCATE: u8 = 0x10;
|
||||
|
||||
// From xact.h
|
||||
pub const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
pub const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
/*
|
||||
* The following flags, stored in xinfo, determine which information is
|
||||
* contained in commit/abort records.
|
||||
*/
|
||||
pub const XACT_XINFO_HAS_DBINFO: u32 = 1;
|
||||
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 2;
|
||||
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 4;
|
||||
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
pub const RM_XLOG_ID: u8 = 0;
|
||||
pub const RM_XACT_ID: u8 = 1;
|
||||
pub const RM_SMGR_ID: u8 = 2;
|
||||
pub const RM_CLOG_ID: u8 = 3;
|
||||
pub const RM_DBASE_ID: u8 = 4;
|
||||
pub const RM_TBLSPC_ID: u8 = 5;
|
||||
// pub const RM_MULTIXACT_ID:u8 = 6;
|
||||
|
||||
// from xlogreader.h
|
||||
pub const XLR_INFO_MASK: u8 = 0x0F;
|
||||
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
|
||||
|
||||
// from dbcommands_xlog.h
|
||||
pub const XLOG_DBASE_CREATE: u8 = 0x00;
|
||||
pub const XLOG_DBASE_DROP: u8 = 0x10;
|
||||
|
||||
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
|
||||
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
|
||||
|
||||
@@ -1,434 +0,0 @@
|
||||
//
|
||||
// Restore chunks from local Zenith repository
|
||||
//
|
||||
// This runs once at Page Server startup. It loads all the "snapshots" and all
|
||||
// WAL from all timelines from the local zenith repository into the in-memory page
|
||||
// cache.
|
||||
//
|
||||
// This also initializes the "last valid LSN" in the page cache to the last LSN
|
||||
// seen in the WAL, so that when the WAL receiver is started, it starts
|
||||
// streaming from that LSN.
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::fmt;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::error::Error;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::io::Seek;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::page_cache::PageCache;
|
||||
use crate::page_cache::RelTag;
|
||||
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
// FIXME: we'll probably need these elsewhere too, move to some common location
|
||||
const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
//
|
||||
// Load it all into the page cache.
|
||||
//
|
||||
pub fn restore_timeline(
|
||||
conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
) -> Result<()> {
|
||||
let timelinepath = PathBuf::from("timelines").join(timeline.to_string());
|
||||
|
||||
if !timelinepath.exists() {
|
||||
anyhow::bail!("timeline {} does not exist in the page server's repository");
|
||||
}
|
||||
|
||||
// Scan .zenith/timelines/<timeline>/snapshots
|
||||
let snapshotspath = PathBuf::from("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots");
|
||||
|
||||
let mut last_snapshot_lsn: u64 = 0;
|
||||
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let direntry = direntry?;
|
||||
let filename = direntry.file_name().to_str().unwrap().to_owned();
|
||||
|
||||
let lsn = u64::from_str_radix(&filename, 16)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
|
||||
restore_snapshot(conf, pcache, timeline, &filename)?;
|
||||
info!("restored snapshot at {}", filename);
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == 0 {
|
||||
error!(
|
||||
"could not find valid snapshot in {}",
|
||||
snapshotspath.display()
|
||||
);
|
||||
// TODO return error?
|
||||
}
|
||||
pcache.init_valid_lsn(last_snapshot_lsn);
|
||||
|
||||
restore_wal(conf, pcache, timeline, last_snapshot_lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<u64> {
|
||||
let snapshotspath = format!("timelines/{}/snapshots", timeline);
|
||||
|
||||
let mut last_snapshot_lsn = 0;
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned();
|
||||
|
||||
let lsn = u64::from_str_radix(&filename, 16)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == 0 {
|
||||
error!("could not find valid snapshot in {}", &snapshotspath);
|
||||
// TODO return error?
|
||||
}
|
||||
Ok(last_snapshot_lsn)
|
||||
}
|
||||
|
||||
fn restore_snapshot(
|
||||
conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
snapshot: &str,
|
||||
) -> Result<()> {
|
||||
let snapshotpath = PathBuf::from("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots")
|
||||
.join(snapshot);
|
||||
|
||||
// Scan 'global'
|
||||
for direntry in fs::read_dir(snapshotpath.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("pg_control") => continue,
|
||||
Some("pg_filenode.map") => continue,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
conf,
|
||||
pcache,
|
||||
timeline,
|
||||
snapshot,
|
||||
GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
}
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains database dirs, the database OID is the filename.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
for direntry in fs::read_dir(snapshotpath.join("base"))? {
|
||||
let direntry = direntry?;
|
||||
|
||||
let dboid = u32::from_str_radix(direntry.file_name().to_str().unwrap(), 10)?;
|
||||
|
||||
for direntry in fs::read_dir(direntry.path())? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => continue,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
conf,
|
||||
pcache,
|
||||
timeline,
|
||||
snapshot,
|
||||
DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_relfile(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
_timeline: ZTimelineId,
|
||||
snapshot: &str,
|
||||
spcoid: u32,
|
||||
dboid: u32,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = u64::from_str_radix(snapshot, 16)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
if let Err(e) = p {
|
||||
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
let (relnode, forknum, segno) = p.unwrap();
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192);
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode: relnode,
|
||||
forknum: forknum as u8,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Scan WAL on a timeline, starting from gien LSN, and load all the records
|
||||
// into the page cache.
|
||||
fn restore_wal(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
startpoint: u64,
|
||||
) -> Result<()> {
|
||||
let walpath = format!("timelines/{}/wal", timeline);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut segno = XLByteToSeg(startpoint, 16 * 1024 * 1024);
|
||||
let mut offset = XLogSegmentOffset(startpoint, 16 * 1024 * 1024);
|
||||
let mut last_lsn = 0;
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, 16 * 1024 * 1024);
|
||||
let mut path = walpath.clone() + "/" + &filename;
|
||||
|
||||
// It could be as .partial
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path += ".partial";
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let open_result = File::open(&path);
|
||||
if let Err(e) = open_result {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
break;
|
||||
}
|
||||
return Err(e)?;
|
||||
}
|
||||
let mut file = open_result.unwrap();
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
}
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != 16 * 1024 * 1024 - offset as usize {
|
||||
// Maybe allow this for .partial files?
|
||||
error!("read only {} bytes from WAL file", nread);
|
||||
}
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
loop {
|
||||
let rec = waldecoder.poll_decode();
|
||||
if rec.is_err() {
|
||||
// Assume that an error means we've reached the end of
|
||||
// a partial WAL record. So that's ok.
|
||||
break;
|
||||
}
|
||||
if let Some((lsn, recdata)) = rec.unwrap() {
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
// Put the WAL record to the page cache. We make a separate copy of
|
||||
// it for every block it modifies. (The actual WAL record is kept in
|
||||
// a Bytes, which uses a reference counter for the underlying buffer,
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
truncate: false,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_valid_lsn(lsn);
|
||||
last_lsn = lsn;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
nrecords += 1;
|
||||
}
|
||||
|
||||
info!("restored {} records from WAL file {}", nrecords, filename);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
info!(
|
||||
"reached end of WAL at {:X}/{:X}",
|
||||
last_lsn >> 32,
|
||||
last_lsn & 0xffffffff
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl Error for FilePathError {
|
||||
fn description(&self) -> &str {
|
||||
&self.msg
|
||||
}
|
||||
}
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ParsedBaseImageFileName {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u32,
|
||||
pub segno: u32,
|
||||
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(fname)
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
}
|
||||
@@ -22,7 +22,7 @@ use tokio::runtime;
|
||||
|
||||
use futures::future;
|
||||
|
||||
use crate::{page_cache, PageServerConf};
|
||||
use crate::{controlfile, page_cache, pg_constants, PageServerConf};
|
||||
|
||||
struct Storage {
|
||||
region: Region,
|
||||
@@ -38,9 +38,12 @@ pub fn restore_main(conf: &PageServerConf) {
|
||||
let result = restore_chunk(conf).await;
|
||||
|
||||
match result {
|
||||
Ok(_) => {}
|
||||
Ok(_) => {
|
||||
return;
|
||||
}
|
||||
Err(err) => {
|
||||
error!("S3 error: {}", err);
|
||||
return;
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -57,8 +60,8 @@ pub fn restore_main(conf: &PageServerConf) {
|
||||
async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
let backend = Storage {
|
||||
region: Region::Custom {
|
||||
region: env::var("S3_REGION").unwrap(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap(),
|
||||
region: env::var("S3_REGION").unwrap().into(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap().into(),
|
||||
},
|
||||
credentials: Credentials::new(
|
||||
Some(&env::var("S3_ACCESSKEY").unwrap()),
|
||||
@@ -81,8 +84,24 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
.list("relationdata/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
|
||||
// TODO: get that from backup
|
||||
let sys_id: u64 = 42;
|
||||
//Before uploading other files, slurp pg_control to set systemid
|
||||
|
||||
let control_results: Vec<s3::serde_types::ListBucketResult> = bucket
|
||||
.list(
|
||||
"relationdata/global/pg_control".to_string(),
|
||||
Some("".to_string()),
|
||||
)
|
||||
.await?;
|
||||
let object = &(&control_results[0]).contents[0];
|
||||
let (data, _) = bucket.get_object(&object.key).await.unwrap();
|
||||
let bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
let c = controlfile::decode_pg_control(bytes);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), c.system_identifier);
|
||||
pcache.set_controldata(c.clone());
|
||||
trace!("uploaded controlfile {:?}", pcache.get_controldata());
|
||||
|
||||
let sys_id: u64 = c.system_identifier;
|
||||
let mut oldest_lsn = 0;
|
||||
let mut slurp_futures: Vec<_> = Vec::new();
|
||||
|
||||
@@ -116,23 +135,47 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
panic!("no base backup found");
|
||||
}
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
//Now add nonrelation files
|
||||
let nonrelresults: Vec<s3::serde_types::ListBucketResult> = bucket
|
||||
.list("nonreldata/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
for result in nonrelresults {
|
||||
for object in result.contents {
|
||||
// Download needed non relation files, slurping them into memory
|
||||
|
||||
let key = object.key;
|
||||
let relpath = key.strip_prefix("nonreldata/").unwrap();
|
||||
trace!("list nonrelfiles {}", relpath);
|
||||
|
||||
let parsed = parse_nonrel_file_path(&relpath);
|
||||
|
||||
match parsed {
|
||||
Ok(p) => {
|
||||
let b = bucket.clone();
|
||||
let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);
|
||||
|
||||
slurp_futures.push(f);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("unrecognized file: {} ({})", relpath, e);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
pcache.init_valid_lsn(oldest_lsn);
|
||||
|
||||
info!("{} files to restore...", slurp_futures.len());
|
||||
|
||||
future::join_all(slurp_futures).await;
|
||||
info!("restored!");
|
||||
info!(
|
||||
"restored! {:?} to {:?}",
|
||||
pcache.first_valid_lsn, pcache.last_valid_lsn
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
// FIXME: we'll probably need these elsewhere too, move to some common location
|
||||
const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
@@ -182,6 +225,17 @@ struct ParsedBaseImageFileName {
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
fn parse_lsn_from_filename(fname: &str) -> Result<u64, FilePathError> {
|
||||
let (_, lsn_str) = fname.split_at(fname.len() - 16);
|
||||
|
||||
let (lsnhi, lsnlo) = lsn_str.split_at(8);
|
||||
let lsn_hi = u64::from_str_radix(lsnhi, 16)?;
|
||||
let lsn_lo = u64::from_str_radix(lsnlo, 16)?;
|
||||
let lsn = lsn_hi << 32 | lsn_lo;
|
||||
|
||||
return Ok(lsn);
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
@@ -196,7 +250,7 @@ fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode: u32 = relnode_str.parse()?;
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
@@ -210,14 +264,54 @@ fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
segno_match.unwrap().as_str().parse::<u32>()?
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
let lsn_hi: u64 = caps.name("lsnhi").unwrap().as_str().parse()?;
|
||||
let lsn_lo: u64 = caps.name("lsnlo").unwrap().as_str().parse()?;
|
||||
let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
|
||||
let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
|
||||
let lsn = lsn_hi << 32 | lsn_lo;
|
||||
|
||||
Ok((relnode, forknum, segno, lsn))
|
||||
return Ok((relnode, forknum, segno, lsn));
|
||||
}
|
||||
|
||||
fn parse_nonrel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
//TODO parse segno from xact filenames too
|
||||
if let Some(fname) = path.strip_prefix("pg_xact/") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_XACT_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else if let Some(fname) = path.strip_prefix("pg_multixact/offsets") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_MXACT_OFFSETS_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else if let Some(fname) = path.strip_prefix("pg_multixact/members") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_MXACT_MEMBERS_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else {
|
||||
return Err(FilePathError::new("invalid non relation data file name"));
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
@@ -239,22 +333,48 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
if fname.contains("pg_control") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_CONTROLFILE_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
if fname.contains("pg_filenode") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
Ok(ParsedBaseImageFileName {
|
||||
spcnode: GLOBALTABLESPACE_OID,
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
})
|
||||
});
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split("/");
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let dbnode: u32 = dbnode_str.parse()?;
|
||||
let dbnode = u32::from_str_radix(dbnode_str, 10)?;
|
||||
let fname = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
@@ -262,21 +382,34 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
};
|
||||
|
||||
if fname.contains("pg_filenode") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
Ok(ParsedBaseImageFileName {
|
||||
spcnode: DEFAULTTABLESPACE_OID,
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
})
|
||||
});
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
Err(FilePathError::new("tablespaces not supported"))
|
||||
return Err(FilePathError::new("tablespaces not supported"));
|
||||
} else {
|
||||
Err(FilePathError::new("invalid relation data file name"))
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -299,24 +432,55 @@ async fn slurp_base_file(
|
||||
|
||||
let mut bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
rel: page_cache::RelTag {
|
||||
// pg_filenode.map has non-standard size - 512 bytes
|
||||
if parsed.forknum == pg_constants::PG_FILENODEMAP_FORKNUM {
|
||||
let b = bytes.clone();
|
||||
controlfile::decode_filemapping(b);
|
||||
while bytes.remaining() >= 512 {
|
||||
let tag = page_cache::BufferTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
},
|
||||
blknum,
|
||||
blknum: 0,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(512));
|
||||
}
|
||||
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
|
||||
pcache.relsize_inc(&tag, Some(0));
|
||||
} else {
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
|
||||
let reltag = page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
};
|
||||
|
||||
blknum += 1;
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
blknum: blknum,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
|
||||
pcache.relsize_inc(&reltag, Some(blknum));
|
||||
|
||||
blknum += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ use tui::text::{Span, Spans, Text};
|
||||
use tui::widgets::{Block, BorderType, Borders, Paragraph, Widget};
|
||||
use tui::Terminal;
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
|
||||
lazy_static! {
|
||||
@@ -31,7 +32,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -41,7 +42,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -52,7 +53,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -65,7 +66,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -84,14 +85,14 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
slog_scope::set_global_logger(logger)
|
||||
return slog_scope::set_global_logger(logger);
|
||||
}
|
||||
|
||||
pub fn ui_main() -> Result<(), Box<dyn Error>> {
|
||||
pub fn ui_main<'b>() -> Result<(), Box<dyn Error>> {
|
||||
// Terminal initialization
|
||||
let stdout = io::stdout().into_raw_mode()?;
|
||||
let stdout = MouseTerminal::from(stdout);
|
||||
@@ -187,7 +188,6 @@ pub fn ui_main() -> Result<(), Box<dyn Error>> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct LogWidget<'a> {
|
||||
logger: &'a TuiLogger,
|
||||
title: &'a str,
|
||||
@@ -229,7 +229,7 @@ impl<'a> Widget for LogWidget<'a> {
|
||||
// Render a widget to show some metrics
|
||||
struct MetricsWidget {}
|
||||
|
||||
fn get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
fn get_metric_u64<'a>(title: &'a str, value: u64) -> Spans<'a> {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
@@ -240,7 +240,7 @@ fn get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
])
|
||||
}
|
||||
|
||||
fn get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
|
||||
fn get_metric_str<'a>(title: &'a str, value: &'a str) -> Spans<'a> {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::time::Duration;
|
||||
use termion::event::Key;
|
||||
use termion::input::TermRead;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub enum Event<I> {
|
||||
Input(I),
|
||||
Tick,
|
||||
@@ -76,8 +77,8 @@ impl Events {
|
||||
};
|
||||
Events {
|
||||
rx,
|
||||
input_handle,
|
||||
ignore_exit_key,
|
||||
input_handle,
|
||||
tick_handle,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
//
|
||||
use chrono::offset::Local;
|
||||
use chrono::DateTime;
|
||||
use slog;
|
||||
use slog::{Drain, Level, OwnedKVList, Record};
|
||||
use slog_async::AsyncRecord;
|
||||
use std::collections::VecDeque;
|
||||
@@ -51,7 +52,7 @@ impl Drain for TuiLogger {
|
||||
events.pop_back();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,7 +81,7 @@ impl<'b> TuiLoggerWidget<'b> {
|
||||
style_trace: None,
|
||||
style_info: None,
|
||||
show_module: true,
|
||||
logger,
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -167,7 +168,7 @@ impl<'b> Widget for TuiLoggerWidget<'b> {
|
||||
Level::Debug => (self.style_debug, "DEBUG", true),
|
||||
Level::Trace => (self.style_trace, "TRACE", true),
|
||||
};
|
||||
line.push(Span::styled(txt, lvl_style.unwrap_or_default()));
|
||||
line.push(Span::styled(txt, lvl_style.unwrap_or(Style::default())));
|
||||
|
||||
if self.show_module {
|
||||
line.push(Span::raw(" "));
|
||||
|
||||
@@ -1,9 +1,16 @@
|
||||
//#![allow(non_upper_case_globals)]
|
||||
//#![allow(non_camel_case_types)]
|
||||
//#![allow(non_snake_case)]
|
||||
//#![allow(dead_code)]
|
||||
//include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
use crate::pg_constants;
|
||||
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
use std::str;
|
||||
|
||||
use log::*;
|
||||
|
||||
const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
@@ -14,7 +21,7 @@ const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogPageHeaderData {
|
||||
struct XLogPageHeaderData {
|
||||
xlp_magic: u16, /* magic value for correctness checks */
|
||||
xlp_info: u16, /* flag bits, see below */
|
||||
xlp_tli: u32, /* TimeLineID of first record on page */
|
||||
@@ -28,7 +35,7 @@ const SizeOfXLogShortPHD: usize = 2 + 2 + 4 + 8 + 4 + 4;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogLongPageHeaderData {
|
||||
struct XLogLongPageHeaderData {
|
||||
std: XLogPageHeaderData, /* standard header fields */
|
||||
xlp_sysid: u64, /* system identifier from pg_control */
|
||||
xlp_seg_size: u32, /* just as a cross-check */
|
||||
@@ -39,7 +46,6 @@ pub struct XLogLongPageHeaderData {
|
||||
#[allow(non_upper_case_globals)]
|
||||
const SizeOfXLogLongPHD: usize = (2 + 2 + 4 + 8 + 4) + 4 + 8 + 4 + 4;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: u64,
|
||||
|
||||
@@ -52,13 +58,6 @@ pub struct WalStreamDecoder {
|
||||
recordbuf: BytesMut,
|
||||
}
|
||||
|
||||
/// Error returned by `WalStreamDecoder::poll_decode` when the input fed to
/// the decoder is invalid.
///
/// Its `Display` output is "<msg> at <lsn>", as set by the `#[error]`
/// attribute below.
#[derive(Error, Debug, Clone)]
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
// Description of what was wrong with the input.
msg: String,
|
||||
// The decoder's LSN at the time the problem was detected.
lsn: u64,
|
||||
}
|
||||
|
||||
//
|
||||
// WalRecordStream is a Stream that returns a stream of WAL records
|
||||
// FIXME: This isn't a proper rust stream
|
||||
@@ -66,7 +65,7 @@ pub struct WalDecodeError {
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: u64) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
lsn: lsn,
|
||||
|
||||
startlsn: 0,
|
||||
contlen: 0,
|
||||
@@ -81,56 +80,40 @@ impl WalStreamDecoder {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
/// Attempt to decode another WAL record from the input that has been fed to the
|
||||
/// decoder so far.
|
||||
///
|
||||
/// Returns one of the following:
|
||||
/// Ok((u64, Bytes)): a tuple containing the LSN of next record, and the record itself
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(u64, Bytes)>, WalDecodeError> {
|
||||
// Returns a tuple:
|
||||
// (end LSN, record)
|
||||
pub fn poll_decode(&mut self) -> Option<(u64, Bytes)> {
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn % WAL_SEGMENT_SIZE == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < SizeOfXLogLongPHD {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
let hdr = self.decode_XLogLongPageHeaderData();
|
||||
if hdr.std.xlp_pageaddr != self.lsn {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog segment header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.decode_XLogLongPageHeaderData();
|
||||
self.lsn += SizeOfXLogLongPHD as u64;
|
||||
|
||||
// TODO: verify the fields in the header
|
||||
|
||||
continue;
|
||||
} else if self.lsn % (XLOG_BLCKSZ as u64) == 0 {
|
||||
// parse page header
|
||||
|
||||
if self.inputbuf.remaining() < SizeOfXLogShortPHD {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
let hdr = self.decode_XLogPageHeaderData();
|
||||
if hdr.xlp_pageaddr != self.lsn {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog page header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.decode_XLogPageHeaderData();
|
||||
self.lsn += SizeOfXLogShortPHD as u64;
|
||||
|
||||
// TODO: verify the fields in the header
|
||||
|
||||
continue;
|
||||
} else if self.padlen > 0 {
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
// skip padding
|
||||
@@ -141,17 +124,20 @@ impl WalStreamDecoder {
|
||||
// need to have at least the xl_tot_len field
|
||||
|
||||
if self.inputbuf.remaining() < 4 {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
// read xl_tot_len FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = self.inputbuf.get_u32_le();
|
||||
if xl_tot_len < SizeOfXLogRecord {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
error!(
|
||||
"invalid xl_tot_len {} at {:X}/{:X}",
|
||||
xl_tot_len,
|
||||
self.lsn >> 32,
|
||||
self.lsn & 0xffffffff
|
||||
);
|
||||
panic!();
|
||||
}
|
||||
self.lsn += 4;
|
||||
|
||||
@@ -169,7 +155,7 @@ impl WalStreamDecoder {
|
||||
let n = min(self.contlen, pageleft) as usize;
|
||||
|
||||
if self.inputbuf.remaining() < n {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
self.recordbuf.put(self.inputbuf.split_to(n));
|
||||
@@ -197,7 +183,7 @@ impl WalStreamDecoder {
|
||||
}
|
||||
|
||||
let result = (self.lsn, recordbuf);
|
||||
return Ok(Some(result));
|
||||
return Some(result);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -228,7 +214,7 @@ impl WalStreamDecoder {
|
||||
// FIXME: check that hdr.xlp_rem_len matches self.contlen
|
||||
//println!("next xlog page (xlp_rem_len: {})", hdr.xlp_rem_len);
|
||||
|
||||
hdr
|
||||
return hdr;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -240,7 +226,7 @@ impl WalStreamDecoder {
|
||||
xlp_xlog_blcksz: self.inputbuf.get_u32_le(),
|
||||
};
|
||||
|
||||
hdr
|
||||
return hdr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -250,6 +236,7 @@ const BLCKSZ: u16 = 8192;
|
||||
//
|
||||
// Constants from xlogrecord.h
|
||||
//
|
||||
const XLR_INFO_MASK: u8 = 0x0F;
|
||||
|
||||
const XLR_MAX_BLOCK_ID: u8 = 32;
|
||||
|
||||
@@ -270,7 +257,12 @@ const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
|
||||
#[allow(dead_code)]
|
||||
//
|
||||
// constants from clog.h
|
||||
//
|
||||
const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
|
||||
|
||||
pub struct DecodedBkpBlock {
|
||||
/* Is this block ref in use? */
|
||||
//in_use: bool,
|
||||
@@ -279,8 +271,7 @@ pub struct DecodedBkpBlock {
|
||||
pub rnode_spcnode: u32,
|
||||
pub rnode_dbnode: u32,
|
||||
pub rnode_relnode: u32,
|
||||
// Note that we have a few special forknum values for non-rel files.
|
||||
pub forknum: u8,
|
||||
pub forknum: u8, // Note that we have a few special forknum values for non-rel files. Handle them too
|
||||
pub blkno: u32,
|
||||
|
||||
/* copy of the fork_flags field from the XLogRecordBlockHeader */
|
||||
@@ -298,45 +289,44 @@ pub struct DecodedBkpBlock {
|
||||
|
||||
/* Buffer holding the rmgr-specific data associated with this block */
|
||||
has_data: bool,
|
||||
//char *data;
|
||||
data_len: u16,
|
||||
}
|
||||
|
||||
impl DecodedBkpBlock {
|
||||
pub fn new() -> DecodedBkpBlock {
|
||||
DecodedBkpBlock {
|
||||
rnode_spcnode: 0,
|
||||
rnode_dbnode: 0,
|
||||
rnode_relnode: 0,
|
||||
forknum: 0,
|
||||
blkno: 0,
|
||||
|
||||
flags: 0,
|
||||
has_image: false,
|
||||
apply_image: false,
|
||||
will_init: false,
|
||||
hole_offset: 0,
|
||||
hole_length: 0,
|
||||
bimg_len: 0,
|
||||
bimg_info: 0,
|
||||
|
||||
has_data: false,
|
||||
data_len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
const SizeOfXLogRecord: u32 = 24;
|
||||
|
||||
pub struct DecodedWALRecord {
|
||||
pub xl_info: u8,
|
||||
pub xl_rmid: u8,
|
||||
pub lsn: u64, // LSN at the *end* of the record
|
||||
pub record: Bytes, // raw XLogRecord
|
||||
|
||||
pub blocks: Vec<DecodedBkpBlock>,
|
||||
pub main_data_offset: usize,
|
||||
}
|
||||
|
||||
// From pg_control.h and rmgrlist.h
|
||||
const XLOG_SWITCH: u8 = 0x40;
|
||||
const RM_XLOG_ID: u8 = 0;
|
||||
|
||||
const RM_XACT_ID: u8 = 1;
|
||||
// const RM_CLOG_ID:u8 = 3;
|
||||
//const RM_MULTIXACT_ID:u8 = 6;
|
||||
|
||||
// from xact.h
|
||||
const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
// const XLOG_XACT_PREPARE: u8 = 0x10;
|
||||
// const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
|
||||
// const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
|
||||
// const XLOG_XACT_ASSIGNMENT: u8 = 0x50;
|
||||
// const XLOG_XACT_INVALIDATIONS: u8 = 0x60;
|
||||
/* free opcode 0x70 */
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
// const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
// Is this record an XLOG_SWITCH record? They need some special processing,
|
||||
// so we need to check for that before the rest of the parsing.
|
||||
//
|
||||
@@ -353,88 +343,34 @@ fn is_xlog_switch_record(rec: &Bytes) -> bool {
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
xl_info == pg_constants::XLOG_SWITCH && xl_rmid == pg_constants::RM_XLOG_ID
|
||||
}
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type BlockNumber = u32;
|
||||
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
|
||||
/// Identifies a relation by three OIDs: tablespace, database and relation.
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct RelFileNode {
|
||||
pub spcnode: Oid, /* tablespace */
|
||||
pub dbnode: Oid, /* database */
|
||||
pub relnode: Oid, /* relation */
|
||||
}
|
||||
|
||||
/// Decoded contents of a truncate WAL record, as produced by
/// `decode_truncate_record`.
///
/// `decode_truncate_record` reads the fields as little-endian values in the
/// order: `blkno`, then `rnode` (spcnode, dbnode, relnode), then `flags`.
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlSmgrTruncate {
|
||||
// presumably the block number the relation is truncated to — TODO confirm
pub blkno: BlockNumber,
|
||||
// The relation the record applies to.
pub rnode: RelFileNode,
|
||||
// presumably a bit mask of SMGR_TRUNCATE_* values (e.g. SMGR_TRUNCATE_HEAP) — TODO confirm
pub flags: u32,
|
||||
}
|
||||
|
||||
pub fn decode_truncate_record(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance((SizeOfXLogRecord + 2) as usize);
|
||||
XlSmgrTruncate {
|
||||
blkno: buf.get_u32_le(),
|
||||
rnode: RelFileNode {
|
||||
spcnode: buf.get_u32_le(), /* tablespace */
|
||||
dbnode: buf.get_u32_le(), /* database */
|
||||
relnode: buf.get_u32_le(), /* relation */
|
||||
},
|
||||
flags: buf.get_u32_le(),
|
||||
}
|
||||
return xl_info == XLOG_SWITCH && xl_rmid == RM_XLOG_ID;
|
||||
}
|
||||
|
||||
//
|
||||
// Routines to decode a WAL record and figure out which blocks are modified
|
||||
//
|
||||
// See xlogrecord.h for details
|
||||
// The overall layout of an XLOG record is:
|
||||
// Fixed-size header (XLogRecord struct)
|
||||
// XLogRecordBlockHeader struct
|
||||
// If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows
|
||||
// If BKPIMAGE_HAS_HOLE and BKPIMAGE_IS_COMPRESSED, an
|
||||
// XLogRecordBlockCompressHeader struct follows.
|
||||
// If BKPBLOCK_SAME_REL is not set, a RelFileNode follows
|
||||
// BlockNumber follows
|
||||
// XLogRecordBlockHeader struct
|
||||
// ...
|
||||
// XLogRecordDataHeader[Short|Long] struct
|
||||
// block data
|
||||
// block data
|
||||
// ...
|
||||
// main data
|
||||
pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
let mut rnode_spcnode: u32 = 0;
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
let mut got_rnode = false;
|
||||
pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord {
|
||||
trace!(
|
||||
"decoding record with LSN {:08X}/{:08X} ({} bytes)",
|
||||
lsn >> 32,
|
||||
lsn & 0xffff_ffff,
|
||||
rec.remaining()
|
||||
);
|
||||
|
||||
let mut buf = record.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
let mut buf = rec.clone();
|
||||
|
||||
// FIXME: assume little-endian here
|
||||
let xl_tot_len = buf.get_u32_le();
|
||||
let xl_xid = buf.get_u32_le();
|
||||
let xl_prev = buf.get_u64_le();
|
||||
let _xl_prev = buf.get_u64_le();
|
||||
let xl_info = buf.get_u8();
|
||||
let xl_rmid = buf.get_u8();
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
trace!(
|
||||
"decode_wal_record xl_rmid = {} xl_info = {}",
|
||||
xl_rmid,
|
||||
xl_info
|
||||
);
|
||||
info!("decode_wal_record xl_rmid = {}", xl_rmid);
|
||||
|
||||
let rminfo: u8 = xl_info & !XLR_INFO_MASK;
|
||||
|
||||
let remaining = xl_tot_len - SizeOfXLogRecord;
|
||||
|
||||
@@ -442,28 +378,78 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
//TODO error
|
||||
}
|
||||
|
||||
let mut rnode_spcnode: u32 = 0;
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
let mut got_rnode = false;
|
||||
|
||||
if xl_rmid == RM_XACT_ID
|
||||
&& ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT
|
||||
|| (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED)
|
||||
{
|
||||
info!("decode_wal_record RM_XACT_ID - XLOG_XACT_COMMIT");
|
||||
|
||||
let mut blocks: Vec<DecodedBkpBlock> = Vec::new();
|
||||
|
||||
let blkno = xl_xid / CLOG_XACTS_PER_PAGE;
|
||||
|
||||
let mut blk = DecodedBkpBlock {
|
||||
rnode_spcnode: 0,
|
||||
rnode_dbnode: 0,
|
||||
rnode_relnode: 0,
|
||||
forknum: pg_constants::PG_XACT_FORKNUM as u8,
|
||||
blkno: blkno,
|
||||
|
||||
flags: 0,
|
||||
has_image: false,
|
||||
apply_image: false,
|
||||
will_init: false,
|
||||
hole_offset: 0,
|
||||
hole_length: 0,
|
||||
bimg_len: 0,
|
||||
bimg_info: 0,
|
||||
|
||||
has_data: true,
|
||||
data_len: 0,
|
||||
};
|
||||
|
||||
let fork_flags = buf.get_u8();
|
||||
blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0;
|
||||
blk.data_len = buf.get_u16_le();
|
||||
|
||||
info!(
|
||||
"decode_wal_record RM_XACT_ID blk has data with data_len {}",
|
||||
blk.data_len
|
||||
);
|
||||
|
||||
blocks.push(blk);
|
||||
return DecodedWALRecord {
|
||||
lsn: lsn,
|
||||
record: rec,
|
||||
blocks: blocks,
|
||||
};
|
||||
}
|
||||
|
||||
// Decode the headers
|
||||
|
||||
let mut max_block_id = 0;
|
||||
let mut blocks_total_len: u32 = 0;
|
||||
let mut main_data_len = 0;
|
||||
let mut datatotal: u32 = 0;
|
||||
let mut blocks: Vec<DecodedBkpBlock> = Vec::new();
|
||||
|
||||
// 2. Decode the headers.
|
||||
// XLogRecordBlockHeaders if any,
|
||||
// XLogRecordDataHeader[Short|Long]
|
||||
while buf.remaining() > datatotal as usize {
|
||||
let block_id = buf.get_u8();
|
||||
|
||||
match block_id {
|
||||
XLR_BLOCK_ID_DATA_SHORT => {
|
||||
/* XLogRecordDataHeaderShort */
|
||||
main_data_len = buf.get_u8() as u32;
|
||||
let main_data_len = buf.get_u8() as u32;
|
||||
|
||||
datatotal += main_data_len;
|
||||
}
|
||||
|
||||
XLR_BLOCK_ID_DATA_LONG => {
|
||||
/* XLogRecordDataHeaderLong */
|
||||
main_data_len = buf.get_u32_le();
|
||||
/* XLogRecordDataHeaderLong */
|
||||
let main_data_len = buf.get_u32();
|
||||
|
||||
datatotal += main_data_len;
|
||||
}
|
||||
|
||||
@@ -479,7 +465,25 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
|
||||
0..=XLR_MAX_BLOCK_ID => {
|
||||
/* XLogRecordBlockHeader */
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
let mut blk = DecodedBkpBlock {
|
||||
rnode_spcnode: 0,
|
||||
rnode_dbnode: 0,
|
||||
rnode_relnode: 0,
|
||||
forknum: 0,
|
||||
blkno: 0,
|
||||
|
||||
flags: 0,
|
||||
has_image: false,
|
||||
apply_image: false,
|
||||
will_init: false,
|
||||
hole_offset: 0,
|
||||
hole_length: 0,
|
||||
bimg_len: 0,
|
||||
bimg_info: 0,
|
||||
|
||||
has_data: false,
|
||||
data_len: 0,
|
||||
};
|
||||
let fork_flags: u8;
|
||||
|
||||
if block_id <= max_block_id {
|
||||
@@ -499,12 +503,28 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
blk.has_image = (fork_flags & BKPBLOCK_HAS_IMAGE) != 0;
|
||||
blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0;
|
||||
blk.will_init = (fork_flags & BKPBLOCK_WILL_INIT) != 0;
|
||||
|
||||
blk.data_len = buf.get_u16_le();
|
||||
|
||||
/* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */
|
||||
|
||||
/* cross-check that the HAS_DATA flag is set iff data_length > 0 */
|
||||
// TODO
|
||||
/*
|
||||
if (blk->has_data && blk->data_len == 0)
|
||||
{
|
||||
report_invalid_record(state,
|
||||
"BKPBLOCK_HAS_DATA set, but no data included at %X/%X",
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err;
|
||||
}
|
||||
if (!blk->has_data && blk->data_len != 0)
|
||||
{
|
||||
report_invalid_record(state,
|
||||
"BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X",
|
||||
(unsigned int) blk->data_len,
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err;
|
||||
}
|
||||
*/
|
||||
datatotal += blk.data_len as u32;
|
||||
blocks_total_len += blk.data_len as u32;
|
||||
|
||||
if blk.has_image {
|
||||
blk.bimg_len = buf.get_u16_le();
|
||||
@@ -523,7 +543,6 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
blk.hole_length = BLCKSZ - blk.bimg_len;
|
||||
}
|
||||
datatotal += blk.bimg_len as u32;
|
||||
blocks_total_len += blk.bimg_len as u32;
|
||||
|
||||
/*
|
||||
* cross-check that hole_offset > 0, hole_length > 0 and
|
||||
@@ -599,14 +618,20 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
rnode_spcnode = buf.get_u32_le();
|
||||
rnode_dbnode = buf.get_u32_le();
|
||||
rnode_relnode = buf.get_u32_le();
|
||||
//rnode = &blk->rnode;
|
||||
got_rnode = true;
|
||||
} else if !got_rnode {
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err; */
|
||||
} else {
|
||||
if !got_rnode {
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err;
|
||||
*/
|
||||
}
|
||||
|
||||
//blk->rnode = *rnode;
|
||||
}
|
||||
|
||||
blk.rnode_spcnode = rnode_spcnode;
|
||||
@@ -614,13 +639,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
blk.rnode_relnode = rnode_relnode;
|
||||
|
||||
blk.blkno = buf.get_u32_le();
|
||||
trace!(
|
||||
"this record affects {}/{}/{} blk {}",
|
||||
rnode_spcnode,
|
||||
rnode_dbnode,
|
||||
rnode_relnode,
|
||||
blk.blkno
|
||||
);
|
||||
|
||||
//println!("this record affects {}/{}/{} blk {}",rnode_spcnode, rnode_dbnode, rnode_relnode, blk.blkno);
|
||||
|
||||
blocks.push(blk);
|
||||
}
|
||||
@@ -631,94 +651,21 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Decode blocks.
|
||||
// We don't need them, so just skip blocks_total_len bytes
|
||||
buf.advance(blocks_total_len as usize);
|
||||
/*
|
||||
* Ok, we've parsed the fragment headers, and verified that the total
|
||||
* length of the payload in the fragments is equal to the amount of data
|
||||
* left. Copy the data of each fragment to a separate buffer.
|
||||
*
|
||||
* We could just set up pointers into readRecordBuf, but we want to align
|
||||
* the data for the convenience of the callers. Backup images are not
|
||||
* copied, however; they don't need alignment.
|
||||
*/
|
||||
|
||||
let main_data_offset = (xl_tot_len - main_data_len) as usize;
|
||||
// Since we don't care about the data payloads here, we're done.
|
||||
|
||||
// 4. Decode main_data
|
||||
if main_data_len > 0 {
|
||||
assert_eq!(buf.remaining(), main_data_len as usize);
|
||||
}
|
||||
|
||||
//5. Handle special CLOG and XACT records
|
||||
if xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = buf.get_i32_le() as u32;
|
||||
trace!("RM_CLOG_ID updates block {}", blk.blkno);
|
||||
blocks.push(blk);
|
||||
} else if xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT xl_prev {:X}/{:X} xid {} updates block {}",
|
||||
(xl_prev >> 32),
|
||||
xl_prev & 0xffffffff,
|
||||
xl_xid,
|
||||
blk.blkno
|
||||
);
|
||||
blocks.push(blk);
|
||||
//TODO parse commit record to extract subtrans entries
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_ABORT xl_prev {:X}/{:X} xid {} updates block {}",
|
||||
(xl_prev >> 32),
|
||||
xl_prev & 0xffffffff,
|
||||
xl_xid,
|
||||
blk.blkno
|
||||
);
|
||||
blocks.push(blk);
|
||||
//TODO parse abort record to extract subtrans entries
|
||||
}
|
||||
}
|
||||
else if xl_rmid == pg_constants::RM_DBASE_ID
|
||||
{
|
||||
let info = xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_DBASE_CREATE
|
||||
{
|
||||
//buf points to main_data
|
||||
let db_id = buf.get_u32_le();
|
||||
let tablespace_id = buf.get_u32_le();
|
||||
let src_db_id = buf.get_u32_le();
|
||||
let src_tablespace_id = buf.get_u32_le();
|
||||
trace!("XLOG_DBASE_CREATE db_id {} src_db_id {}", db_id, src_db_id);
|
||||
// in postgres it is implemented as copydir
|
||||
// we need to copy all pages in page_cache
|
||||
}
|
||||
else
|
||||
{
|
||||
trace!("XLOG_DBASE_DROP is not handled yet");
|
||||
}
|
||||
}
|
||||
else if xl_rmid == pg_constants::RM_TBLSPC_ID
|
||||
{
|
||||
let info = xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_TBLSPC_CREATE
|
||||
{
|
||||
//buf points to main_data
|
||||
let ts_id = buf.get_u32_le();
|
||||
let ts_path = str::from_utf8(&buf).unwrap();
|
||||
trace!("XLOG_TBLSPC_CREATE ts_id {} ts_path {}", ts_id, ts_path);
|
||||
}
|
||||
else
|
||||
{
|
||||
trace!("XLOG_TBLSPC_DROP is not handled yet");
|
||||
}
|
||||
}
|
||||
|
||||
DecodedWALRecord {
|
||||
xl_info,
|
||||
xl_rmid,
|
||||
record,
|
||||
blocks,
|
||||
main_data_offset,
|
||||
}
|
||||
return DecodedWALRecord {
|
||||
lsn: lsn,
|
||||
record: rec,
|
||||
blocks: blocks,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1,98 +1,29 @@
|
||||
//!
|
||||
//! WAL receiver
|
||||
//!
|
||||
//! The WAL receiver connects to the WAL safekeeper service, and streams WAL.
|
||||
//! For each WAL record, it decodes the record to figure out which data blocks
|
||||
//! the record affects, and adds the records to the page cache.
|
||||
//!
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::{BufferTag, RelTag};
|
||||
use crate::pg_constants;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Error;
|
||||
use lazy_static::lazy_static;
|
||||
//
|
||||
// WAL receiver
|
||||
//
|
||||
// The WAL receiver connects to the WAL safekeeper service, and streams WAL.
|
||||
// For each WAL record, it decodes the record to figure out which data blocks
|
||||
// the record affects, and adds the records to the page cache.
|
||||
//
|
||||
use log::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
|
||||
use tokio::runtime;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
|
||||
use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use tokio_stream::StreamExt;
|
||||
|
||||
/// Bookkeeping for the WAL receiver of a single timeline.
///
/// We keep one WAL receiver active per timeline; its entry lives in `WAL_RECEIVERS`.
struct WalReceiverEntry {
    /// Connection string of the WAL producer this timeline streams from.
    wal_producer_connstr: String,
}
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::waldecoder::WalStreamDecoder;
|
||||
use crate::PageServerConf;
|
||||
|
||||
lazy_static! {
    /// Registry of WAL receiver entries, keyed by timeline id and guarded by a mutex.
    static ref WAL_RECEIVERS: Mutex<HashMap<ZTimelineId, WalReceiverEntry>> =
        Mutex::new(HashMap::new());
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
|
||||
match receivers.get_mut(&timelineid) {
|
||||
Some(receiver) => {
|
||||
receiver.wal_producer_connstr = wal_producer_connstr.into();
|
||||
}
|
||||
None => {
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
let conf_copy = conf.clone();
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
thread_main(&conf_copy, timelineid);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Look up current WAL producer connection string in the hash table
|
||||
fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
|
||||
let receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
|
||||
receivers
|
||||
.get(&timelineid)
|
||||
.unwrap()
|
||||
.wal_producer_connstr
|
||||
.clone()
|
||||
}
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use tokio_postgres::{connect_replication, Error, NoTls, ReplicationMode};
|
||||
|
||||
//
|
||||
// This is the entry point for the WAL receiver thread.
|
||||
//
|
||||
fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
info!(
|
||||
"WAL receiver thread started for timeline : '{}'",
|
||||
timelineid
|
||||
);
|
||||
pub fn thread_main(conf: PageServerConf, wal_producer_connstr: &String) {
|
||||
info!("WAL receiver thread started: '{}'", wal_producer_connstr);
|
||||
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
@@ -101,32 +32,31 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
|
||||
runtime.block_on(async {
|
||||
loop {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
let _res = walreceiver_main(conf.clone(), wal_producer_connstr).await;
|
||||
|
||||
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await;
|
||||
|
||||
if let Err(e) = res {
|
||||
info!(
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
// TODO: print/log the error
|
||||
info!(
|
||||
"WAL streaming connection failed, retrying in 1 second...: {:?}",
|
||||
_res
|
||||
);
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
async fn walreceiver_main(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
conf: PageServerConf,
|
||||
wal_producer_connstr: &String,
|
||||
) -> Result<(), Error> {
|
||||
// Connect to the database in replication mode.
|
||||
info!("connecting to {:?}", wal_producer_connstr);
|
||||
let connect_cfg = format!("{} replication=true", wal_producer_connstr);
|
||||
let (rclient, connection) = tokio_postgres::connect(&connect_cfg, NoTls).await?;
|
||||
info!("connected!");
|
||||
debug!("connecting to {}...", wal_producer_connstr);
|
||||
let (mut rclient, connection) = connect_replication(
|
||||
wal_producer_connstr.as_str(),
|
||||
NoTls,
|
||||
ReplicationMode::Physical,
|
||||
)
|
||||
.await?;
|
||||
debug!("connected!");
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
@@ -136,29 +66,28 @@ async fn walreceiver_main(
|
||||
}
|
||||
});
|
||||
|
||||
let identify = identify_system(&rclient).await?;
|
||||
info!("{:?}", identify);
|
||||
let end_of_wal = u64::from(identify.xlogpos);
|
||||
let identify_system = rclient.identify_system().await?;
|
||||
let end_of_wal = u64::from(identify_system.xlogpos());
|
||||
let mut caught_up = false;
|
||||
|
||||
let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
|
||||
let sysid: u64 = identify_system.systemid().parse().unwrap();
|
||||
let pcache = page_cache::get_pagecache(conf, sysid);
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
//
|
||||
let mut startpoint = pcache.get_last_valid_lsn();
|
||||
let last_valid_lsn = pcache.get_last_valid_lsn();
|
||||
if startpoint == 0 {
|
||||
// If we start here with identify.xlogpos we will have race condition with
|
||||
// If we start here with identify_system.xlogpos() we will have race condition with
|
||||
// postgres start: insert into postgres may request page that was modified with lsn
|
||||
// smaller than identify.xlogpos.
|
||||
// smaller than identify_system.xlogpos().
|
||||
//
|
||||
// Current procedure for starting postgres will anyway be changed to something
|
||||
// different like having 'initdb' method on a pageserver (or importing some shared
|
||||
// empty database snapshot), so for now I just put start of first segment which
|
||||
// seems to be a valid record.
|
||||
pcache.init_valid_lsn(0x_1_000_000_u64);
|
||||
startpoint = 0x_1_000_000_u64;
|
||||
startpoint = u64::from(0x_1_000_000_u64);
|
||||
} else {
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
@@ -170,23 +99,16 @@ async fn walreceiver_main(
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"last_valid_lsn {:X}/{:X} starting replication from {:X}/{:X} for timeline {}, server is at {:X}/{:X}...",
|
||||
(last_valid_lsn >> 32),
|
||||
(last_valid_lsn & 0xffffffff),
|
||||
"starting replication from {:X}/{:X}, server is at {:X}/{:X}...",
|
||||
(startpoint >> 32),
|
||||
(startpoint & 0xffffffff),
|
||||
timelineid,
|
||||
(end_of_wal >> 32),
|
||||
(end_of_wal & 0xffffffff)
|
||||
);
|
||||
|
||||
let startpoint = PgLsn::from(startpoint);
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
let copy_stream = rclient.copy_both_simple::<bytes::Bytes>(&query).await?;
|
||||
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
tokio::pin!(physical_stream);
|
||||
|
||||
let startpoint = tokio_postgres::types::Lsn::from(startpoint);
|
||||
let mut physical_stream = rclient
|
||||
.start_physical_replication(None, startpoint, None)
|
||||
.await?;
|
||||
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
|
||||
|
||||
while let Some(replication_message) = physical_stream.next().await {
|
||||
@@ -198,13 +120,6 @@ async fn walreceiver_main(
|
||||
let startlsn = xlog_data.wal_start();
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
|
||||
write_wal_file(
|
||||
startlsn,
|
||||
timelineid,
|
||||
16 * 1024 * 1024, // FIXME
|
||||
data,
|
||||
)?;
|
||||
|
||||
trace!(
|
||||
"received XLogData between {:X}/{:X} and {:X}/{:X}",
|
||||
(startlsn >> 32),
|
||||
@@ -216,62 +131,35 @@ async fn walreceiver_main(
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
loop {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode() {
|
||||
let decoded =
|
||||
crate::waldecoder::decode_wal_record(startlsn, recdata.clone());
|
||||
|
||||
// Put the WAL record to the page cache. We make a separate copy of
|
||||
// it for every block it modifies. (The actual WAL record is kept in
|
||||
// a Bytes, which uses a reference counter for the underlying buffer,
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
lsn: lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
truncate: false,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
// include truncate wal record in all pages
|
||||
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_SMGR_TRUNCATE
|
||||
{
|
||||
let truncate = decode_truncate_record(&decoded);
|
||||
if (truncate.flags & SMGR_TRUNCATE_HEAP) != 0 {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: truncate.rnode.spcnode,
|
||||
dbnode: truncate.rnode.dbnode,
|
||||
relnode: truncate.rnode.relnode,
|
||||
forknum: MAIN_FORKNUM,
|
||||
},
|
||||
blknum: truncate.blkno,
|
||||
};
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn: lsn,
|
||||
will_init: false,
|
||||
truncate: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
pcache.put_rel_wal_record(tag, rec).await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_record_lsn(lsn);
|
||||
pcache.advance_last_valid_lsn(lsn);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@@ -295,174 +183,12 @@ async fn walreceiver_main(
|
||||
}
|
||||
}
|
||||
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
let wal_end = keepalive.wal_end();
|
||||
let timestamp = keepalive.timestamp();
|
||||
let reply_requested: bool = keepalive.reply() != 0;
|
||||
|
||||
trace!(
|
||||
"received PrimaryKeepAlive(wal_end: {}, timestamp: {} reply: {})",
|
||||
wal_end,
|
||||
timestamp,
|
||||
reply_requested,
|
||||
);
|
||||
if reply_requested {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(pcache.get_last_valid_lsn());
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::INVALID;
|
||||
let ts = PgTimestamp::now()?;
|
||||
const NO_REPLY: u8 = 0u8;
|
||||
|
||||
physical_stream
|
||||
.as_mut()
|
||||
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)
|
||||
.await?;
|
||||
}
|
||||
ReplicationMessage::PrimaryKeepAlive(_keepalive) => {
|
||||
trace!("received PrimaryKeepAlive");
|
||||
// FIXME: Reply, or the connection will time out
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
///
|
||||
/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html
|
||||
#[derive(Debug)]
|
||||
pub struct IdentifySystem {
|
||||
systemid: u64,
|
||||
timeline: u32,
|
||||
xlogpos: PgLsn,
|
||||
dbname: Option<String>,
|
||||
}
|
||||
|
||||
/// There was a problem parsing the response to
|
||||
/// a postgres IDENTIFY_SYSTEM command.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("IDENTIFY_SYSTEM parse error")]
|
||||
pub struct IdentifyError;
|
||||
|
||||
/// Run the postgres `IDENTIFY_SYSTEM` command
|
||||
pub async fn identify_system(client: &tokio_postgres::Client) -> Result<IdentifySystem, Error> {
|
||||
let query_str = "IDENTIFY_SYSTEM";
|
||||
let response = client.simple_query(query_str).await?;
|
||||
|
||||
// get(N) from row, then parse it as some destination type.
|
||||
fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
|
||||
where
|
||||
T: FromStr,
|
||||
{
|
||||
let val = row.get(idx).ok_or(IdentifyError)?;
|
||||
val.parse::<T>().or(Err(IdentifyError))
|
||||
}
|
||||
|
||||
// extract the row contents into an IdentifySystem struct.
|
||||
// written as a closure so I can use ? for Option here.
|
||||
if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
|
||||
Ok(IdentifySystem {
|
||||
systemid: get_parse(first_row, 0)?,
|
||||
timeline: get_parse(first_row, 1)?,
|
||||
xlogpos: get_parse(first_row, 2)?,
|
||||
dbname: get_parse(first_row, 3).ok(),
|
||||
})
|
||||
} else {
|
||||
Err(IdentifyError)?
|
||||
}
|
||||
}
|
||||
|
||||
fn write_wal_file(
|
||||
startpos: XLogRecPtr,
|
||||
timeline: ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
buf: &[u8],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
|
||||
/*
|
||||
* If crossing a WAL boundary, only write up until we reach wal
|
||||
* segment size.
|
||||
*/
|
||||
if xlogoff + bytes_left > wal_seg_size {
|
||||
bytes_to_write = wal_seg_size - xlogoff;
|
||||
} else {
|
||||
bytes_to_write = bytes_left;
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = XLByteToSeg(start_pos, wal_seg_size);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
wal_seg_size,
|
||||
);
|
||||
let wal_file_path = wal_dir.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = wal_dir.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
let mut wal_file: File;
|
||||
/* Try to open already completed segment */
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
|
||||
wal_file = file;
|
||||
partial = false;
|
||||
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
|
||||
/* Try to open existed partial file */
|
||||
wal_file = file;
|
||||
partial = true;
|
||||
} else {
|
||||
/* Create and fill new partial file */
|
||||
partial = true;
|
||||
match OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(&ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
|
||||
|
||||
// FIXME: Flush the file
|
||||
//wal_file.sync_all()?;
|
||||
}
|
||||
/* Write was successful, advance our position */
|
||||
bytes_written += bytes_to_write;
|
||||
bytes_left -= bytes_to_write;
|
||||
start_pos += bytes_to_write as u64;
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if XLogSegmentOffset(start_pos, wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -18,35 +18,31 @@ use log::*;
|
||||
use std::assert;
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::Error;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::{path::PathBuf, process::Stdio};
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{Child, ChildStdin, ChildStdout, Command};
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::time::timeout;
|
||||
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::CacheEntry;
|
||||
use crate::page_cache::WALRecord;
|
||||
use crate::ZTimelineId;
|
||||
use crate::{page_cache::BufferTag, pg_constants, PageServerConf};
|
||||
use crate::{page_cache::BufferTag, PageServerConf};
|
||||
|
||||
static TIMEOUT: Duration = Duration::from_secs(20);
|
||||
|
||||
//
|
||||
// Main entry point for the WAL applicator thread.
|
||||
//
|
||||
pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
info!("WAL redo thread started {}", timelineid);
|
||||
pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) {
|
||||
info!("WAL redo thread started {}", sys_id);
|
||||
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
@@ -56,41 +52,34 @@ pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, timelineid).unwrap();
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
|
||||
|
||||
// Loop forever, handling requests as they come.
|
||||
let walredo_channel_receiver = &pcache.walredo_receiver;
|
||||
loop {
|
||||
let mut process: WalRedoProcess;
|
||||
let datadir = format!("wal-redo/{}", timelineid);
|
||||
let datadir = conf.data_dir.join(format!("wal-redo/{}", sys_id));
|
||||
|
||||
info!("launching WAL redo postgres process {}", timelineid);
|
||||
info!("launching WAL redo postgres process {}", sys_id);
|
||||
{
|
||||
let _guard = runtime.enter();
|
||||
process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
|
||||
}
|
||||
info!("WAL redo postgres started");
|
||||
|
||||
// Pretty arbitrarily, reuse the same Postgres process for 100 requests.
|
||||
// After that, kill it and start a new one. This is mostly to avoid
|
||||
// using up all shared buffers in Postgres's shared buffer cache; we don't
|
||||
// want to write any pages to disk in the WAL redo process.
|
||||
for _i in 1..100000 {
|
||||
for _i in 1..100 {
|
||||
let request = walredo_channel_receiver.recv().unwrap();
|
||||
|
||||
let result = handle_apply_request(&pcache, &process, &runtime, request);
|
||||
if result.is_err() {
|
||||
// Something went wrong with handling the request. It's not clear
|
||||
// if the request was faulty, and the next request would succeed
|
||||
// again, or if the 'postgres' process went haywire. To be safe,
|
||||
// kill the 'postgres' process so that we will start from a clean
|
||||
// slate, with a new process, for the next request.
|
||||
// On error, kill the process.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Time to kill the 'postgres' process. A new one will be launched on next
|
||||
// iteration of the loop.
|
||||
info!("killing WAL redo postgres process");
|
||||
let _ = runtime.block_on(process.stdin.get_mut().shutdown());
|
||||
let mut child = process.child;
|
||||
@@ -99,59 +88,6 @@ pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
}
|
||||
}
|
||||
|
||||
fn transaction_id_set_status_bit(
|
||||
xl_info: u8,
|
||||
xl_rmid: u8,
|
||||
xl_xid: u32,
|
||||
record: WALRecord,
|
||||
page: &mut BytesMut,
|
||||
) {
|
||||
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let mut status = 0;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
} else {
|
||||
trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
return;
|
||||
}
|
||||
|
||||
trace!("handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort) lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
|
||||
let byteno: usize = ((xl_rmid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let byteptr = &mut page[byteno..byteno + 1];
|
||||
let bshift: u8 = ((xl_xid % pg_constants::CLOG_XACTS_PER_BYTE)
|
||||
* pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
let mut curval = byteptr[0];
|
||||
curval = (curval >> bshift) & pg_constants::CLOG_XACT_BITMASK;
|
||||
|
||||
let mut byteval = [0];
|
||||
byteval[0] = curval;
|
||||
byteval[0] &= !(((1 << pg_constants::CLOG_BITS_PER_XACT as u8) - 1) << bshift);
|
||||
byteval[0] |= status << bshift;
|
||||
|
||||
byteptr.copy_from_slice(&byteval);
|
||||
trace!(
|
||||
"xl_xid {} byteno {} curval {} byteval {}",
|
||||
xl_xid,
|
||||
byteno,
|
||||
curval,
|
||||
byteval[0]
|
||||
);
|
||||
}
|
||||
|
||||
fn handle_apply_request(
|
||||
pcache: &page_cache::PageCache,
|
||||
process: &WalRedoProcess,
|
||||
@@ -163,57 +99,17 @@ fn handle_apply_request(
|
||||
let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
|
||||
|
||||
let mut entry = entry_rc.content.lock().unwrap();
|
||||
assert!(entry.apply_pending);
|
||||
entry.apply_pending = false;
|
||||
|
||||
let nrecords = records.len();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if tag.rel.forknum == pg_constants::PG_XACT_FORKNUM as u8 {
|
||||
//TODO use base image if any
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let zero_page_bytes: &[u8] = &ZERO_PAGE;
|
||||
let mut page = BytesMut::from(zero_page_bytes);
|
||||
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let _xl_tot_len = buf.get_u32_le();
|
||||
let xl_xid = buf.get_u32_le();
|
||||
let _xl_prev = buf.get_u64_le();
|
||||
let xl_info = buf.get_u8();
|
||||
let xl_rmid = buf.get_u8();
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
if xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
page.clone_from_slice(zero_page_bytes);
|
||||
trace!("handle_apply_request for RM_CLOG_ID-CLOG_ZEROPAGE lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
}
|
||||
} else if xl_rmid == pg_constants::RM_XACT_ID {
|
||||
transaction_id_set_status_bit(xl_info, xl_rmid, xl_xid, record, &mut page);
|
||||
}
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
} else {
|
||||
apply_result = process.apply_wal_records(runtime, tag, base_img, records);
|
||||
}
|
||||
|
||||
let apply_result = process.apply_wal_records(runtime, tag, base_img, records);
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result;
|
||||
|
||||
trace!(
|
||||
debug!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
@@ -226,13 +122,16 @@ fn handle_apply_request(
|
||||
result = Err(e);
|
||||
} else {
|
||||
entry.page_image = Some(apply_result.unwrap());
|
||||
pcache
|
||||
.num_page_images
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
result = Ok(());
|
||||
}
|
||||
|
||||
// Wake up the requester, whether the operation succeeded or not.
|
||||
entry_rc.walredo_condvar.notify_all();
|
||||
|
||||
result
|
||||
return result;
|
||||
}
|
||||
|
||||
struct WalRedoProcess {
|
||||
@@ -248,13 +147,13 @@ impl WalRedoProcess {
|
||||
// Tests who run pageserver binary are setting proper PG_BIN_DIR
|
||||
// and PG_LIB_DIR so that WalRedo would start right postgres. We may later
|
||||
// switch to setting same things in pageserver config file.
|
||||
fn launch(datadir: &str, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
|
||||
fn launch(datadir: &PathBuf, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres deleting old one.
|
||||
fs::remove_dir_all(datadir).ok();
|
||||
fs::remove_dir_all(datadir.to_str().unwrap()).ok();
|
||||
let initdb = runtime
|
||||
.block_on(
|
||||
Command::new("initdb")
|
||||
.args(&["-D", datadir])
|
||||
.args(&["-D", datadir.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.output(),
|
||||
)
|
||||
@@ -266,25 +165,22 @@ impl WalRedoProcess {
|
||||
std::str::from_utf8(&initdb.stdout).unwrap(),
|
||||
std::str::from_utf8(&initdb.stderr).unwrap()
|
||||
);
|
||||
} else {
|
||||
// Limit shared cache for wal-redo-postgres
|
||||
let mut config = OpenOptions::new()
|
||||
.append(true)
|
||||
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
|
||||
config.write(b"shared_buffers=128kB\n")?;
|
||||
config.write(b"fsync=off\n")?;
|
||||
}
|
||||
|
||||
// Start postgres itself
|
||||
let mut child = Command::new("postgres")
|
||||
.arg("--wal-redo")
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env("PGDATA", datadir)
|
||||
.env("PGDATA", datadir.to_str().unwrap())
|
||||
.spawn()
|
||||
.expect("postgres --wal-redo command failed to start");
|
||||
|
||||
info!("launched WAL redo postgres process on {}", datadir);
|
||||
info!(
|
||||
"launched WAL redo postgres process on {}",
|
||||
datadir.to_str().unwrap()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().expect("failed to open child's stdin");
|
||||
let stderr = child.stderr.take().expect("failed to open child's stderr");
|
||||
@@ -304,7 +200,7 @@ impl WalRedoProcess {
|
||||
if res.unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
error!("wal-redo-postgres: {}", line.trim());
|
||||
debug!("wal-redo-postgres: {}", line.trim());
|
||||
line.clear();
|
||||
}
|
||||
Ok::<(), Error>(())
|
||||
@@ -312,7 +208,7 @@ impl WalRedoProcess {
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(WalRedoProcess {
|
||||
child,
|
||||
child: child,
|
||||
stdin: RefCell::new(stdin),
|
||||
stdout: RefCell::new(stdout),
|
||||
})
|
||||
@@ -331,7 +227,7 @@ impl WalRedoProcess {
|
||||
) -> Result<Bytes, Error> {
|
||||
let mut stdin = self.stdin.borrow_mut();
|
||||
let mut stdout = self.stdout.borrow_mut();
|
||||
runtime.block_on(async {
|
||||
return runtime.block_on(async {
|
||||
//
|
||||
// This async block sends all the commands to the process.
|
||||
//
|
||||
@@ -394,7 +290,7 @@ impl WalRedoProcess {
|
||||
let buf = res.0;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -402,13 +298,17 @@ fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u8('B' as u8);
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
return buf.freeze();
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
@@ -417,39 +317,47 @@ fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
let len = 4 + 5 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u8('P' as u8);
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
buf.put(base_img);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
return buf.freeze();
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u8('A' as u8);
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u64(endlsn);
|
||||
buf.put(rec);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
return buf.freeze();
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u8('G' as u8);
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
return buf.freeze();
|
||||
}
|
||||
|
||||
@@ -10,10 +10,6 @@
|
||||
#
|
||||
# 2) installs postgres to REPO_ROOT/tmp_install/
|
||||
#
|
||||
|
||||
# Halt immediately if any command fails
|
||||
set -e
|
||||
|
||||
REPO_ROOT=$(dirname "$0")
|
||||
REPO_ROOT="`( cd \"$REPO_ROOT\" && pwd )`"
|
||||
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
[package]
|
||||
name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
authors = ["Heikki Linnakangas <heikki@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
log = "0.4.14"
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.57"
|
||||
@@ -1,3 +0,0 @@
|
||||
This module contains utility functions for interacting with PostgreSQL
|
||||
file formats.
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
extern crate bindgen;
|
||||
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn main() {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=pg_control_ffi.h");
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
// The input header we would like to generate
|
||||
// bindings for.
|
||||
.header("pg_control_ffi.h")
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
|
||||
.whitelist_type("ControlFileData")
|
||||
.whitelist_var("PG_CONTROL_FILE_SIZE")
|
||||
.whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
|
||||
.whitelist_type("DBState")
|
||||
// Path the server include dir. It is in tmp_install/include/server, if you did
|
||||
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
|
||||
// and used DESTDIR to move it into tmp_install, then it's in
|
||||
// tmp_install/include/postgres/server (that's how the pgbuild.sh script does it).
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
.clang_arg("-I../tmp_install/include/server")
|
||||
.clang_arg("-I../tmp_install/include/postgresql/server")
|
||||
// Finish the builder and generate the bindings.
|
||||
.generate()
|
||||
// Unwrap the Result and panic on failure.
|
||||
.expect("Unable to generate bindings");
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
|
||||
bindings
|
||||
.write_to_file(out_path.join("bindings.rs"))
|
||||
.expect("Couldn't write bindings!");
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
#include "c.h"
|
||||
#include "catalog/pg_control.h"
|
||||
|
||||
const uint32 PG_CONTROLFILEDATA_OFFSETOF_CRC = offsetof(ControlFileData, crc);
|
||||
@@ -1,69 +0,0 @@
|
||||
#![allow(non_upper_case_globals)]
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
pub mod xlog_utils;
|
||||
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
|
||||
// sizeof(ControlFileData)
|
||||
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
|
||||
const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize;
|
||||
|
||||
impl ControlFileData {
|
||||
// Initialize an all-zeros ControlFileData struct
|
||||
pub fn new() -> ControlFileData {
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
let b = [0u8; SIZEOF_CONTROLDATA];
|
||||
controlfile =
|
||||
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
controlfile
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
|
||||
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
|
||||
buf.copy_to_slice(&mut b);
|
||||
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
// TODO: verify CRC
|
||||
let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
|
||||
data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
let expectedcrc = crc32c::crc32c(&data_without_crc);
|
||||
|
||||
controlfile = unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
if expectedcrc != controlfile.crc {
|
||||
anyhow::bail!(
|
||||
"invalid CRC in control file: expected {:08X}, was {:08X}",
|
||||
expectedcrc,
|
||||
controlfile.crc
|
||||
);
|
||||
}
|
||||
|
||||
Ok(controlfile)
|
||||
}
|
||||
|
||||
pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
|
||||
let b: [u8; SIZEOF_CONTROLDATA];
|
||||
|
||||
b = unsafe { std::mem::transmute::<ControlFileData, [u8; SIZEOF_CONTROLDATA]>(controlfile) };
|
||||
|
||||
// Recompute the CRC
|
||||
let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
|
||||
data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
let newcrc = crc32c::crc32c(&data_without_crc);
|
||||
|
||||
let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize);
|
||||
|
||||
buf.extend_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
buf.extend_from_slice(&newcrc.to_ne_bytes());
|
||||
// Fill the rest of the control file with zeros.
|
||||
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
|
||||
|
||||
buf.into()
|
||||
}
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: af9c507616...a71b5c24eb
@@ -7,10 +7,14 @@ edition = "2018"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
crossbeam-channel = "0.5.0"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3.13"
|
||||
lazy_static = "1.4.0"
|
||||
slog-stdlog = "4.1.0"
|
||||
slog-async = "2.6.0"
|
||||
@@ -19,15 +23,16 @@ slog-term = "2.8.0"
|
||||
slog = "2.7.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
termion = "1.5.6"
|
||||
tui = "0.14.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres-protocol = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
|
||||
@@ -9,15 +9,17 @@ use std::path::PathBuf;
|
||||
use std::thread;
|
||||
use std::{fs::File, fs::OpenOptions};
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
use slog_scope;
|
||||
use slog_stdlog;
|
||||
|
||||
use walkeeper::wal_service;
|
||||
use walkeeper::WalAcceptorConf;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
fn main() -> Result<(), io::Error> {
|
||||
let arg_matches = App::new("Zenith wal_acceptor")
|
||||
.about("Store WAL stream to local file system and push it to WAL receivers")
|
||||
.arg(
|
||||
@@ -27,13 +29,6 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Path to the WAL acceptor data directory"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("systemid")
|
||||
.long("systemid")
|
||||
.takes_value(true)
|
||||
.required(true)
|
||||
.help("PostgreSQL system id, from pg_control"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
.short("l")
|
||||
@@ -64,23 +59,16 @@ fn main() -> Result<()> {
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let systemid_str = arg_matches.value_of("systemid").unwrap();
|
||||
let systemid: u64 = systemid_str.parse()?;
|
||||
|
||||
let mut conf = WalAcceptorConf {
|
||||
data_dir: PathBuf::from("./"),
|
||||
systemid,
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
pageserver_addr: None,
|
||||
listen_addr: "127.0.0.1:5454".parse()?,
|
||||
listen_addr: "127.0.0.1:5454".parse().unwrap(),
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
conf.data_dir = PathBuf::from(dir);
|
||||
|
||||
// change into the data directory.
|
||||
std::env::set_current_dir(&conf.data_dir)?;
|
||||
}
|
||||
|
||||
if arg_matches.is_present("no-sync") {
|
||||
@@ -102,9 +90,9 @@ fn main() -> Result<()> {
|
||||
start_wal_acceptor(conf)
|
||||
}
|
||||
|
||||
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> {
|
||||
// Initialize logger
|
||||
let _scope_guard = init_logging(&conf)?;
|
||||
let _scope_guard = init_logging(&conf);
|
||||
let _log_guard = slog_stdlog::init().unwrap();
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
@@ -113,20 +101,20 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
// that we will see any accidental manual fpritf's or backtraces.
|
||||
let stdout = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open("wal_acceptor.log")
|
||||
.open(conf.data_dir.join("wal_acceptor.log"))
|
||||
.unwrap();
|
||||
let stderr = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open("wal_acceptor.log")
|
||||
.open(conf.data_dir.join("wal_acceptor.log"))
|
||||
.unwrap();
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file("wal_acceptor.pid")
|
||||
.pid_file(conf.data_dir.join("wal_acceptor.pid"))
|
||||
.working_directory(Path::new("."))
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
@@ -153,24 +141,20 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(conf: &WalAcceptorConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
fn init_logging(conf: &WalAcceptorConf) -> slog_scope::GlobalLoggerGuard {
|
||||
if conf.daemonize {
|
||||
let log = conf.data_dir.join("wal_acceptor.log");
|
||||
let log_file = File::create(&log).map_err(|err| {
|
||||
// We failed to initialize logging, so we can't log this message with error!
|
||||
eprintln!("Could not create log file {:?}: {}", log, err);
|
||||
err
|
||||
})?;
|
||||
let log_file = File::create(log).unwrap_or_else(|_| panic!("Could not create log file"));
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
slog_scope::set_global_logger(logger)
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build().fuse();
|
||||
let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
return slog_scope::set_global_logger(logger);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,13 +4,12 @@ use std::path::PathBuf;
|
||||
|
||||
mod pq_protocol;
|
||||
pub mod wal_service;
|
||||
pub mod xlog_utils;
|
||||
|
||||
use crate::pq_protocol::SystemId;
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WalAcceptorConf {
|
||||
pub data_dir: PathBuf,
|
||||
pub systemid: SystemId,
|
||||
pub daemonize: bool,
|
||||
pub no_sync: bool,
|
||||
pub listen_addr: SocketAddr,
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use pageserver::ZTimelineId;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type SystemId = u64;
|
||||
@@ -39,7 +37,7 @@ pub enum BeMessage<'a> {
|
||||
pub struct FeStartupMessage {
|
||||
pub version: u32,
|
||||
pub kind: StartupRequestCode,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub system_id: SystemId,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -85,33 +83,26 @@ impl FeStartupMessage {
|
||||
let params_str = str::from_utf8(¶ms_bytes).unwrap();
|
||||
let params = params_str.split('\0');
|
||||
let mut options = false;
|
||||
let mut timelineid: Option<ZTimelineId> = None;
|
||||
let mut system_id: u64 = 0;
|
||||
for p in params {
|
||||
if p == "options" {
|
||||
options = true;
|
||||
} else if options {
|
||||
for opt in p.split(' ') {
|
||||
if let Some(ztimelineid_str) = opt.strip_prefix("ztimelineid=") {
|
||||
// FIXME: rethrow parsing error, don't unwrap
|
||||
timelineid = Some(ZTimelineId::from_str(ztimelineid_str).unwrap());
|
||||
if opt.starts_with("system.id=") {
|
||||
system_id = opt[10..].parse::<u64>().unwrap();
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if timelineid.is_none() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"timelineid is required",
|
||||
));
|
||||
}
|
||||
|
||||
buf.advance(len as usize);
|
||||
Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
|
||||
version,
|
||||
kind,
|
||||
timelineid: timelineid.unwrap(),
|
||||
system_id,
|
||||
})))
|
||||
}
|
||||
}
|
||||
@@ -155,20 +146,20 @@ impl<'a> BeMessage<'a> {
|
||||
|
||||
BeMessage::RowDescription(rows) => {
|
||||
buf.put_u8(b'T');
|
||||
|
||||
let mut body = BytesMut::new();
|
||||
body.put_i16(rows.len() as i16); // # of fields
|
||||
let total_len: u32 = rows
|
||||
.iter()
|
||||
.fold(0, |acc, row| acc + row.name.len() as u32 + 3 * (4 + 2));
|
||||
buf.put_u32(4 + 2 + total_len);
|
||||
for row in rows.iter() {
|
||||
body.put_slice(row.name);
|
||||
body.put_i32(0); /* table oid */
|
||||
body.put_i16(0); /* attnum */
|
||||
body.put_u32(row.typoid);
|
||||
body.put_i16(row.typlen);
|
||||
body.put_i32(-1); /* typmod */
|
||||
body.put_i16(0); /* format code */
|
||||
buf.put_i16(row.name.len() as i16);
|
||||
buf.put_slice(row.name);
|
||||
buf.put_i32(0); /* table oid */
|
||||
buf.put_i16(0); /* attnum */
|
||||
buf.put_u32(row.typoid);
|
||||
buf.put_i16(row.typlen);
|
||||
buf.put_i32(-1); /* typmod */
|
||||
buf.put_i16(0); /* format code */
|
||||
}
|
||||
buf.put_i32((4 + body.len()) as i32); // # of bytes, including len field itself
|
||||
buf.put(body);
|
||||
}
|
||||
|
||||
BeMessage::DataRow(vals) => {
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
// receive WAL from wal_proposer and send it to WAL receivers
|
||||
//
|
||||
|
||||
extern crate fs2;
|
||||
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use fs2::FileExt;
|
||||
@@ -29,9 +31,8 @@ use tokio::task;
|
||||
use tokio_postgres::{connect, Error, NoTls};
|
||||
|
||||
use crate::pq_protocol::*;
|
||||
use crate::xlog_utils::*;
|
||||
use crate::WalAcceptorConf;
|
||||
use pageserver::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
|
||||
type FullTransactionId = u64;
|
||||
|
||||
@@ -63,8 +64,7 @@ struct ServerInfo {
|
||||
protocol_version: u32, /* proxy-safekeeper protocol version */
|
||||
pg_version: u32, /* Postgres server version */
|
||||
node_id: NodeId,
|
||||
system_id: SystemId,
|
||||
timeline_id: ZTimelineId, /* Zenith timelineid */
|
||||
system_id: SystemId, /* Postgres system identifier */
|
||||
wal_end: XLogRecPtr,
|
||||
timeline: TimeLineID,
|
||||
wal_seg_size: u32,
|
||||
@@ -146,8 +146,8 @@ struct SharedState {
|
||||
* Database instance (tenant)
|
||||
*/
|
||||
#[derive(Debug)]
|
||||
pub struct Timeline {
|
||||
timelineid: ZTimelineId,
|
||||
pub struct System {
|
||||
id: SystemId,
|
||||
mutex: Mutex<SharedState>,
|
||||
cond: Notify, /* conditional variable used to notify wal senders */
|
||||
}
|
||||
@@ -157,7 +157,7 @@ pub struct Timeline {
|
||||
*/
|
||||
#[derive(Debug)]
|
||||
struct Connection {
|
||||
timeline: Option<Arc<Timeline>>,
|
||||
system: Option<Arc<System>>,
|
||||
stream: TcpStream, /* Postgres connection */
|
||||
inbuf: BytesMut, /* input buffer */
|
||||
outbuf: BytesMut, /* output buffer */
|
||||
@@ -211,7 +211,6 @@ impl Serializer for ServerInfo {
|
||||
buf.put_u32_le(self.pg_version);
|
||||
self.node_id.pack(buf);
|
||||
buf.put_u64_le(self.system_id);
|
||||
buf.put_slice(&self.timeline_id.as_arr());
|
||||
buf.put_u64_le(self.wal_end);
|
||||
buf.put_u32_le(self.timeline);
|
||||
buf.put_u32_le(self.wal_seg_size);
|
||||
@@ -222,7 +221,6 @@ impl Serializer for ServerInfo {
|
||||
pg_version: buf.get_u32_le(),
|
||||
node_id: NodeId::unpack(buf),
|
||||
system_id: buf.get_u64_le(),
|
||||
timeline_id: ZTimelineId::get_from_buf(buf),
|
||||
wal_end: buf.get_u64_le(),
|
||||
timeline: buf.get_u32_le(),
|
||||
wal_seg_size: buf.get_u32_le(),
|
||||
@@ -280,7 +278,6 @@ impl SafeKeeperInfo {
|
||||
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
|
||||
node_id: NodeId { term: 0, uuid: 0 },
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
timeline_id: ZTimelineId::from([0u8; 16]),
|
||||
wal_end: 0,
|
||||
timeline: 0,
|
||||
wal_seg_size: 0,
|
||||
@@ -352,8 +349,7 @@ impl Serializer for SafeKeeperResponse {
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
pub static ref TIMELINES: Mutex<HashMap<ZTimelineId, Arc<Timeline>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
pub static ref SYSTEMS: Mutex<HashMap<SystemId, Arc<System>>> = Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
pub fn thread_main(conf: WalAcceptorConf) {
|
||||
@@ -370,7 +366,7 @@ pub fn thread_main(conf: WalAcceptorConf) {
|
||||
info!("Starting wal acceptor on {}", conf.listen_addr);
|
||||
|
||||
runtime.block_on(async {
|
||||
main_loop(&conf).await.unwrap();
|
||||
let _unused = main_loop(&conf).await;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -393,8 +389,8 @@ async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub fn new(timelineid: ZTimelineId) -> Timeline {
|
||||
impl System {
|
||||
pub fn new(id: SystemId) -> System {
|
||||
let shared_state = SharedState {
|
||||
commit_lsn: 0,
|
||||
info: SafeKeeperInfo::new(),
|
||||
@@ -405,8 +401,8 @@ impl Timeline {
|
||||
catalog_xmin: u64::MAX,
|
||||
},
|
||||
};
|
||||
Timeline {
|
||||
timelineid,
|
||||
System {
|
||||
id: id,
|
||||
mutex: Mutex::new(shared_state),
|
||||
cond: Notify::new(),
|
||||
}
|
||||
@@ -444,26 +440,15 @@ impl Timeline {
|
||||
|
||||
fn get_hs_feedback(&self) -> HotStandbyFeedback {
|
||||
let shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.hs_feedback
|
||||
return shared_state.hs_feedback;
|
||||
}
|
||||
|
||||
// Load and lock control file (prevent running more than one instance of safekeeper)
|
||||
fn load_control_file(&self, conf: &WalAcceptorConf) -> Result<()> {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
|
||||
if shared_state.control_file.is_some() {
|
||||
info!(
|
||||
"control file for timeline {} is already open",
|
||||
self.timelineid
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Load and lock control file (prevent running more than one instance of safekeeper
|
||||
fn load_control_file(&self, conf: &WalAcceptorConf) {
|
||||
let control_file_path = conf
|
||||
.data_dir
|
||||
.join(self.timelineid.to_string())
|
||||
.join(self.id.to_string())
|
||||
.join(CONTROL_FILE_NAME);
|
||||
info!("loading control file {}", control_file_path.display());
|
||||
match OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
@@ -475,13 +460,13 @@ impl Timeline {
|
||||
match file.try_lock_exclusive() {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
io_error!(
|
||||
panic!(
|
||||
"Control file {:?} is locked by some other process: {}",
|
||||
&control_file_path,
|
||||
e
|
||||
&control_file_path, e
|
||||
);
|
||||
}
|
||||
}
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.control_file = Some(file);
|
||||
|
||||
const SIZE: usize = mem::size_of::<SafeKeeperInfo>();
|
||||
@@ -498,13 +483,12 @@ impl Timeline {
|
||||
let my_info = SafeKeeperInfo::unpack(&mut input);
|
||||
|
||||
if my_info.magic != SK_MAGIC {
|
||||
io_error!("Invalid control file magic: {}", my_info.magic);
|
||||
panic!("Invalid control file magic: {}", my_info.magic);
|
||||
}
|
||||
if my_info.format_version != SK_FORMAT_VERSION {
|
||||
io_error!(
|
||||
panic!(
|
||||
"Incompatible format version: {} vs. {}",
|
||||
my_info.format_version,
|
||||
SK_FORMAT_VERSION
|
||||
my_info.format_version, SK_FORMAT_VERSION
|
||||
);
|
||||
}
|
||||
shared_state.info = my_info;
|
||||
@@ -517,7 +501,6 @@ impl Timeline {
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_control_file(&self, sync: bool) -> Result<()> {
|
||||
@@ -527,7 +510,7 @@ impl Timeline {
|
||||
|
||||
let file = shared_state.control_file.as_mut().unwrap();
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.write_all(&buf[..])?;
|
||||
file.write_all(&mut buf[..])?;
|
||||
if sync {
|
||||
file.sync_all()?;
|
||||
}
|
||||
@@ -538,7 +521,7 @@ impl Timeline {
|
||||
impl Connection {
|
||||
pub fn new(socket: TcpStream, conf: &WalAcceptorConf) -> Connection {
|
||||
Connection {
|
||||
timeline: None,
|
||||
system: None,
|
||||
stream: socket,
|
||||
inbuf: BytesMut::with_capacity(10 * 1024),
|
||||
outbuf: BytesMut::with_capacity(10 * 1024),
|
||||
@@ -547,14 +530,14 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
|
||||
fn timeline(&self) -> Arc<Timeline> {
|
||||
self.timeline.as_ref().unwrap().clone()
|
||||
fn system(&self) -> Arc<System> {
|
||||
self.system.as_ref().unwrap().clone()
|
||||
}
|
||||
|
||||
async fn run(&mut self) -> Result<()> {
|
||||
self.inbuf.resize(4, 0u8);
|
||||
self.stream.read_exact(&mut self.inbuf[0..4]).await?;
|
||||
let startup_pkg_len = BigEndian::read_u32(&self.inbuf[0..4]);
|
||||
let startup_pkg_len = BigEndian::read_u32(&mut self.inbuf[0..4]);
|
||||
if startup_pkg_len == 0 {
|
||||
self.receive_wal().await?; // internal protocol between wal_proposer and wal_acceptor
|
||||
} else {
|
||||
@@ -580,15 +563,10 @@ impl Connection {
|
||||
"no_user",
|
||||
);
|
||||
let callme = format!(
|
||||
"callmemaybe {} host={} port={} options='-c ztimelineid={}'",
|
||||
self.timeline().timelineid,
|
||||
"callmemaybe host={} port={} replication=1 options='-c system.id={}'",
|
||||
self.conf.listen_addr.ip(),
|
||||
self.conf.listen_addr.port(),
|
||||
self.timeline().timelineid
|
||||
);
|
||||
info!(
|
||||
"requesting page server to connect to us: start {} {}",
|
||||
ps_connstr, callme
|
||||
self.system().get_info().server.system_id,
|
||||
);
|
||||
let (client, connection) = connect(&ps_connstr, NoTls).await?;
|
||||
|
||||
@@ -604,14 +582,22 @@ impl Connection {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_timeline(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
let mut timelines = TIMELINES.lock().unwrap();
|
||||
if !timelines.contains_key(&timelineid) {
|
||||
info!("creating timeline dir {}", timelineid);
|
||||
fs::create_dir_all(timelineid.to_string())?;
|
||||
timelines.insert(timelineid, Arc::new(Timeline::new(timelineid)));
|
||||
fn set_system(&mut self, id: SystemId) -> Result<()> {
|
||||
let mut systems = SYSTEMS.lock().unwrap();
|
||||
if id == 0 {
|
||||
// non-multitenant configuration: just a single instance
|
||||
if let Some(system) = systems.values().next() {
|
||||
self.system = Some(system.clone());
|
||||
return Ok(());
|
||||
}
|
||||
io_error!("No active instances");
|
||||
}
|
||||
self.timeline = Some(timelines.get(&timelineid).unwrap().clone());
|
||||
if !systems.contains_key(&id) {
|
||||
let system_dir = self.conf.data_dir.join(id.to_string());
|
||||
fs::create_dir_all(system_dir)?;
|
||||
systems.insert(id, Arc::new(System::new(id)));
|
||||
}
|
||||
self.system = Some(systems.get(&id).unwrap().clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -620,16 +606,14 @@ impl Connection {
|
||||
// Receive information about server
|
||||
let server_info = self.read_req::<ServerInfo>().await?;
|
||||
info!(
|
||||
"Start handshake with wal_proposer {} sysid {} timeline {}",
|
||||
"Start handshake with wal_proposer {} sysid {}",
|
||||
self.stream.peer_addr()?,
|
||||
server_info.system_id,
|
||||
server_info.timeline_id,
|
||||
server_info.system_id
|
||||
);
|
||||
// FIXME: also check that the system identifier matches
|
||||
self.set_timeline(server_info.timeline_id)?;
|
||||
self.timeline().load_control_file(&self.conf)?;
|
||||
self.set_system(server_info.system_id)?;
|
||||
self.system().load_control_file(&self.conf);
|
||||
|
||||
let mut my_info = self.timeline().get_info();
|
||||
let mut my_info = self.system().get_info();
|
||||
|
||||
/* Check protocol compatibility */
|
||||
if server_info.protocol_version != SK_PROTOCOL_VERSION {
|
||||
@@ -678,9 +662,9 @@ impl Connection {
|
||||
);
|
||||
}
|
||||
my_info.server.node_id = prop.node_id;
|
||||
self.timeline().set_info(&my_info);
|
||||
self.system().set_info(&my_info);
|
||||
/* Need to persist our vote first */
|
||||
self.timeline().save_control_file(true)?;
|
||||
self.system().save_control_file(true)?;
|
||||
|
||||
let mut flushed_restart_lsn: XLogRecPtr = 0;
|
||||
let wal_seg_size = server_info.wal_seg_size as usize;
|
||||
@@ -694,13 +678,12 @@ impl Connection {
|
||||
// Since replication in postgres is initiated by the receiver, we should use the callme mechanism
|
||||
if let Err(e) = self.request_callback().await {
|
||||
// Do not treat it as a fatal error; continue working
|
||||
// FIXME: we should retry after a while...
|
||||
error!("Failed to send callme request to pageserver: {}", e);
|
||||
}
|
||||
|
||||
info!(
|
||||
"Start streaming from timeline {} address {:?}",
|
||||
server_info.timeline_id,
|
||||
"Start streaming from server {} address {:?}",
|
||||
server_info.system_id,
|
||||
self.stream.peer_addr()?
|
||||
);
|
||||
|
||||
@@ -722,15 +705,6 @@ impl Connection {
|
||||
let rec_size = (end_pos - start_pos) as usize;
|
||||
assert!(rec_size <= MAX_SEND_SIZE);
|
||||
|
||||
debug!(
|
||||
"received for {} bytes between {:X}/{:X} and {:X}/{:X}",
|
||||
rec_size,
|
||||
start_pos >> 32,
|
||||
start_pos & 0xffffffff,
|
||||
end_pos >> 32,
|
||||
end_pos & 0xffffffff
|
||||
);
|
||||
|
||||
/* Receive message body */
|
||||
self.inbuf.resize(rec_size, 0u8);
|
||||
self.stream.read_exact(&mut self.inbuf[0..rec_size]).await?;
|
||||
@@ -761,7 +735,7 @@ impl Connection {
|
||||
* when restart_lsn delta exceeds WAL segment size.
|
||||
*/
|
||||
sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn;
|
||||
self.timeline().save_control_file(sync_control_file)?;
|
||||
self.system().save_control_file(sync_control_file)?;
|
||||
|
||||
if sync_control_file {
|
||||
flushed_restart_lsn = my_info.restart_lsn;
|
||||
@@ -772,7 +746,7 @@ impl Connection {
|
||||
let resp = SafeKeeperResponse {
|
||||
epoch: my_info.epoch,
|
||||
flush_lsn: end_pos,
|
||||
hs_feedback: self.timeline().get_hs_feedback(),
|
||||
hs_feedback: self.system().get_hs_feedback(),
|
||||
};
|
||||
self.start_sending();
|
||||
resp.pack(&mut self.outbuf);
|
||||
@@ -782,7 +756,7 @@ impl Connection {
|
||||
* Ping wal sender that new data is available.
|
||||
* FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper.
|
||||
*/
|
||||
self.timeline()
|
||||
self.system()
|
||||
.notify_wal_senders(min(req.commit_lsn, end_pos));
|
||||
}
|
||||
Ok(())
|
||||
@@ -833,7 +807,7 @@ impl Connection {
|
||||
}
|
||||
|
||||
//
|
||||
// Send WAL to replica or WAL receiver using standard libpq replication protocol
|
||||
// Send WAL to replica or WAL sender using standard libpq replication protocol
|
||||
//
|
||||
async fn send_wal(&mut self) -> Result<()> {
|
||||
info!("WAL sender to {:?} is started", self.stream.peer_addr()?);
|
||||
@@ -854,7 +828,7 @@ impl Connection {
|
||||
BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery);
|
||||
self.send().await?;
|
||||
self.init_done = true;
|
||||
self.set_timeline(m.timelineid)?;
|
||||
self.set_system(m.system_id)?;
|
||||
}
|
||||
StartupRequestCode::Cancel => return Ok(()),
|
||||
}
|
||||
@@ -887,7 +861,7 @@ impl Connection {
|
||||
let (start_pos, timeline) = self.find_end_of_wal(false);
|
||||
let lsn = format!("{:X}/{:>08X}", (start_pos >> 32) as u32, start_pos as u32);
|
||||
let tli = timeline.to_string();
|
||||
let sysid = self.timeline().get_info().server.system_id.to_string();
|
||||
let sysid = self.system().get_info().server.system_id.to_string();
|
||||
let lsn_bytes = lsn.as_bytes();
|
||||
let tli_bytes = tli.as_bytes();
|
||||
let sysid_bytes = sysid.as_bytes();
|
||||
@@ -919,11 +893,11 @@ impl Connection {
|
||||
);
|
||||
BeMessage::write(
|
||||
&mut self.outbuf,
|
||||
&BeMessage::DataRow(&[Some(sysid_bytes), Some(tli_bytes), Some(lsn_bytes), None]),
|
||||
&BeMessage::DataRow(&[Some(lsn_bytes), Some(tli_bytes), Some(sysid_bytes), None]),
|
||||
);
|
||||
BeMessage::write(
|
||||
&mut self.outbuf,
|
||||
&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM\0"),
|
||||
&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"),
|
||||
);
|
||||
BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery);
|
||||
self.send().await?;
|
||||
@@ -943,7 +917,7 @@ impl Connection {
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let wal_seg_size = self.timeline().get_info().server.wal_seg_size as usize;
|
||||
let wal_seg_size = self.system().get_info().server.wal_seg_size as usize;
|
||||
if wal_seg_size == 0 {
|
||||
io_error!("Can not start replication before connecting to wal_proposer");
|
||||
}
|
||||
@@ -961,6 +935,15 @@ impl Connection {
|
||||
BeMessage::write(&mut self.outbuf, &BeMessage::Copy);
|
||||
self.send().await?;
|
||||
|
||||
/*
|
||||
* Always start streaming at the beginning of a segment
|
||||
*
|
||||
* FIXME: It is common practice to start streaming at the beginning of
|
||||
* the segment, but it should be up to the client to decide that. We
|
||||
* shouldn't enforce that here.
|
||||
*/
|
||||
start_pos -= XLogSegmentOffset(start_pos, wal_seg_size) as u64;
|
||||
|
||||
let mut end_pos: XLogRecPtr;
|
||||
let mut commit_lsn: XLogRecPtr;
|
||||
let mut wal_file: Option<File> = None;
|
||||
@@ -977,18 +960,19 @@ impl Connection {
|
||||
end_pos = stop_pos;
|
||||
} else {
|
||||
/* normal mode */
|
||||
let timeline = self.timeline();
|
||||
loop {
|
||||
// Rust doesn't allow to grab async result from mutex scope
|
||||
let system = self.system();
|
||||
let notified = system.cond.notified();
|
||||
{
|
||||
let shared_state = timeline.mutex.lock().unwrap();
|
||||
let shared_state = system.mutex.lock().unwrap();
|
||||
commit_lsn = shared_state.commit_lsn;
|
||||
if start_pos < commit_lsn {
|
||||
end_pos = commit_lsn;
|
||||
break;
|
||||
}
|
||||
}
|
||||
timeline.cond.notified().await;
|
||||
notified.await;
|
||||
}
|
||||
}
|
||||
if end_pos == END_REPLICATION_MARKER {
|
||||
@@ -997,15 +981,15 @@ impl Connection {
|
||||
// Try to fetch replica's feedback
|
||||
match self.stream.try_read_buf(&mut self.inbuf) {
|
||||
Ok(0) => break,
|
||||
Ok(_) => {
|
||||
if let Some(FeMessage::CopyData(m)) = self.parse_message()? {
|
||||
self.timeline()
|
||||
.add_hs_feedback(HotStandbyFeedback::parse(&m.body))
|
||||
}
|
||||
}
|
||||
Ok(_) => match self.parse_message()? {
|
||||
Some(FeMessage::CopyData(m)) => self
|
||||
.system()
|
||||
.add_hs_feedback(HotStandbyFeedback::parse(&m.body)),
|
||||
_ => {}
|
||||
},
|
||||
Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1020,7 +1004,7 @@ impl Connection {
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline().timelineid.to_string())
|
||||
.join(self.system().id.to_string())
|
||||
.join(wal_file_name.clone() + ".partial");
|
||||
if let Ok(opened_file) = File::open(&wal_file_path) {
|
||||
file = opened_file;
|
||||
@@ -1028,30 +1012,21 @@ impl Connection {
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline().timelineid.to_string())
|
||||
.join(self.system().id.to_string())
|
||||
.join(wal_file_name);
|
||||
match File::open(&wal_file_path) {
|
||||
Ok(opened_file) => file = opened_file,
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
|
||||
|
||||
// How much to read and send in message? We cannot cross the WAL file
|
||||
// boundary, and we don't want send more than MAX_SEND_SIZE.
|
||||
let send_size = (end_pos - start_pos) as usize;
|
||||
let send_size = min(send_size, wal_seg_size - xlogoff);
|
||||
let send_size = min(send_size, MAX_SEND_SIZE);
|
||||
|
||||
let send_size = min((end_pos - start_pos) as usize, MAX_SEND_SIZE);
|
||||
let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size;
|
||||
let data_start = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE;
|
||||
let data_end = data_start + send_size;
|
||||
|
||||
file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
file.read_exact(&mut self.outbuf[data_start..data_end])?;
|
||||
self.outbuf[0] = b'd';
|
||||
BigEndian::write_u32(
|
||||
@@ -1066,12 +1041,6 @@ impl Connection {
|
||||
self.stream.write_all(&self.outbuf[0..msg_size]).await?;
|
||||
start_pos += send_size as u64;
|
||||
|
||||
debug!(
|
||||
"Sent WAL to page server up to {:X}/{:>08X}",
|
||||
(end_pos >> 32) as u32,
|
||||
end_pos as u32
|
||||
);
|
||||
|
||||
if XLogSegmentOffset(start_pos, wal_seg_size) != 0 {
|
||||
wal_file = Some(file);
|
||||
}
|
||||
@@ -1102,7 +1071,7 @@ impl Connection {
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
|
||||
@@ -1126,12 +1095,12 @@ impl Connection {
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline().timelineid.to_string())
|
||||
.join(self.system().id.to_string())
|
||||
.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline().timelineid.to_string())
|
||||
.join(self.system().id.to_string())
|
||||
.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
@@ -1161,7 +1130,7 @@ impl Connection {
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1194,7 +1163,7 @@ impl Connection {
|
||||
fn find_end_of_wal(&self, precise: bool) -> (XLogRecPtr, TimeLineID) {
|
||||
find_end_of_wal(
|
||||
&self.conf.data_dir,
|
||||
self.timeline().get_info().server.wal_seg_size as usize,
|
||||
self.system().get_info().server.wal_seg_size as usize,
|
||||
precise,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,19 +1,10 @@
|
||||
//
|
||||
// This file contains common utilities for dealing with PostgreSQL WAL files and
|
||||
// LSNs.
|
||||
//
|
||||
// Many of these functions have been copied from PostgreSQL, and rewritten in
|
||||
// Rust. That's why they don't follow the usual Rust naming conventions, they
|
||||
// have been named the same as the corresponding PostgreSQL functions instead.
|
||||
//
|
||||
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::PathBuf;
|
||||
use std::time::SystemTime;
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
@@ -32,17 +23,17 @@ pub type XLogSegNo = u64;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
|
||||
(xlogptr as u32) & (wal_segsz_bytes as u32 - 1)
|
||||
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
|
||||
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
xlogptr / wal_segsz_bytes as u64
|
||||
return xlogptr / wal_segsz_bytes as u64;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -51,7 +42,7 @@ pub fn XLogSegNoOffsetToRecPtr(
|
||||
offset: u32,
|
||||
wal_segsz_bytes: usize,
|
||||
) -> XLogRecPtr {
|
||||
segno * (wal_segsz_bytes as u64) + (offset as u64)
|
||||
return segno * (wal_segsz_bytes as u64) + (offset as u64);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -69,7 +60,7 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
|
||||
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
|
||||
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -79,7 +70,7 @@ pub fn IsXLogFileName(fname: &str) -> bool {
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsPartialXLogFileName(fname: &str) -> bool {
|
||||
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
|
||||
return fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]);
|
||||
}
|
||||
|
||||
pub fn get_current_timestamp() -> TimestampTz {
|
||||
@@ -98,7 +89,7 @@ pub fn get_current_timestamp() -> TimestampTz {
|
||||
}
|
||||
|
||||
fn find_end_of_wal_segment(
|
||||
data_dir: &Path,
|
||||
data_dir: &PathBuf,
|
||||
segno: XLogSegNo,
|
||||
tli: TimeLineID,
|
||||
wal_seg_size: usize,
|
||||
@@ -190,14 +181,11 @@ fn find_end_of_wal_segment(
|
||||
}
|
||||
}
|
||||
}
|
||||
last_valid_rec_pos as u32
|
||||
return last_valid_rec_pos as u32;
|
||||
}
|
||||
|
||||
///
|
||||
/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL.
|
||||
///
|
||||
pub fn find_end_of_wal(
|
||||
data_dir: &Path,
|
||||
data_dir: &PathBuf,
|
||||
wal_seg_size: usize,
|
||||
precise: bool,
|
||||
) -> (XLogRecPtr, TimeLineID) {
|
||||
@@ -249,7 +237,7 @@ pub fn find_end_of_wal(
|
||||
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
|
||||
return (high_ptr, high_tli);
|
||||
}
|
||||
(0, 0)
|
||||
return (0, 0);
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
@@ -1,16 +0,0 @@
|
||||
[package]
|
||||
name = "zenith"
|
||||
version = "0.1.0"
|
||||
authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
clap = "2.33.0"
|
||||
anyhow = "1.0"
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
@@ -1,333 +0,0 @@
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{anyhow, bail};
|
||||
use clap::{App, Arg, ArgMatches, SubCommand};
|
||||
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use control_plane::storage::PageServerNode;
|
||||
use control_plane::{compute::ComputeControlPlane, local_env, storage};
|
||||
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
/// Returns the path of the Zenith repository to operate on.
///
/// The `ZENITH_REPO_DIR` environment variable wins when it is set;
/// otherwise the relative path `.zenith` is used.
fn zenith_repo_dir() -> PathBuf {
    // Convert the OsString directly: going through `to_str().unwrap()` would
    // panic on a path that is not valid UTF-8.
    std::env::var_os("ZENITH_REPO_DIR")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from(".zenith"))
}
|
||||
|
||||
// Main entry point for the 'zenith' CLI utility
|
||||
//
|
||||
// This utility can used to work with a local zenith repository.
|
||||
// In order to run queries in it, you need to launch the page server,
|
||||
// and a compute node against the page server
|
||||
fn main() -> Result<()> {
|
||||
let name_arg = Arg::with_name("NAME")
|
||||
.short("n")
|
||||
.index(1)
|
||||
.help("name of this postgres instance")
|
||||
.required(true);
|
||||
let matches = App::new("zenith")
|
||||
.about("Zenith CLI")
|
||||
.subcommand(
|
||||
SubCommand::with_name("init")
|
||||
.about("Initialize a new Zenith repository in current directory"),
|
||||
)
|
||||
.subcommand(
|
||||
SubCommand::with_name("branch")
|
||||
.about("Create a new branch")
|
||||
.arg(Arg::with_name("branchname").required(false).index(1))
|
||||
.arg(Arg::with_name("start-point").required(false).index(2)),
|
||||
)
|
||||
.subcommand(
|
||||
SubCommand::with_name("pageserver")
|
||||
.about("Manage pageserver instance")
|
||||
.subcommand(SubCommand::with_name("status"))
|
||||
.subcommand(SubCommand::with_name("start"))
|
||||
.subcommand(SubCommand::with_name("stop")),
|
||||
)
|
||||
.subcommand(
|
||||
SubCommand::with_name("pg")
|
||||
.about("Manage postgres instances")
|
||||
.subcommand(
|
||||
SubCommand::with_name("create")
|
||||
// .arg(name_arg.clone()
|
||||
// .required(false)
|
||||
// .help("name of this postgres instance (will be pgN if omitted)"))
|
||||
.arg(Arg::with_name("timeline").required(false).index(1)),
|
||||
)
|
||||
.subcommand(SubCommand::with_name("list"))
|
||||
.subcommand(SubCommand::with_name("start").arg(name_arg.clone()))
|
||||
.subcommand(SubCommand::with_name("stop").arg(name_arg.clone()))
|
||||
.subcommand(SubCommand::with_name("destroy").arg(name_arg.clone())),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
// handle init separately and exit
|
||||
if let ("init", Some(sub_args)) = matches.subcommand() {
|
||||
run_init_cmd(sub_args.clone())?;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// all other commands would need config
|
||||
|
||||
let repopath = zenith_repo_dir();
|
||||
if !repopath.exists() {
|
||||
bail!(
|
||||
"Zenith repository does not exists in {}.\n\
|
||||
Set ZENITH_REPO_DIR or initialize a new repository with 'zenith init'",
|
||||
repopath.display()
|
||||
);
|
||||
}
|
||||
// TODO: check that it looks like a zenith repository
|
||||
let env = match local_env::load_config(&repopath) {
|
||||
Ok(conf) => conf,
|
||||
Err(e) => {
|
||||
eprintln!("Error loading config from {}: {}", repopath.display(), e);
|
||||
exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
match matches.subcommand() {
|
||||
("init", Some(_)) => {
|
||||
panic!() /* Should not happen. Init was handled before */
|
||||
}
|
||||
|
||||
("branch", Some(sub_args)) => run_branch_cmd(&env, sub_args.clone())?,
|
||||
("pageserver", Some(sub_args)) => run_pageserver_cmd(&env, sub_args.clone())?,
|
||||
|
||||
("start", Some(_sub_m)) => {
|
||||
let pageserver = storage::PageServerNode::from_env(&env);
|
||||
|
||||
if let Err(e) = pageserver.start() {
|
||||
eprintln!("pageserver start: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
("stop", Some(_sub_m)) => {
|
||||
let pageserver = storage::PageServerNode::from_env(&env);
|
||||
if let Err(e) = pageserver.stop() {
|
||||
eprintln!("pageserver stop: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
("status", Some(_sub_m)) => {}
|
||||
|
||||
("pg", Some(pg_match)) => {
|
||||
if let Err(e) = handle_pg(pg_match, &env) {
|
||||
eprintln!("pg operation failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn run_pageserver_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> {
|
||||
match args.subcommand() {
|
||||
("status", Some(_sub_m)) => {
|
||||
todo!();
|
||||
}
|
||||
("start", Some(_sub_m)) => {
|
||||
let psnode = PageServerNode::from_env(local_env);
|
||||
psnode.start()?;
|
||||
println!("Page server started");
|
||||
}
|
||||
("stop", Some(_sub_m)) => {
|
||||
todo!();
|
||||
}
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Peek into the repository, to grab the timeline ID of given branch
|
||||
pub fn get_branch_timeline(repopath: &Path, branchname: &str) -> ZTimelineId {
|
||||
let branchpath = repopath.join("refs/branches/".to_owned() + branchname);
|
||||
|
||||
ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
|
||||
}
|
||||
|
||||
fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
|
||||
match pg_match.subcommand() {
|
||||
("create", Some(sub_m)) => {
|
||||
// FIXME: cheat and resolve the timeline by peeking into the
|
||||
// repository. In reality, when you're launching a compute node
|
||||
// against a possibly-remote page server, we wouldn't know what
|
||||
// branches exist in the remote repository. Or would we require
|
||||
// that you "zenith fetch" them into a local repoitory first?
|
||||
let timeline_arg = sub_m.value_of("timeline").unwrap_or("main");
|
||||
let timeline = get_branch_timeline(&env.repo_path, timeline_arg);
|
||||
|
||||
println!("Initializing Postgres on timeline {}...", timeline);
|
||||
|
||||
cplane.new_node(timeline)?;
|
||||
}
|
||||
("list", Some(_sub_m)) => {
|
||||
println!("NODE\tADDRESS\tSTATUS");
|
||||
for (node_name, node) in cplane.nodes.iter() {
|
||||
println!("{}\t{}\t{}", node_name, node.address, node.status());
|
||||
}
|
||||
}
|
||||
("start", Some(sub_m)) => {
|
||||
let name = sub_m.value_of("NAME").unwrap();
|
||||
let node = cplane
|
||||
.nodes
|
||||
.get(name)
|
||||
.ok_or_else(|| anyhow!("postgres {} is not found", name))?;
|
||||
node.start()?;
|
||||
}
|
||||
("stop", Some(sub_m)) => {
|
||||
let name = sub_m.value_of("NAME").unwrap();
|
||||
let node = cplane
|
||||
.nodes
|
||||
.get(name)
|
||||
.ok_or_else(|| anyhow!("postgres {} is not found", name))?;
|
||||
node.stop()?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// "zenith init" - Initialize a new Zenith repository in current dir
|
||||
fn run_init_cmd(_args: ArgMatches) -> Result<()> {
|
||||
local_env::init()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// handle "zenith branch" subcommand
|
||||
fn run_branch_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> {
|
||||
let repopath = local_env.repo_path.to_str().unwrap();
|
||||
|
||||
if let Some(branchname) = args.value_of("branchname") {
|
||||
if PathBuf::from(format!("{}/refs/branches/{}", repopath, branchname)).exists() {
|
||||
anyhow::bail!("branch {} already exists", branchname);
|
||||
}
|
||||
|
||||
if let Some(startpoint_str) = args.value_of("start-point") {
|
||||
let mut startpoint = parse_point_in_time(startpoint_str)?;
|
||||
|
||||
if startpoint.lsn == 0 {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = local_env::find_end_of_wal(local_env, startpoint.timelineid)?;
|
||||
|
||||
println!(
|
||||
"branching at end of WAL: {:X}/{:X}",
|
||||
end_of_wal >> 32,
|
||||
end_of_wal & 0xffffffff
|
||||
);
|
||||
|
||||
startpoint.lsn = end_of_wal;
|
||||
}
|
||||
|
||||
return local_env::create_branch(local_env, branchname, startpoint);
|
||||
} else {
|
||||
panic!("Missing start-point");
|
||||
}
|
||||
} else {
|
||||
// No arguments, list branches
|
||||
list_branches()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn list_branches() -> Result<()> {
|
||||
// list branches
|
||||
let paths = fs::read_dir(zenith_repo_dir().join("refs").join("branches"))?;
|
||||
|
||||
for path in paths {
|
||||
println!(" {}", path?.file_name().to_str().unwrap());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Parse user-given string that represents a point-in-time.
|
||||
//
|
||||
// We support multiple variants:
|
||||
//
|
||||
// Raw timeline id in hex, meaning the end of that timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d
|
||||
//
|
||||
// A specific LSN on a timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
|
||||
//
|
||||
// Same, with a human-friendly branch name:
|
||||
// main
|
||||
// main@2/15D3DD8
|
||||
//
|
||||
// Human-friendly tag name:
|
||||
// mytag
|
||||
//
|
||||
//
|
||||
fn parse_point_in_time(s: &str) -> Result<local_env::PointInTime> {
|
||||
let mut strings = s.split('@');
|
||||
let name = strings.next().unwrap();
|
||||
|
||||
let lsn: Option<u64>;
|
||||
if let Some(lsnstr) = strings.next() {
|
||||
let mut s = lsnstr.split('/');
|
||||
let lsn_hi: u64 = s
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("invalid LSN in point-in-time specification"))?
|
||||
.parse()?;
|
||||
let lsn_lo: u64 = s
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("invalid LSN in point-in-time specification"))?
|
||||
.parse()?;
|
||||
lsn = Some(lsn_hi << 32 | lsn_lo);
|
||||
} else {
|
||||
lsn = None
|
||||
}
|
||||
|
||||
// Check if it's a tag
|
||||
if lsn.is_none() {
|
||||
let tagpath = zenith_repo_dir().join("refs").join("tags").join(name);
|
||||
if tagpath.exists() {
|
||||
let pointstr = fs::read_to_string(tagpath)?;
|
||||
|
||||
return parse_point_in_time(&pointstr);
|
||||
}
|
||||
}
|
||||
// Check if it's a branch
|
||||
// Check if it's branch @ LSN
|
||||
let branchpath = zenith_repo_dir().join("refs").join("branches").join(name);
|
||||
if branchpath.exists() {
|
||||
let pointstr = fs::read_to_string(branchpath)?;
|
||||
|
||||
let mut result = parse_point_in_time(&pointstr)?;
|
||||
|
||||
result.lsn = lsn.unwrap_or(0);
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
// Check if it's a timelineid
|
||||
// Check if it's timelineid @ LSN
|
||||
let tlipath = zenith_repo_dir().join("timelines").join(name);
|
||||
if tlipath.exists() {
|
||||
let result = local_env::PointInTime {
|
||||
timelineid: ZTimelineId::from_str(name)?,
|
||||
lsn: lsn.unwrap_or(0),
|
||||
};
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
panic!("could not parse point-in-time {}", s);
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
[package]
|
||||
name = "zenith_utils"
|
||||
version = "0.1.0"
|
||||
authors = ["Eric Seppanen <eric@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
tokio = { version = "1.5", features = ["sync", "time" ] }
|
||||
thiserror = "1"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = { version = "1.5", features = ["macros", "rt"] }
|
||||
@@ -1,4 +0,0 @@
|
||||
//! zenith_utils is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
pub mod seqwait;
|
||||
@@ -1,199 +0,0 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::mem;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::watch::{channel, Receiver, Sender};
|
||||
use tokio::time::timeout;
|
||||
|
||||
/// An error happened while waiting for a number
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
#[error("SeqWaitError")]
|
||||
pub enum SeqWaitError {
|
||||
/// The wait timeout was reached
|
||||
Timeout,
|
||||
/// [`SeqWait::shutdown`] was called
|
||||
Shutdown,
|
||||
}
|
||||
|
||||
/// Internal components of a `SeqWait`
|
||||
struct SeqWaitInt {
|
||||
waiters: BTreeMap<u64, (Sender<()>, Receiver<()>)>,
|
||||
current: u64,
|
||||
shutdown: bool,
|
||||
}
|
||||
|
||||
/// A tool for waiting on a sequence number
|
||||
///
|
||||
/// This provides a way to await the arrival of a number.
|
||||
/// As soon as the number arrives by another caller calling
|
||||
/// [`advance`], then the waiter will be woken up.
|
||||
///
|
||||
/// This implementation takes a blocking Mutex on both [`wait_for`]
|
||||
/// and [`advance`], meaning there may be unexpected executor blocking
|
||||
/// due to thread scheduling unfairness. There are probably better
|
||||
/// implementations, but we can probably live with this for now.
|
||||
///
|
||||
/// [`wait_for`]: SeqWait::wait_for
|
||||
/// [`advance`]: SeqWait::advance
|
||||
///
|
||||
pub struct SeqWait {
|
||||
internal: Mutex<SeqWaitInt>,
|
||||
}
|
||||
|
||||
impl SeqWait {
|
||||
/// Create a new `SeqWait`, initialized to a particular number
|
||||
pub fn new(starting_num: u64) -> Self {
|
||||
let internal = SeqWaitInt {
|
||||
waiters: BTreeMap::new(),
|
||||
current: starting_num,
|
||||
shutdown: false,
|
||||
};
|
||||
SeqWait {
|
||||
internal: Mutex::new(internal),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shut down a `SeqWait`, causing all waiters (present and
|
||||
/// future) to return an error.
|
||||
pub fn shutdown(&self) {
|
||||
let waiters = {
|
||||
// Prevent new waiters; wake all those that exist.
|
||||
// Wake everyone with an error.
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
// This will steal the entire waiters map.
|
||||
// When we drop it all waiters will be woken.
|
||||
mem::take(&mut internal.waiters);
|
||||
|
||||
// Drop the lock as we exit this scope.
|
||||
};
|
||||
|
||||
// When we drop the waiters list, each Receiver will
|
||||
// be woken with an error.
|
||||
// This drop doesn't need to be explicit; it's done
|
||||
// here to make it easier to read the code and understand
|
||||
// the order of events.
|
||||
drop(waiters);
|
||||
}
|
||||
|
||||
/// Wait for a number to arrive
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
pub async fn wait_for(&self, num: u64) -> Result<(), SeqWaitError> {
|
||||
let mut rx = {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
if internal.current >= num {
|
||||
return Ok(());
|
||||
}
|
||||
if internal.shutdown {
|
||||
return Err(SeqWaitError::Shutdown);
|
||||
}
|
||||
|
||||
// If we already have a channel for waiting on this number, reuse it.
|
||||
if let Some((_, rx)) = internal.waiters.get_mut(&num) {
|
||||
// an Err from changed() means the sender was dropped.
|
||||
rx.clone()
|
||||
} else {
|
||||
// Create a new channel.
|
||||
let (tx, rx) = channel(());
|
||||
internal.waiters.insert(num, (tx, rx.clone()));
|
||||
rx
|
||||
}
|
||||
// Drop the lock as we exit this scope.
|
||||
};
|
||||
rx.changed().await.map_err(|_| SeqWaitError::Shutdown)
|
||||
}
|
||||
|
||||
/// Wait for a number to arrive
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
///
|
||||
/// If that hasn't happened after the specified timeout duration,
|
||||
/// [`SeqWaitError::Timeout`] will be returned.
|
||||
pub async fn wait_for_timeout(
|
||||
&self,
|
||||
num: u64,
|
||||
timeout_duration: Duration,
|
||||
) -> Result<(), SeqWaitError> {
|
||||
timeout(timeout_duration, self.wait_for(num))
|
||||
.await
|
||||
.unwrap_or(Err(SeqWaitError::Timeout))
|
||||
}
|
||||
|
||||
/// Announce a new number has arrived
|
||||
///
|
||||
/// All waiters at this value or below will be woken.
|
||||
///
|
||||
/// `advance` will panic if you send it a lower number than
|
||||
/// a previous call.
|
||||
pub fn advance(&self, num: u64) {
|
||||
let wake_these = {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
if internal.current > num {
|
||||
panic!(
|
||||
"tried to advance backwards, from {} to {}",
|
||||
internal.current, num
|
||||
);
|
||||
}
|
||||
internal.current = num;
|
||||
|
||||
// split_off will give me all the high-numbered waiters,
|
||||
// so split and then swap. Everything at or above (num + 1)
|
||||
// gets to stay.
|
||||
let mut split = internal.waiters.split_off(&(num + 1));
|
||||
std::mem::swap(&mut split, &mut internal.waiters);
|
||||
split
|
||||
};
|
||||
|
||||
for (_wake_num, (tx, _rx)) in wake_these {
|
||||
// This can fail if there are no receivers.
|
||||
// We don't care; discard the error.
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use tokio::time::{sleep, Duration};

    /// Waiters are woken by `advance` and fail once `shutdown` is called.
    #[tokio::test]
    async fn seqwait() {
        let seq = Arc::new(SeqWait::new(0));
        let seq2 = Arc::clone(&seq);
        let seq3 = Arc::clone(&seq);
        let task2 = tokio::spawn(async move {
            seq2.wait_for(42).await.expect("wait_for 42");
            seq2.advance(100);
            seq2.wait_for(999).await.expect_err("no 999");
        });
        let task3 = tokio::spawn(async move {
            seq3.wait_for(42).await.expect("wait_for 42");
            seq3.wait_for(0).await.expect("wait_for 0");
        });
        sleep(Duration::from_secs(1)).await;
        seq.advance(99);
        seq.wait_for(100).await.expect("wait_for 100");
        seq.shutdown();

        // Join the tasks: a panic inside a spawned task only fails the test
        // when its JoinHandle is awaited.
        task2.await.expect("task 2 panicked");
        task3.await.expect("task 3 panicked");
    }

    /// `wait_for_timeout` reports `Timeout` when the number does not arrive.
    #[tokio::test]
    async fn seqwait_timeout() {
        let seq = Arc::new(SeqWait::new(0));
        let seq2 = Arc::clone(&seq);
        let waiter = tokio::spawn(async move {
            let timeout = Duration::from_millis(1);
            let res = seq2.wait_for_timeout(42, timeout).await;
            assert_eq!(res, Err(SeqWaitError::Timeout));
        });
        sleep(Duration::from_secs(1)).await;
        // This will attempt to wake, but nothing will happen
        // because the waiter already dropped its Receiver.
        seq.advance(99);

        // Surface an assertion failure from the spawned task.
        waiter.await.expect("waiter task panicked");
    }
}
|
||||
Reference in New Issue
Block a user