Compare commits

21 Commits

Author SHA1 Message Date
Konstantin Knizhnik 3e08ad485a Fix bug in using brin index in GC 2021-11-30 16:33:50 +03:00
Konstantin Knizhnik 5ad82418a9 Add upload thread 2021-11-24 18:04:48 +03:00
Konstantin Knizhnik 92562145c0 Change toast store API 2021-11-23 11:34:32 +03:00
Konstantin Knizhnik 915001c67e Fix clippy warnings 2021-11-22 20:05:34 +03:00
Konstantin Knizhnik 13f9565ff8 Fix indentation 2021-11-22 19:22:59 +03:00
Konstantin Knizhnik f73d043a8b Use COW version of YAKV 2021-11-22 19:22:39 +03:00
Konstantin Knizhnik 8fda7a6183 Fix indentation 2021-11-17 12:50:46 +03:00
Konstantin Knizhnik 4acd292717 Use BRIN to optimize GC 2021-11-17 12:50:25 +03:00
Konstantin Knizhnik b365a075f4 Save materialized pages 2021-11-17 12:16:14 +03:00
Konstantin Knizhnik 6311135d73 Save materialized pages 2021-11-17 12:16:02 +03:00
Konstantin Knizhnik ee29446edc Add BRIN index for checkpointer 2021-11-12 17:20:17 +03:00
Konstantin Knizhnik d2e5e0e728 Fix compression 2021-11-11 00:16:37 +03:00
Konstantin Knizhnik 3b471494ff Add import/export functions from buffered storage to files with layers 2021-11-10 09:48:28 +03:00
Konstantin Knizhnik 9947de4a2a Fix issues with garbage collector 2021-11-03 12:15:24 +03:00
Konstantin Knizhnik a3e94e888a Implement garbage collector for buffered repository 2021-10-30 13:10:04 +03:00
Konstantin Knizhnik e6f33a5cd0 Rewrite TOAST to use the same tree as main index 2021-10-29 17:00:09 +03:00
Konstantin Knizhnik 2dd35b1fbe Fix indentation 2021-10-27 19:37:50 +03:00
Konstantin Knizhnik ce779cc754 Use delayed commit in buffered_repo 2021-10-27 19:37:23 +03:00
Konstantin Knizhnik 497258c6fe Do not produce error in get_page_at_lsn on missed page 2021-10-26 20:07:11 +03:00
Konstantin Knizhnik 0b6008012d Apply cargo fmt 2021-10-22 19:52:05 +03:00
Konstantin Knizhnik d35fc20181 Implement buffered repository 2021-10-22 19:50:59 +03:00
106 changed files with 5035 additions and 8277 deletions

View File

@@ -1,13 +1,13 @@
version: 2.1
orbs:
python: circleci/python@1.4.0
executors:
zenith-build-executor:
resource_class: xlarge
docker:
- image: cimg/rust:1.55.0
zenith-python-executor:
docker:
- image: cimg/python:3.7.10 # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CircleCI
jobs:
check-codestyle:
@@ -183,26 +183,23 @@ jobs:
- "*"
check-python:
executor: zenith-python-executor
executor: python/default
steps:
- checkout
- run:
name: Install deps
name: Install pipenv & deps
working_directory: test_runner
command: pipenv --python 3.7 install --dev
command: |
pip install pipenv
pipenv install --dev
- run:
name: Run yapf to ensure code format
when: always
working_directory: test_runner
command: pipenv run yapf --recursive --diff .
- run:
name: Run mypy to check types
when: always
working_directory: test_runner
command: pipenv run mypy .
run-pytest:
executor: zenith-python-executor
#description: "Run pytest"
executor: python/default
parameters:
# pytest args to specify the tests to run.
#
@@ -237,9 +234,11 @@ jobs:
steps:
- run: git submodule update --init --depth 1
- run:
name: Install deps
name: Install pipenv & deps
working_directory: test_runner
command: pipenv --python 3.7 install
command: |
pip install pipenv
pipenv install
- run:
name: Run pytest
working_directory: test_runner

Cargo.lock (generated, 495 changed lines)

File diff suppressed because it is too large.

View File

@@ -15,3 +15,7 @@ members = [
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
debug = true
panic = 'abort'
[profile.dev]
panic = 'abort'
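
For orientation, a sketch of what the profile sections look like after this hunk, assuming the unchanged lines above it sit under `[profile.release]` (the section header itself is outside the hunk):

```toml
[profile.release]
# Keep debug info in release binaries: useful for profiling and, to some
# extent, debugging; it should not affect performance.
debug = true
# Abort on panic instead of unwinding.
panic = 'abort'

[profile.dev]
# Match the release behaviour in debug builds.
panic = 'abort'
```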

View File

@@ -33,7 +33,7 @@ libssl-dev clang pkg-config libpq-dev
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
To run the integration tests (not required to use the code), install
Python (3.7 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
2. Build zenith and patched postgres
```sh
@@ -47,26 +47,17 @@ make -j5
# Create repository in .zenith with proper paths to binaries and data
# Later that would be responsibility of a package install script
> ./target/debug/zenith init
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
created main branch
pageserver init succeeded
# start pageserver and safekeeper
# start pageserver
> ./target/debug/zenith start
Starting pageserver at 'localhost:64000' in '.zenith'
Starting pageserver at '127.0.0.1:64000' in .zenith
Pageserver started
initializing for single for 7676
Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
Safekeeper started
# start postgres compute node
# start postgres on top of the pageserver
> ./target/debug/zenith pg start main
Starting new postgres main on main...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
waiting for server to start.... done
server started
# check list of running postgres instances
> ./target/debug/zenith pg list
@@ -117,9 +108,10 @@ postgres=# insert into t values(2,2);
INSERT 0 1
```
6. If you want to run tests afterwards (see below), you have to stop all the running pageserver, safekeeper and postgres instances
you have just started. You can stop them all with one command:
6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
```sh
> ./target/debug/zenith pg stop migration_check
> ./target/debug/zenith pg stop main
> ./target/debug/zenith stop
```

View File

@@ -18,7 +18,7 @@ regex = "1"
anyhow = "1.0"
thiserror = "1"
bytes = "1.0.1"
nix = "0.23"
nix = "0.20"
url = "2.2.2"
hex = { version = "0.4.3", features = ["serde"] }
reqwest = { version = "0.11", features = ["blocking", "json"] }

View File

@@ -1,20 +0,0 @@
# Page server and three safekeepers.
[pageserver]
pg_port = 64000
http_port = 9898
auth_type = 'Trust'
[[safekeepers]]
name = 'sk1'
pg_port = 5454
http_port = 7676
[[safekeepers]]
name = 'sk2'
pg_port = 5455
http_port = 7677
[[safekeepers]]
name = 'sk3'
pg_port = 5456
http_port = 7678

View File

@@ -1,11 +0,0 @@
# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
# defaults that you get with no --config
[pageserver]
pg_port = 64000
http_port = 9898
auth_type = 'Trust'
[[safekeepers]]
name = 'single'
pg_port = 5454
http_port = 7676

View File

@@ -39,6 +39,8 @@ impl ComputeControlPlane {
// | |- <tenant_id>
// | | |- <branch name>
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
// TODO: since the pageserver does not have a config file yet, we assume here
// that it is running on the default port. Change that when the pageserver gets a config.
let pageserver = Arc::new(PageServerNode::from_env(&env));
let mut nodes = BTreeMap::default();
@@ -73,6 +75,15 @@ impl ComputeControlPlane {
.unwrap_or(self.base_port)
}
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
ComputeControlPlane {
base_port: 65431,
pageserver: Arc::clone(pageserver),
nodes: BTreeMap::new(),
env: local_env.clone(),
}
}
// FIXME: see also parse_point_in_time in branches.rs.
fn parse_point_in_time(
&self,
@@ -125,7 +136,7 @@ impl ComputeControlPlane {
});
node.create_pgdata()?;
node.setup_pg_conf(self.env.pageserver.auth_type)?;
node.setup_pg_conf(self.env.auth_type)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
@@ -199,7 +210,7 @@ impl PostgresNode {
})
}
fn sync_safekeepers(&self) -> Result<Lsn> {
fn sync_walkeepers(&self) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir().join("postgres");
let sync_handle = Command::new(pg_path)
.arg("--sync-safekeepers")
@@ -224,7 +235,7 @@ impl PostgresNode {
}
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
println!("Safekeepers synced on {}", lsn);
println!("Walkeepers synced on {}", lsn);
Ok(lsn)
}
@@ -287,6 +298,7 @@ impl PostgresNode {
conf.append("max_replication_slots", "10");
conf.append("hot_standby", "on");
conf.append("shared_buffers", "1MB");
conf.append("max_wal_size", "100GB");
conf.append("fsync", "off");
conf.append("max_connections", "100");
conf.append("wal_sender_timeout", "0");
@@ -328,25 +340,9 @@ impl PostgresNode {
}
conf.append_line("");
if !self.env.safekeepers.is_empty() {
// Configure the node to connect to the safekeepers
conf.append("synchronous_standby_names", "walproposer");
let wal_acceptors = self
.env
.safekeepers
.iter()
.map(|sk| format!("localhost:{}", sk.pg_port))
.collect::<Vec<String>>()
.join(",");
conf.append("wal_acceptors", &wal_acceptors);
} else {
// Configure the node to stream WAL directly to the pageserver
// This isn't really a supported configuration, but can be useful for
// testing.
conf.append("synchronous_standby_names", "pageserver");
conf.append("zenith.callmemaybe_connstring", &self.connstr());
}
// Configure the node to stream WAL directly to the pageserver
conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
conf.append("zenith.callmemaybe_connstring", &self.connstr());
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
file.write_all(conf.to_string().as_bytes())?;
@@ -362,7 +358,7 @@ impl PostgresNode {
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
// procedure evolves quite actively right now, so let's think about it again
// when things would be more stable (TODO).
let lsn = self.sync_safekeepers()?;
let lsn = self.sync_walkeepers()?;
if lsn == Lsn(0) {
None
} else {

View File

@@ -13,7 +13,6 @@ use std::path::Path;
pub mod compute;
pub mod local_env;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod storage;
/// Read a PID file

View File

@@ -7,102 +7,46 @@
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::env;
use std::fmt::Write;
use std::fs;
use std::path::{Path, PathBuf};
use std::path::PathBuf;
use std::process::{Command, Stdio};
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::ZTenantId;
//
// This data structure represents the zenith CLI config
//
// It is deserialized from the .zenith/config file, or the config file passed
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
// an example.
// This data structure represents the deserialized zenith CLI config
//
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
// compute nodes).
//
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the ZENITH_REPO_DIR env variable or
// '.zenith' if not given.
#[serde(skip)]
// Pageserver connection settings
pub pageserver_pg_port: u16,
pub pageserver_http_port: u16,
// Base directory for both pageserver and compute nodes
pub base_data_dir: PathBuf,
// Path to postgres distribution. It's expected that "bin", "include",
// "lib", "share" from postgres distribution are there. If at some point
// in time we are able to run against vanilla postgres, we may split that
// into four separate paths to match OS-specific installation layouts.
#[serde(default)]
pub pg_distrib_dir: PathBuf,
// Path to pageserver binary.
#[serde(default)]
pub zenith_distrib_dir: PathBuf,
// Default tenant ID to use with the 'zenith' command line utility, when
// --tenantid is not explicitly specified.
#[serde(with = "opt_tenantid_serde")]
#[serde(default)]
pub default_tenantid: Option<ZTenantId>,
// keep the tenant id in the config to reduce copy-paste when running zenith locally with a single tenant
#[serde(with = "hex")]
pub tenantid: ZTenantId,
// used to issue tokens during e.g. pg start
#[serde(default)]
pub private_key_path: PathBuf,
pub pageserver: PageServerConf,
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(default)]
pub struct PageServerConf {
// Pageserver connection settings
pub pg_port: u16,
pub http_port: u16,
// jwt auth token used for communication with pageserver
pub auth_token: String,
// used to determine which auth type is used
pub auth_type: AuthType,
// jwt auth token used for communication with pageserver
pub auth_token: String,
}
impl Default for PageServerConf {
fn default() -> Self {
Self {
pg_port: 0,
http_port: 0,
auth_type: AuthType::Trust,
auth_token: "".to_string(),
}
}
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(default)]
pub struct SafekeeperConf {
pub name: String,
pub pg_port: u16,
pub http_port: u16,
pub sync: bool,
}
impl Default for SafekeeperConf {
fn default() -> Self {
Self {
name: "".to_string(),
pg_port: 0,
http_port: 0,
sync: true,
}
}
// used to issue tokens during e.g. pg start
pub private_key_path: PathBuf,
}
impl LocalEnv {
@@ -118,10 +62,6 @@ impl LocalEnv {
Ok(self.zenith_distrib_dir.join("pageserver"))
}
pub fn safekeeper_bin(&self) -> Result<PathBuf> {
Ok(self.zenith_distrib_dir.join("safekeeper"))
}
pub fn pg_data_dirs_path(&self) -> PathBuf {
self.base_data_dir.join("pgdatadirs").join("tenants")
}
@@ -136,187 +76,6 @@ impl LocalEnv {
pub fn pageserver_data_dir(&self) -> PathBuf {
self.base_data_dir.clone()
}
pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
self.base_data_dir.join("safekeepers").join(node_name)
}
/// Create a LocalEnv from a config file.
///
/// Unlike 'load_config', this function fills in any defaults that are missing
/// from the config file.
pub fn create_config(toml: &str) -> Result<LocalEnv> {
let mut env: LocalEnv = toml::from_str(toml)?;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
if env.pg_distrib_dir == Path::new("") {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
env.pg_distrib_dir = postgres_bin.into();
} else {
let cwd = env::current_dir()?;
env.pg_distrib_dir = cwd.join("tmp_install")
}
}
if !env.pg_distrib_dir.join("bin/postgres").exists() {
anyhow::bail!(
"Can't find postgres binary at {}",
env.pg_distrib_dir.display()
);
}
// Find zenith binaries.
if env.zenith_distrib_dir == Path::new("") {
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
if !env.zenith_distrib_dir.join("pageserver").exists() {
anyhow::bail!("Can't find pageserver binary.");
}
if !env.zenith_distrib_dir.join("safekeeper").exists() {
anyhow::bail!("Can't find safekeeper binary.");
}
// If no initial tenant ID was given, generate it.
if env.default_tenantid.is_none() {
env.default_tenantid = Some(ZTenantId::generate());
}
env.base_data_dir = base_path();
Ok(env)
}
/// Locate and load config
pub fn load_config() -> Result<LocalEnv> {
let repopath = base_path();
if !repopath.exists() {
anyhow::bail!(
"Zenith config is not found in {}. You need to run 'zenith init' first",
repopath.to_str().unwrap()
);
}
// TODO: check that it looks like a zenith repository
// load and parse file
let config = fs::read_to_string(repopath.join("config"))?;
let mut env: LocalEnv = toml::from_str(config.as_str())?;
env.base_data_dir = repopath;
Ok(env)
}
// this function is used only for testing purposes in the CLI, e.g. to generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> Result<String> {
let private_key_path = if self.private_key_path.is_absolute() {
self.private_key_path.to_path_buf()
} else {
self.base_data_dir.join(&self.private_key_path)
};
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
//
// Initialize a new Zenith repository
//
pub fn init(&mut self) -> Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
if base_path == Path::new("") {
anyhow::bail!("repository base path is missing");
}
if base_path.exists() {
anyhow::bail!(
"directory '{}' already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
);
}
fs::create_dir(&base_path)?;
// generate keys for jwt
// openssl genrsa -out private_key.pem 2048
let private_key_path;
if self.private_key_path == PathBuf::new() {
private_key_path = base_path.join("auth_private_key.pem");
let keygen_output = Command::new("openssl")
.arg("genrsa")
.args(&["-out", private_key_path.to_str().unwrap()])
.arg("2048")
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
self.private_key_path = Path::new("auth_private_key.pem").to_path_buf();
let public_key_path = base_path.join("auth_public_key.pem");
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
let keygen_output = Command::new("openssl")
.arg("rsa")
.args(&["-in", private_key_path.to_str().unwrap()])
.arg("-pubout")
.args(&["-outform", "PEM"])
.args(&["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
}
self.pageserver.auth_token =
self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
fs::create_dir_all(self.pg_data_dirs_path())?;
for safekeeper in self.safekeepers.iter() {
fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
}
let mut conf_content = String::new();
// Currently, the user first passes a config file with 'zenith init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
write!(
&mut conf_content,
r#"# This file describes a locale deployment of the page server
# and safekeeeper node. It is read by the 'zenith' command-line
# utility.
"#
)?;
// Convert the LocalEnv to a toml file.
//
// This could be as simple as this:
//
// conf_content += &toml::to_string_pretty(env)?;
//
// But it results in a "values must be emitted before tables". I'm not sure
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
// Maybe rust reorders the fields to avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
fs::write(base_path.join("config"), conf_content)?;
Ok(())
}
}
fn base_path() -> PathBuf {
@@ -326,29 +85,118 @@ fn base_path() -> PathBuf {
}
}
/// Serde routines for Option<ZTenantId>. The serialized form is a hex string.
mod opt_tenantid_serde {
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::str::FromStr;
use zenith_utils::zid::ZTenantId;
pub fn serialize<S>(tenantid: &Option<ZTenantId>, ser: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
tenantid.map(|t| t.to_string()).serialize(ser)
//
// Initialize a new Zenith repository
//
pub fn init(
pageserver_pg_port: u16,
pageserver_http_port: u16,
tenantid: ZTenantId,
auth_type: AuthType,
) -> Result<()> {
// check if config already exists
let base_path = base_path();
if base_path.exists() {
anyhow::bail!(
"{} already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
);
}
fs::create_dir(&base_path)?;
pub fn deserialize<'de, D>(des: D) -> Result<Option<ZTenantId>, D::Error>
where
D: Deserializer<'de>,
{
let s: Option<String> = Option::deserialize(des)?;
if let Some(s) = s {
return Ok(Some(
ZTenantId::from_str(&s).map_err(serde::de::Error::custom)?,
));
// ok, now check that expected binaries are present
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
let pg_distrib_dir: PathBuf = {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
postgres_bin.into()
} else {
let cwd = env::current_dir()?;
cwd.join("tmp_install")
}
Ok(None)
};
if !pg_distrib_dir.join("bin/postgres").exists() {
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
}
// generate keys for jwt
// openssl genrsa -out private_key.pem 2048
let private_key_path = base_path.join("auth_private_key.pem");
let keygen_output = Command::new("openssl")
.arg("genrsa")
.args(&["-out", private_key_path.to_str().unwrap()])
.arg("2048")
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
let public_key_path = base_path.join("auth_public_key.pem");
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
let keygen_output = Command::new("openssl")
.arg("rsa")
.args(&["-in", private_key_path.to_str().unwrap()])
.arg("-pubout")
.args(&["-outform", "PEM"])
.args(&["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
let auth_token =
encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
// Find zenith binaries.
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
if !zenith_distrib_dir.join("pageserver").exists() {
anyhow::bail!("Can't find pageserver binary.",);
}
let conf = LocalEnv {
pageserver_pg_port,
pageserver_http_port,
pg_distrib_dir,
zenith_distrib_dir,
base_data_dir: base_path,
tenantid,
auth_token,
auth_type,
private_key_path,
};
fs::create_dir_all(conf.pg_data_dirs_path())?;
let toml = toml::to_string_pretty(&conf)?;
fs::write(conf.base_data_dir.join("config"), toml)?;
Ok(())
}
// Locate and load config
pub fn load_config() -> Result<LocalEnv> {
let repopath = base_path();
if !repopath.exists() {
anyhow::bail!(
"Zenith config is not found in {}. You need to run 'zenith init' first",
repopath.to_str().unwrap()
);
}
// TODO: check that it looks like a zenith repository
// load and parse file
let config = fs::read_to_string(repopath.join("config"))?;
toml::from_str(config.as_str()).map_err(|e| e.into())
}

View File

@@ -1,277 +0,0 @@
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::sync::Arc;
use std::time::Duration;
use std::{io, result, thread};
use anyhow::bail;
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::Config;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use zenith_utils::http::error::HttpErrorBody;
use zenith_utils::postgres_backend::AuthType;
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::read_pidfile;
use crate::storage::PageServerNode;
use zenith_utils::connstring::connection_address;
use zenith_utils::connstring::connection_host_port;
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {
#[error("Reqwest error: {0}")]
Transport(#[from] reqwest::Error),
#[error("Error: {0}")]
Response(String),
}
type Result<T> = result::Result<T, SafekeeperHttpError>;
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> Result<Self>;
}
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(SafekeeperHttpError::Response(
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
))
}
}
//
// Control routines for safekeeper.
//
// Used in CLI and tests.
//
#[derive(Debug)]
pub struct SafekeeperNode {
pub name: String,
pub conf: SafekeeperConf,
pub pg_connection_config: Config,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
pub pageserver: Arc<PageServerNode>,
}
impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let pageserver = Arc::new(PageServerNode::from_env(env));
println!("initializing for {} for {}", conf.name, conf.http_port);
SafekeeperNode {
name: conf.name.clone(),
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://localhost:{}/v1", conf.http_port),
pageserver,
}
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(port: u16) -> Config {
// TODO safekeeper authentication not implemented yet
format!("postgresql://no_user@localhost:{}/no_db", port)
.parse()
.unwrap()
}
pub fn datadir_path(&self) -> PathBuf {
self.env.safekeeper_data_dir(&self.name)
}
pub fn pid_file(&self) -> PathBuf {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self) -> anyhow::Result<()> {
print!(
"Starting safekeeper at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.datadir_path().display()
);
io::stdout().flush().unwrap();
// Configure connection to page server
//
// FIXME: We extract the host and port from the connection string instead of using
// the connection string directly, because the 'safekeeper' binary expects
// host:port format. That's a bit silly when we already have a full libpq connection
// string at hand.
let pageserver_conn = {
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
format!("{}:{}", host, port)
};
let listen_pg = format!("localhost:{}", self.conf.pg_port);
let listen_http = format!("localhost:{}", self.conf.http_port);
let mut cmd: &mut Command = &mut Command::new(self.env.safekeeper_bin()?);
cmd = cmd
.args(&["-D", self.datadir_path().to_str().unwrap()])
.args(&["--listen-pg", &listen_pg])
.args(&["--listen-http", &listen_http])
.args(&["--pageserver", &pageserver_conn])
.args(&["--recall", "1 second"])
.arg("--daemonize")
.env_clear()
.env("RUST_BACKTRACE", "1");
if !self.conf.sync {
cmd = cmd.arg("--no-sync");
}
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
}
if !cmd.status()?.success() {
bail!(
"Safekeeper failed to start. See '{}' for details.",
self.datadir_path().join("safekeeper.log").display()
);
}
// It takes a while for the safekeeper to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(_) => {
println!("\nSafekeeper started");
return Ok(());
}
Err(err) => {
match err {
SafekeeperHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!(
"Safekeeper not responding yet, err {} retrying ({})...",
err, retries
);
}
}
SafekeeperHttpError::Response(msg) => {
bail!("safekeeper failed to start: {} ", msg)
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("safekeeper failed to start in {} seconds", RETRIES);
}
///
/// Stop the server.
///
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
/// Otherwise we use SIGTERM, triggering a clean shutdown
///
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Safekeeper {} is already stopped", self.name);
return Ok(());
}
let pid = read_pidfile(&pid_file)?;
let pid = Pid::from_raw(pid);
let sig = if immediate {
println!("Stop safekeeper immediately");
Signal::SIGQUIT
} else {
println!("Stop safekeeper gracefully");
Signal::SIGTERM
};
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Safekeeper with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to safekeeper with pid {}: {}",
pid,
err.desc()
),
}
let address = connection_address(&self.pg_connection_config);
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
for _ in 0..100 {
if let Err(_e) = TcpStream::connect(&address) {
println!("Safekeeper stopped receiving connections");
//Now check status
match self.check_status() {
Ok(_) => {
println!("Safekeeper status is OK. Wait a bit.");
thread::sleep(Duration::from_secs(1));
}
Err(err) => {
println!("Safekeeper status is: {}", err);
return Ok(());
}
}
} else {
println!("Safekeeper still receives connections");
thread::sleep(Duration::from_secs(1));
}
}
bail!("Failed to stop safekeeper with pid {}", pid);
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
// TODO: authentication
//if self.env.auth_type == AuthType::ZenithJWT {
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
//}
self.http_client.request(method, url)
}
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
.send()?
.error_from_body()?;
Ok(())
}
}

View File

@@ -6,7 +6,6 @@ use std::time::Duration;
use std::{io, result, thread};
use anyhow::{anyhow, bail};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
@@ -21,7 +20,6 @@ use zenith_utils::zid::ZTenantId;
use crate::local_env::LocalEnv;
use crate::read_pidfile;
use pageserver::branches::BranchInfo;
use pageserver::tenant_mgr::TenantInfo;
use zenith_utils::connstring::connection_address;
#[derive(Error, Debug)]
@@ -64,6 +62,7 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct PageServerNode {
pub kill_on_exit: bool,
pub pg_connection_config: Config,
pub env: LocalEnv,
pub http_client: Client,
@@ -72,34 +71,34 @@ pub struct PageServerNode {
impl PageServerNode {
pub fn from_env(env: &LocalEnv) -> PageServerNode {
let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
&env.pageserver.auth_token
let password = if env.auth_type == AuthType::ZenithJWT {
&env.auth_token
} else {
""
};
PageServerNode {
kill_on_exit: false,
pg_connection_config: Self::pageserver_connection_config(
password,
env.pageserver.pg_port,
env.pageserver_pg_port,
),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://localhost:{}/v1", env.pageserver.http_port),
http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
}
}
/// Construct libpq connection string for connecting to the pageserver.
fn pageserver_connection_config(password: &str, port: u16) -> Config {
format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
.parse()
.unwrap()
}
pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
let mut args = vec![
"--init",
"-D",
@@ -112,11 +111,10 @@ impl PageServerNode {
&listen_http,
];
let auth_type_str = &self.env.pageserver.auth_type.to_string();
if self.env.pageserver.auth_type != AuthType::Trust {
if enable_auth {
args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
args.extend(&["--auth-type", "ZenithJWT"]);
}
args.extend(&["--auth-type", auth_type_str]);
if let Some(tenantid) = create_tenant {
args.extend(&["--create-tenant", tenantid])
@@ -154,7 +152,7 @@ impl PageServerNode {
let mut cmd = Command::new(self.env.pageserver_bin()?);
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
.arg("--daemonize")
.arg("-d")
.env_clear()
.env("RUST_BACKTRACE", "1");
@@ -201,43 +199,19 @@ impl PageServerNode {
bail!("pageserver failed to start in {} seconds", RETRIES);
}
///
/// Stop the server.
///
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
/// Otherwise we use SIGTERM, triggering a clean shutdown
///
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Pageserver is already stopped");
return Ok(());
}
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
let sig = if immediate {
let pid = read_pidfile(&self.pid_file())?;
let pid = Pid::from_raw(pid);
if immediate {
println!("Stop pageserver immediately");
Signal::SIGQUIT
if kill(pid, Signal::SIGQUIT).is_err() {
bail!("Failed to kill pageserver with pid {}", pid);
}
} else {
println!("Stop pageserver gracefully");
Signal::SIGTERM
};
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Pageserver with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
if kill(pid, Signal::SIGTERM).is_err() {
bail!("Failed to stop pageserver with pid {}", pid);
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
}
let address = connection_address(&self.pg_connection_config);
@@ -282,8 +256,8 @@ impl PageServerNode {
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
let mut builder = self.http_client.request(method, url);
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
builder = builder.bearer_auth(&self.env.pageserver.auth_token)
if self.env.auth_type == AuthType::ZenithJWT {
builder = builder.bearer_auth(&self.env.auth_token)
}
builder
}
@@ -295,7 +269,7 @@ impl PageServerNode {
Ok(())
}
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
pub fn tenant_list(&self) -> Result<Vec<String>> {
Ok(self
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
.send()?
@@ -358,3 +332,12 @@ impl PageServerNode {
.json()?)
}
}
impl Drop for PageServerNode {
fn drop(&mut self) {
// TODO Looks like this flag is never set
if self.kill_on_exit {
let _ = self.stop(true);
}
}
}
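
As a sketch of how the new flag and the `Drop` impl are meant to combine (the TODO above notes the flag is never actually set in this change; the caller below is hypothetical):

```rust
// Hypothetical caller: opt in to killing the pageserver when the
// handle goes out of scope. Drop then calls stop(true), i.e. SIGQUIT.
let mut pageserver = PageServerNode::from_env(&env);
pageserver.kill_on_exit = true;
```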

View File

@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
log = "0.4.14"
clap = "2.33.0"
daemonize = "0.4.1"
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt", "io-util"] }
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
@@ -37,7 +37,9 @@ async-trait = "0.1"
const_format = "0.2.21"
tracing = "0.1.27"
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
url = "2"
#yakv = { path = "../../yakv" }
yakv = "0.2.4"
lz4_flex = "0.9.0"
postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }
@@ -46,4 +48,3 @@ workspace_hack = { path = "../workspace_hack" }
[dev-dependencies]
hex-literal = "0.3"
tempfile = "3.2"

View File

@@ -41,7 +41,7 @@ Legend:
+--+
....
. . Component in an early development phase.
. . Component that we will need, but doesn't exist at the moment. A TODO.
....
---> Data flow
@@ -116,49 +116,13 @@ Remove old on-disk layer files that are no longer needed according to the
PITR retention policy
### Backup service
TODO: Backup service
--------------------
The backup service, responsible for storing pageserver recovery data externally.
The backup service is responsible for periodically pushing the chunks to S3.
Currently, pageserver stores its files in a filesystem directory it's pointed to.
That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
Therefore, the server interacts with external, more reliable storage to back up and restore its state.
The storage support code is extensible and can support arbitrary backends, as long as they implement a certain Rust trait.
The following implementations are present:
* local filesystem - mainly for tests
* AWS S3 - for production
Implementation details are covered in the [storage readme](./src/relish_storage/README.md) and corresponding Rust file docs.
The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples:
* Local FS: `${PAGESERVER_BIN} --relish-storage-local-path="/some/local/path/"`
* AWS S3 : `${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" --relish-storage-region="eu-north-1" --relish-storage-access-key="SOMEKEYAAAAASADSAH*#" --relish-storage-secret-access-key="SOMEsEcReTsd292v"`
For Amazon AWS S3, the key id and secret access key can be found in `~/.aws/credentials` (if awscli was ever configured to work with the desired bucket) or on the AWS settings page for a given user. Also note that bucket names do not contain any protocol prefix when used on AWS.
For local S3 installations, refer to their documentation for the name format and credentials.
Similar to other pageserver settings, a toml config file can be used to configure either of the storages as a backup target.
Required sections are:
```toml
[relish_storage]
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
```
or
```toml
[relish_storage]
bucket_name = 'some-sample-bucket'
bucket_region = 'eu-north-1'
access_key_id = 'SOMEKEYAAAAASADSAH*#'
secret_access_key = 'SOMEsEcReTsd292v'
```
Also, the `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` environment variables can be used to specify the credentials instead of any of the ways above.
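For instance, a hypothetical invocation that supplies the credentials through the environment and passes only the bucket and region as flags (all values are placeholders):
```sh
AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' \
AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' \
${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" \
                  --relish-storage-region="eu-north-1"
```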
TODO: How/when do we restore from S3? Whenever we get a GetPage@LSN request for
a chunk we don't currently have? Or when an external Control Plane tells us?
TODO: Sharding
--------------------

View File

@@ -1,25 +0,0 @@
//! Main entry point for the dump_layerfile executable
//!
//! A handy tool for debugging, that's all.
use anyhow::Result;
use clap::{App, Arg};
use pageserver::layered_repository::dump_layerfile_from_path;
use std::path::PathBuf;
fn main() -> Result<()> {
let arg_matches = App::new("Zenith dump_layerfile utility")
.about("Dump contents of one layer file, for debugging")
.arg(
Arg::with_name("path")
.help("Path to file to dump")
.required(true)
.index(1),
)
.get_matches();
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
dump_layerfile_from_path(&path)?;
Ok(())
}

View File

@@ -42,6 +42,9 @@ struct CfgFileParams {
listen_http_addr: Option<String>,
checkpoint_distance: Option<String>,
checkpoint_period: Option<String>,
upload_distance: Option<String>,
upload_period: Option<String>,
reconstruct_threshold: Option<String>,
gc_horizon: Option<String>,
gc_period: Option<String>,
pg_distrib_dir: Option<String>,
@@ -103,6 +106,9 @@ impl CfgFileParams {
listen_http_addr: get_arg("listen-http"),
checkpoint_distance: get_arg("checkpoint_distance"),
checkpoint_period: get_arg("checkpoint_period"),
upload_distance: get_arg("upload_distance"),
upload_period: get_arg("upload_period"),
reconstruct_threshold: get_arg("reconstruct_threshold"),
gc_horizon: get_arg("gc_horizon"),
gc_period: get_arg("gc_period"),
pg_distrib_dir: get_arg("postgres-distrib"),
@@ -121,6 +127,9 @@ impl CfgFileParams {
listen_http_addr: self.listen_http_addr.or(other.listen_http_addr),
checkpoint_distance: self.checkpoint_distance.or(other.checkpoint_distance),
checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
upload_distance: self.upload_distance.or(other.upload_distance),
upload_period: self.upload_period.or(other.upload_period),
reconstruct_threshold: self.reconstruct_threshold.or(other.reconstruct_threshold),
gc_horizon: self.gc_horizon.or(other.gc_horizon),
gc_period: self.gc_period.or(other.gc_period),
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
@@ -158,6 +167,20 @@ impl CfgFileParams {
None => DEFAULT_CHECKPOINT_PERIOD,
};
let upload_distance: u64 = match self.upload_distance.as_ref() {
Some(upload_distance_str) => upload_distance_str.parse()?,
None => DEFAULT_UPLOAD_DISTANCE,
};
let upload_period = match self.upload_period.as_ref() {
Some(upload_period_str) => humantime::parse_duration(upload_period_str)?,
None => DEFAULT_UPLOAD_PERIOD,
};
let reconstruct_threshold: u64 = match self.reconstruct_threshold.as_ref() {
Some(reconstruct_threshold_str) => reconstruct_threshold_str.parse()?,
None => DEFAULT_RECONSTRUCT_THRESHOLD,
};
let gc_horizon: u64 = match self.gc_horizon.as_ref() {
Some(horizon_str) => horizon_str.parse()?,
None => DEFAULT_GC_HORIZON,
@@ -236,6 +259,9 @@ impl CfgFileParams {
listen_http_addr,
checkpoint_distance,
checkpoint_period,
upload_distance,
upload_period,
reconstruct_threshold,
gc_horizon,
gc_period,
@@ -296,6 +322,24 @@ fn main() -> Result<()> {
.takes_value(true)
.help("Interval between checkpoint iterations"),
)
.arg(
Arg::with_name("checkpoint_distance")
.long("checkpoint_distance")
.takes_value(true)
.help("Distance from current LSN to perform checkpoint of in-memory layers"),
)
.arg(
Arg::with_name("upload_period")
.long("upload_period")
.takes_value(true)
.help("Interval between upload iterations"),
)
.arg(
Arg::with_name("reconstruct_threshold")
.long("reconstruct_threshold")
.takes_value(true)
.help("Minimal size of deltas after which page reconstruction (materialization) can be performed"),
)
.arg(
Arg::with_name("gc_horizon")
.long("gc_horizon")
@@ -600,6 +644,9 @@ mod tests {
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
upload_distance: Some("upload_distance_VALUE".to_string()),
upload_period: Some("upload_period_VALUE".to_string()),
reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
@@ -623,6 +670,9 @@ mod tests {
listen_http_addr = 'listen_http_addr_VALUE'
checkpoint_distance = 'checkpoint_distance_VALUE'
checkpoint_period = 'checkpoint_period_VALUE'
upload_distance = 'upload_distance_VALUE'
upload_period = 'upload_period_VALUE'
reconstruct_threshold = 'reconstruct_threshold_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
@@ -657,6 +707,9 @@ local_path = 'relish_storage_local_VALUE'
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
upload_distance: Some("upload_distance_VALUE".to_string()),
upload_period: Some("upload_period_VALUE".to_string()),
reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
@@ -683,6 +736,9 @@ local_path = 'relish_storage_local_VALUE'
listen_http_addr = 'listen_http_addr_VALUE'
checkpoint_distance = 'checkpoint_distance_VALUE'
checkpoint_period = 'checkpoint_period_VALUE'
upload_distance = 'upload_distance_VALUE'
upload_period = 'upload_period_VALUE'
reconstruct_threshold = 'reconstruct_threshold_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
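
To illustrate, a hypothetical config-file fragment for the three new parameters; the value formats follow the parsing code above (plain `u64` for the sizes, `humantime` for the period), but the numbers themselves are made up:

```toml
# Hypothetical values; the real defaults come from DEFAULT_UPLOAD_DISTANCE,
# DEFAULT_UPLOAD_PERIOD and DEFAULT_RECONSTRUCT_THRESHOLD in the source.
upload_distance = '67108864'       # parsed as u64
upload_period = '30s'              # parsed with humantime::parse_duration
reconstruct_threshold = '1048576'  # minimal size of deltas before materialization
```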

View File

@@ -4,7 +4,7 @@
// TODO: move all paths construction to conf impl
//
use anyhow::{bail, Context, Result};
use anyhow::{bail, ensure, Context, Result};
use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize};
use std::{
@@ -23,7 +23,6 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::tenant_mgr;
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{repository::Repository, PageServerConf};
use crate::{restore_local_repo, LOG_FILE_NAME};
@@ -36,7 +35,7 @@ pub struct BranchInfo {
pub ancestor_id: Option<String>,
pub ancestor_lsn: Option<String>,
pub current_logical_size: usize,
pub current_logical_size_non_incremental: Option<usize>,
pub current_logical_size_non_incremental: usize,
}
impl BranchInfo {
@@ -45,7 +44,6 @@ impl BranchInfo {
conf: &PageServerConf,
tenantid: &ZTenantId,
repo: &Arc<dyn Repository>,
include_non_incremental_logical_size: bool,
) -> Result<Self> {
let name = path
.as_ref()
@@ -80,14 +78,6 @@ impl BranchInfo {
);
}
// non incremental size calculation can be heavy, so let it be optional
// needed for tests to check size calculation
let current_logical_size_non_incremental = include_non_incremental_logical_size
.then(|| {
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
})
.transpose()?;
Ok(BranchInfo {
name,
timeline_id,
@@ -95,7 +85,8 @@ impl BranchInfo {
ancestor_id,
ancestor_lsn,
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental,
current_logical_size_non_incremental: timeline
.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
})
}
}
@@ -156,7 +147,7 @@ pub fn create_repo(
let tli = create_timeline(conf, None, &tenantid)?;
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
let repo = Arc::new(crate::buffered_repository::BufferedRepository::new(
conf,
wal_redo_manager,
tenantid,
@@ -239,7 +230,7 @@ fn bootstrap_timeline(
timeline.writer().as_ref(),
lsn,
)?;
timeline.checkpoint(CheckpointConfig::Forced)?;
timeline.checkpoint()?;
println!(
"created initial timeline {} timeline.lsn {}",
@@ -257,11 +248,19 @@ fn bootstrap_timeline(
Ok(())
}
pub(crate) fn get_branches(
conf: &PageServerConf,
tenantid: &ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<BranchInfo>> {
pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
let tenants_dir = conf.tenants_path();
std::fs::read_dir(&tenants_dir)?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res?;
ensure!(dir_entry.file_type()?.is_dir());
Ok(dir_entry.file_name().to_str().unwrap().to_owned())
})
.collect()
}
pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
// Each branch has a corresponding record (text file) in the refs/branches
@@ -271,13 +270,7 @@ pub(crate) fn get_branches(
std::fs::read_dir(&branches_dir)?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res?;
BranchInfo::from_path(
dir_entry.path(),
conf,
tenantid,
&repo,
include_non_incremental_logical_size,
)
BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
})
.collect()
}
@@ -339,7 +332,7 @@ pub(crate) fn create_branch(
ancestor_id: None,
ancestor_lsn: None,
current_logical_size: 0,
current_logical_size_non_incremental: Some(0),
current_logical_size_non_incremental: 0,
})
}

File diff suppressed because it is too large.

View File

@@ -25,11 +25,6 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
@@ -78,11 +73,6 @@ paths:
required: true
schema:
type: string
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
@@ -174,13 +164,13 @@ paths:
description: Get tenants list
responses:
"200":
description: TenantInfo
description: OK
content:
application/json:
schema:
type: array
items:
$ref: "#/components/schemas/TenantInfo"
type: string
"401":
description: Unauthorized Error
content:
@@ -253,16 +243,6 @@ components:
scheme: bearer
bearerFormat: JWT
schemas:
TenantInfo:
type: object
required:
- id
- state
properties:
id:
type: string
state:
type: string
BranchInfo:
type: object
required:
@@ -270,6 +250,7 @@ components:
- timeline_id
- latest_valid_lsn
- current_logical_size
- current_logical_size_non_incremental
properties:
name:
type: string

View File

@@ -86,59 +86,31 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
Ok(json_response(StatusCode::CREATED, response_data)?)
}
// Gate the non-incremental logical size calculation behind a flag:
// after pgbench -i -s100 the calculation took 28 ms, so multiplied by the number of timelines
// and tenants it can take a noticeable amount of time. Also, the value is currently used only in tests.
fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
request
.uri()
.query()
.map(|v| {
url::form_urlencoded::parse(v.as_bytes())
.into_owned()
.any(|(param, _)| param == "include-non-incremental-logical-size")
})
.unwrap_or(false)
}
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
check_permission(&request, Some(tenantid))?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
crate::branches::get_branches(
get_config(&request),
&tenantid,
include_non_incremental_logical_size,
)
crate::branches::get_branches(get_config(&request), &tenantid)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
// TODO add to swagger
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
let conf = get_state(&request).conf;
let path = conf.branch_path(&branch_name, &tenantid);
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
BranchInfo::from_path(
path,
conf,
&tenantid,
&repo,
include_non_incremental_logical_size,
)
BranchInfo::from_path(path, conf, &tenantid, &repo)
})
.await
.map_err(ApiError::from_err)??;
@@ -152,7 +124,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_list").entered();
crate::tenant_mgr::list_tenants()
crate::branches::get_tenants(get_config(&request))
})
.await
.map_err(ApiError::from_err)??;

File diff suppressed because it is too large.

View File

@@ -39,9 +39,8 @@
//!
use crate::layered_repository::blob::BlobWriter;
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
};
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
use crate::repository::{PageReconstructData, PageReconstructResult, PageVersion};
use crate::waldecoder;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
@@ -148,6 +147,10 @@ pub struct DeltaLayerInner {
}
impl Layer for DeltaLayer {
fn get_tenant_id(&self) -> ZTenantId {
self.tenantid
}
fn get_timeline_id(&self) -> ZTimelineId {
self.timelineid
}
@@ -201,22 +204,22 @@ impl Layer for DeltaLayer {
for ((_blknum, pv_lsn), blob_range) in iter {
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
if let Some(img) = pv.page_image {
// Found a page image, return it
reconstruct_data.page_img = Some(img);
need_image = false;
break;
} else if let Some(rec) = pv.record {
let will_init = rec.will_init;
reconstruct_data.records.push((*pv_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
match pv {
PageVersion::Page(img) => {
// Found a page image, return it
reconstruct_data.page_img = Some(img);
need_image = false;
break;
}
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
PageVersion::Wal(rec) => {
let will_init = rec.will_init;
reconstruct_data.records.push((*pv_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
@@ -226,7 +229,7 @@ impl Layer for DeltaLayer {
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(PageReconstructResult::Continue(self.start_lsn))
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
} else {
Ok(PageReconstructResult::Complete)
}
@@ -307,19 +310,22 @@ impl Layer for DeltaLayer {
let buf = read_blob(&chapter, blob_range)?;
let pv = PageVersion::des(&buf)?;
if let Some(img) = pv.page_image.as_ref() {
write!(&mut desc, " img {} bytes", img.len())?;
}
if let Some(rec) = pv.record.as_ref() {
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
rec.rec.len(),
rec.will_init,
wal_desc
)?;
match pv {
PageVersion::Page(img) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
PageVersion::Wal(rec) => {
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
rec.rec.len(),
rec.will_init,
wal_desc
)?;
}
}
println!(" blk {} at {}: {}", blk, lsn, desc);
}
@@ -328,6 +334,19 @@ impl Layer for DeltaLayer {
}
impl DeltaLayer {
/// debugging function to print out the contents of the layer
pub fn versions(&self) -> Result<Vec<(u32, Lsn, PageVersion)>> {
let mut versions: Vec<(u32, Lsn, PageVersion)> = Vec::new();
let inner = self.load()?;
let (_path, book) = self.open_book()?;
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
let buf = read_blob(&chapter, blob_range)?;
versions.push((*blk, *lsn, PageVersion::des(&buf)?));
}
Ok(versions)
}
fn path_for(
path_or_conf: &PathOrConf,
timelineid: ZTimelineId,
@@ -360,6 +379,7 @@ impl DeltaLayer {
dropped: bool,
page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
relsizes: VecMap<Lsn, u32>,
nosync: bool,
) -> Result<DeltaLayer> {
if seg.rel.is_blocky() {
assert!(!relsizes.is_empty());
@@ -431,8 +451,10 @@ impl DeltaLayer {
let book = chapter.close()?;
// This flushes the underlying 'buf_writer'.
book.close()?;
let writer = book.close()?;
if !nosync {
writer.get_ref().sync_all()?;
}
trace!("saved {}", &path.display());
drop(inner);

View File

@@ -13,8 +13,6 @@ use anyhow::Result;
use log::*;
use zenith_utils::lsn::Lsn;
use super::METADATA_FILE_NAME;
// Note: LayeredTimeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct DeltaFileName {
@@ -292,7 +290,7 @@ pub fn list_files(
deltafiles.push(deltafilename);
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
imgfiles.push(imgfilename);
} else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
} else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") {
// ignore these
} else {
warn!("unrecognized filename in timeline dir: {}", fname);

View File

@@ -22,11 +22,8 @@
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//!
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, SegmentTag,
};
use crate::layered_repository::LayeredTimeline;
use crate::layered_repository::RELISH_SEG_SIZE;
use crate::layered_repository::storage_layer::{Layer, SegmentTag, RELISH_SEG_SIZE};
use crate::repository::{PageReconstructData, PageReconstructResult};
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{anyhow, bail, ensure, Result};
@@ -117,6 +114,10 @@ impl Layer for ImageLayer {
PathBuf::from(self.layer_name().to_string())
}
fn get_tenant_id(&self) -> ZTenantId {
self.tenantid
}
fn get_timeline_id(&self) -> ZTimelineId {
self.timelineid
}
@@ -250,13 +251,14 @@ impl ImageLayer {
}
/// Create a new image file, using the given array of pages.
fn create(
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
lsn: Lsn,
base_images: Vec<Bytes>,
nosync: bool,
) -> Result<ImageLayer> {
let image_type = if seg.rel.is_blocky() {
let num_blocks: u32 = base_images.len().try_into()?;
@@ -315,8 +317,10 @@ impl ImageLayer {
let book = chapter.close()?;
// This flushes the underlying 'buf_writer'.
book.close()?;
let writer = book.close()?;
if !nosync {
writer.get_ref().sync_all()?;
}
trace!("saved {}", path.display());
drop(inner);
@@ -324,6 +328,7 @@ impl ImageLayer {
Ok(layer)
}
/*
// Create a new image file by materializing every page in a source layer
// at given LSN.
pub fn create_from_src(
@@ -361,6 +366,7 @@ impl ImageLayer {
Self::create(conf, timelineid, timeline.tenantid, seg, lsn, base_images)
}
*/
///
/// Load the contents of the file into memory


@@ -281,6 +281,12 @@ pub struct LayersOnDisk {
pub image_layers: Vec<ImageLayer>,
}
impl LayersOnDisk {
pub fn is_empty(&self) -> bool {
self.delta_layers.is_empty() && self.image_layers.is_empty()
}
}
impl InMemoryLayer {
/// Return the oldest page version that's stored in this layer
pub fn get_oldest_pending_lsn(&self) -> Lsn {


@@ -3,10 +3,9 @@
//!
use crate::relish::RelishTag;
use crate::repository::WALRecord;
use crate::ZTimelineId;
use crate::repository::{PageReconstructData, PageReconstructResult};
use crate::{ZTenantId, ZTimelineId};
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::path::PathBuf;
@@ -45,56 +44,6 @@ impl SegmentTag {
}
}
///
/// Represents a version of a page at a specific LSN. The LSN is the key of the
/// entry in the 'page_versions' hash, it is not duplicated here.
///
/// A page version can be stored as a full page image, or as WAL record that needs
/// to be applied over the previous page version to reconstruct this version.
///
/// It's also possible to have both a WAL record and a page image in the same
/// PageVersion. That happens if page version is originally stored as a WAL record
/// but it is later reconstructed by a GetPage@LSN request by performing WAL
/// redo. The get_page_at_lsn() code will store the reconstructed pag image next to
/// the WAL record in that case. TODO: That's pretty accidental, not the result
/// of any grand design. If we want to keep reconstructed page versions around, we
/// probably should have a separate buffer cache so that we could control the
/// replacement policy globally. Or if we keep a reconstructed page image, we
/// could throw away the WAL record.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageVersion {
/// an 8kb page image
pub page_image: Option<Bytes>,
/// WAL record to get from previous page version to this one.
pub record: Option<WALRecord>,
}
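With the move to `crate::repository`, the two-`Option` struct above is replaced by an enum with exactly one representation per page version; the variants are visible in the `DeltaLayer` match earlier in this diff. A sketch of the implied new shape (derives assumed from the old struct and the `PageVersion::des` calls):

    // A page version is now either a full image or a WAL record, never both.
    #[derive(Debug, Clone, Serialize, Deserialize)]
    pub enum PageVersion {
        /// an 8kb page image
        Page(Bytes),
        /// WAL record to get from the previous page version to this one
        Wal(WALRecord),
    }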
///
/// Data needed to reconstruct a page version
///
/// 'page_img' is the old base image of the page to start the WAL replay with.
/// It can be None, if the first WAL record initializes the page (will_init)
/// 'records' contains the records to apply over the base image.
///
pub struct PageReconstructData {
pub records: Vec<(Lsn, WALRecord)>,
pub page_img: Option<Bytes>,
}
/// Return value from Layer::get_page_reconstruct_data
pub enum PageReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue(Lsn),
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing(Lsn),
}
///
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
@@ -104,6 +53,8 @@ pub enum PageReconstructResult {
/// in-memory and on-disk layers.
///
pub trait Layer: Send + Sync {
fn get_tenant_id(&self) -> ZTenantId;
/// Identify the timeline this relish belongs to
fn get_timeline_id(&self) -> ZTimelineId;


@@ -1,4 +1,3 @@
use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
@@ -10,6 +9,7 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
pub mod basebackup;
pub mod branches;
pub mod buffered_repository;
pub mod http;
pub mod layered_repository;
pub mod page_service;
@@ -18,7 +18,7 @@ pub mod relish_storage;
pub mod repository;
pub mod restore_local_repo;
pub mod tenant_mgr;
pub mod tenant_threads;
pub mod toast_store;
pub mod waldecoder;
pub mod walreceiver;
pub mod walredo;
@@ -32,14 +32,17 @@ pub mod defaults {
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
// would be more appropriate. But a low value forces the code to be exercised more,
// which is good for now to trigger bugs.
// Minimal size of WAL records chain to trigger materialization of the page
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
pub const DEFAULT_UPLOAD_DISTANCE: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_UPLOAD_PERIOD: Duration = Duration::from_secs(2500);
pub const DEFAULT_RECONSTRUCT_THRESHOLD: u64 = 0;
pub const DEFAULT_GC_HORIZON: u64 = 1024;
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
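A hedged sketch of what the materialization threshold above means in practice (helper names are assumptions, not code from this diff):

    // A page gets materialized once the chain of WAL records behind it
    // exceeds checkpoint_distance bytes.
    if wal_chain_size(blk) > conf.checkpoint_distance {
        materialize_page(blk)?;
    }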
@@ -66,6 +69,9 @@ pub struct PageServerConf {
// page server crashes.
pub checkpoint_distance: u64,
pub checkpoint_period: Duration,
pub upload_period: Duration,
pub upload_distance: u64,
pub reconstruct_threshold: u64,
pub gc_horizon: u64,
pub gc_period: Duration,
@@ -93,7 +99,7 @@ impl PageServerConf {
//
fn tenants_path(&self) -> PathBuf {
self.workdir.join(TENANTS_SEGMENT_NAME)
self.workdir.join("tenants")
}
fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
@@ -117,7 +123,7 @@ impl PageServerConf {
}
fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
self.tenant_path(tenantid).join("timelines")
}
fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
@@ -151,6 +157,9 @@ impl PageServerConf {
daemonize: false,
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_period: Duration::from_secs(10),
upload_distance: defaults::DEFAULT_UPLOAD_DISTANCE,
upload_period: defaults::DEFAULT_UPLOAD_PERIOD,
reconstruct_threshold: defaults::DEFAULT_RECONSTRUCT_THRESHOLD,
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
@@ -165,15 +174,6 @@ impl PageServerConf {
}
}
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
pub enum CheckpointConfig {
// Flush in-memory data that is older than this
Distance(u64),
// Flush all in-memory data
Forced,
}
/// External relish storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone)]
pub struct RelishStorageConfig {


@@ -630,16 +630,14 @@ impl postgres_backend::Handler for PageServerHandler {
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
// since these handlers for tenant/branch commands are deprecated (in favor of http based ones)
// just use false in place of include non incremental logical size
let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
let branches_buf = serde_json::to_vec(&branches)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("tenant_list") {
let tenants = crate::tenant_mgr::list_tenants()?;
let tenants = crate::branches::get_tenants(self.conf)?;
let tenants_buf = serde_json::to_vec(&tenants)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
@@ -695,67 +693,21 @@ impl postgres_backend::Handler for PageServerHandler {
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"layer_relfiles_total"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"meta_total"),
RowDescriptor::int8_col(b"meta_removed"),
RowDescriptor::int8_col(b"meta_dropped"),
RowDescriptor::int8_col(b"pages_total"),
RowDescriptor::int8_col(b"pages_removed"),
RowDescriptor::int8_col(b"pages_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
result
.ondisk_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
result
.ondisk_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(result.meta_total.to_string().as_bytes()),
Some(result.meta_removed.to_string().as_bytes()),
Some(result.meta_dropped.to_string().as_bytes()),
Some(result.pages_total.to_string().as_bytes()),
Some(result.pages_removed.to_string().as_bytes()),
Some(result.pages_dropped.to_string().as_bytes()),
Some(result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;


@@ -1,138 +1,60 @@
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
//! This particular module serves as a public API border between pageserver and the internal storage machinery.
//! No other modules from this tree are supposed to be used directly by the external code.
//! Abstractions for the page server to store its relish layer data in the external storage.
//!
//! There are a few components the storage machinery consists of:
//! * the [`RelishStorage`] trait, a CRUD-like generic abstraction to use for adapting external storages, with a few implementations:
//! * [`local_fs`] allows using the local file system as an external storage
//! * [`rust_s3`] uses an AWS S3 bucket as an external storage
//! The main purpose of this module subtree is to provide a set of abstractions to manage the storage state
//! in a way that is optimal for the page server.
//!
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
//!
//! * a public API to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_upload`]
//!
//! Here's a schematic overview of all interactions the relish storage and the rest of the pageserver perform:
//!
//! +------------------------+ +--------->-------+
//! | | - - - (init async loop) - - - -> | |
//! | | | |
//! | | -------------------------------> | async |
//! | pageserver | (schedule frozen layer upload) | upload/download |
//! | | | loop |
//! | | <------------------------------- | |
//! | | (register downloaded layers) | |
//! +------------------------+ +---------<-------+
//! |
//! |
//! CRUD layer file operations |
//! (upload/download/delete/list, etc.) |
//! V
//! +------------------------+
//! | |
//! | [`RelishStorage`] impl |
//! | |
//! | pageserver assumes it |
//! | owns exclusive write |
//! | access to this storage |
//! +------------------------+
//!
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
//! If the storage sync loop was successfully started before, pageserver schedules the new image uploads after every checkpoint.
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
//!
//! The storage logic considers an `image` to be a set of local files fully representing a certain timeline at a given moment (identified with `disk_consistent_lsn`).
//! A timeline can change its state by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
//! by the storage upload, if enabled.
//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
//! No files are deleted from either local or remote storage; only the ones missing locally/remotely get downloaded/uploaded, and the local metadata file is overwritten
//! when a newer timeline is downloaded.
//!
//! Meanwhile, the loop inits the storage connection and checks the remote files stored.
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
//!
//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
//!
//! NOTES:
//! * pageserver assumes it has exclusive write access to the relish storage. Multiple pageservers can be separated in the same storage, if supported
//! (i.e. by using different directories in the local filesystem external storage), but that is entirely up to the storage implementation and not covered by the trait API.
//!
//! * the uploads do not happen right after pageserver startup; they are registered when
//! 1. pageserver does the checkpoint, which happens further in the future after the server start
//! 2. pageserver loads the timeline from disk for the first time
//!
//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with higher priority could be waiting already
//!
//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happen on an image scale: a big set of relish files,
//! enough to represent (and recover, if needed) a certain timeline state. On the contrary, all internal storage CRUD calls are made per relish file from those images.
//! This way, the synchronization is able to download the image partially, if some state was synced before, but exposes correctly synced images only.
//! The abstractions hide multiple custom external storage API implementations,
//! such as AWS S3, local filesystem, etc., located in the submodules.
mod local_fs;
mod rust_s3;
mod storage_sync;
/// A queue-based storage with the background machinery behind it to synchronize
/// local page server layer files with external storage.
mod synced_storage;
use std::{
path::{Path, PathBuf},
thread,
};
use std::{path::Path, thread};
use anyhow::{anyhow, ensure, Context};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use anyhow::Context;
pub use self::storage_sync::schedule_timeline_upload;
use self::{local_fs::LocalFs, rust_s3::S3};
use crate::{
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
PageServerConf, RelishStorageKind,
};
pub use self::synced_storage::schedule_timeline_upload;
use self::{local_fs::LocalFs, rust_s3::RustS3};
use crate::{PageServerConf, RelishStorageKind};
/// Based on the config, initiates the remote storage connection and starts a separate thread
/// that ensures that pageserver and the remote storage are in sync with each other.
/// If no external storage configuration is given, no thread or storage initialization is done.
pub fn run_storage_sync_thread(
config: &'static PageServerConf,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
match &config.relish_storage_config {
Some(relish_storage_config) => {
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
let handle = match &relish_storage_config.storage {
RelishStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
match &relish_storage_config.storage {
RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
config,
LocalFs::new(root.clone(), &config.workdir)?,
LocalFs::new(root.clone())?,
max_concurrent_sync,
),
RelishStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
config,
S3::new(s3_config, &config.workdir)?,
RustS3::new(s3_config)?,
max_concurrent_sync,
),
};
handle.map(Some)
}
}
None => Ok(None),
}
}
/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations with storage files.
#[async_trait::async_trait]
trait RelishStorage: Send + Sync {
/// A way to uniquely reference relish in the remote storage.
pub trait RelishStorage: Send + Sync {
type RelishStoragePath;
/// Attempts to derive the storage path out of the local path, if the latter is correct.
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath>;
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath>;
/// Gets the layered storage information about the given entry.
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo>;
/// Lists all items the storage has right now.
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
async fn download_relish<W: 'static + std::io::Write + Send>(
&self,
from: &Self::RelishStoragePath,
@@ -143,7 +65,6 @@ trait RelishStorage: Send + Sync {
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
/// Streams the local file contents into remote into the remote storage entry.
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
&self,
from: &mut tokio::io::BufReader<R>,
@@ -151,173 +72,16 @@ trait RelishStorage: Send + Sync {
) -> anyhow::Result<()>;
}
/// Information about a certain remote storage entry.
#[derive(Debug, PartialEq, Eq)]
struct RemoteRelishInfo {
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
/// Path in the pageserver workdir where the file should go to.
download_destination: PathBuf,
is_metadata: bool,
}
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
if prefix == path {
anyhow::bail!(
"Prefix and the path are equal, cannot strip: '{}'",
prefix.display()
)
} else {
path.strip_prefix(prefix).with_context(|| {
fn strip_workspace_prefix<'a>(
page_server_workdir: &'a Path,
relish_local_path: &'a Path,
) -> anyhow::Result<&'a Path> {
relish_local_path
.strip_prefix(page_server_workdir)
.with_context(|| {
format!(
"Path '{}' is not prefixed with '{}'",
path.display(),
prefix.display(),
"Unexpected: relish local path '{}' is not relevant to server workdir",
relish_local_path.display(),
)
})
}
}
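Since `strip_workspace_prefix` is just `Path::strip_prefix` with a friendlier error, its contract is easy to state; a hedged example with illustrative paths:

    // The relish path relative to the workdir becomes its storage-relative destination.
    let rel = strip_workspace_prefix(
        Path::new("/data/pageserver"),
        Path::new("/data/pageserver/tenants/t1/timelines/tl1/layer_file"),
    )?;
    assert_eq!(rel, Path::new("tenants/t1/timelines/tl1/layer_file"));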
fn parse_ids_from_path<'a, R: std::fmt::Display>(
path_segments: impl Iterator<Item = &'a str>,
path_log_representation: &R,
) -> anyhow::Result<(ZTenantId, ZTimelineId)> {
let mut segments = path_segments.skip_while(|&segment| segment != TENANTS_SEGMENT_NAME);
let tenants_segment = segments.next().ok_or_else(|| {
anyhow!(
"Found no '{}' segment in the storage path '{}'",
TENANTS_SEGMENT_NAME,
path_log_representation
)
})?;
ensure!(
tenants_segment == TENANTS_SEGMENT_NAME,
"Failed to extract '{}' segment from storage path '{}'",
TENANTS_SEGMENT_NAME,
path_log_representation
);
let tenant_id = segments
.next()
.ok_or_else(|| {
anyhow!(
"Found no tenant id in the storage path '{}'",
path_log_representation
)
})?
.parse::<ZTenantId>()
.with_context(|| {
format!(
"Failed to parse tenant id from storage path '{}'",
path_log_representation
)
})?;
let timelines_segment = segments.next().ok_or_else(|| {
anyhow!(
"Found no '{}' segment in the storage path '{}'",
TIMELINES_SEGMENT_NAME,
path_log_representation
)
})?;
ensure!(
timelines_segment == TIMELINES_SEGMENT_NAME,
"Failed to extract '{}' segment from storage path '{}'",
TIMELINES_SEGMENT_NAME,
path_log_representation
);
let timeline_id = segments
.next()
.ok_or_else(|| {
anyhow!(
"Found no timeline id in the storage path '{}'",
path_log_representation
)
})?
.parse::<ZTimelineId>()
.with_context(|| {
format!(
"Failed to parse timeline id from storage path '{}'",
path_log_representation
)
})?;
Ok((tenant_id, timeline_id))
}
/// A set of common test utils to share in unit tests inside the module tree.
#[cfg(test)]
mod test_utils {
use std::path::{Path, PathBuf};
use anyhow::ensure;
use crate::{
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
/// Gives a timeline path with pageserver workdir stripped off.
pub fn relative_timeline_path(harness: &RepoHarness) -> anyhow::Result<PathBuf> {
let timeline_path = harness.timeline_path(&TIMELINE_ID);
Ok(timeline_path
.strip_prefix(&harness.conf.workdir)?
.to_path_buf())
}
/// Creates a path with custom tenant id in one of its segments.
/// Useful for emulating paths with wrong ids.
pub fn custom_tenant_id_path(
path_with_tenant_id: &Path,
new_tenant_id: &str,
) -> anyhow::Result<PathBuf> {
let mut new_path = PathBuf::new();
let mut is_tenant_id = false;
let mut tenant_id_replaced = false;
for segment in path_with_tenant_id {
match segment.to_str() {
Some(TENANTS_SEGMENT_NAME) => is_tenant_id = true,
Some(_tenant_id_str) if is_tenant_id => {
is_tenant_id = false;
new_path.push(new_tenant_id);
tenant_id_replaced = true;
continue;
}
_ => {}
}
new_path.push(segment)
}
ensure!(tenant_id_replaced, "Found no tenant id segment to replace");
Ok(new_path)
}
/// Creates a path with custom timeline id in one of its segments.
/// Useful for emulating paths with wrong ids.
pub fn custom_timeline_id_path(
path_with_timeline_id: &Path,
new_timeline_id: &str,
) -> anyhow::Result<PathBuf> {
let mut new_path = PathBuf::new();
let mut is_timeline_id = false;
let mut timeline_id_replaced = false;
for segment in path_with_timeline_id {
match segment.to_str() {
Some(TIMELINES_SEGMENT_NAME) => is_timeline_id = true,
Some(_timeline_id_str) if is_timeline_id => {
is_timeline_id = false;
new_path.push(new_timeline_id);
timeline_id_replaced = true;
continue;
}
_ => {}
}
new_path.push(segment)
}
ensure!(
timeline_id_replaced,
"Found no timeline id segment to replace"
);
Ok(new_path)
}
}


@@ -1,82 +0,0 @@
# Non-implementation details
This document describes the current state of the backup system in pageserver: existing limitations and concerns, why some things are done the way they are, and the future development plans.
A detailed description of how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../relish_storage.rs) and its submodules.
Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
## Approach
Backup functionality is a new component that appeared well after the core DB functionality was implemented.
Pageserver layer functionality is also quite volatile at the moment; there's a risk its local file management changes over time.
To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
This way, backups are managed in the background without directly affecting other pageserver parts: the backup and restoration process may lag behind, but eventually catches up with reality. To track that, a set of Prometheus metrics is exposed from pageserver.
## What's done
Current implementation
* provides remote storage wrappers for AWS S3 and local FS
* uploads layers frozen by the pageserver checkpoint thread
* downloads and registers layers found on the remote storage but missing locally
No serious optimisation or performance testing has been done yet; the feature is disabled by default and gets polished over time.
The plan is to resolve all currently open questions and prepare the feature to be enabled by default in cloud environments.
### Peculiarities
As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
Here's the list of known compromises with comments:
* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
AWS charges both per API call and for traffic, and layers are expected to be updated frequently, so this model is most probably inefficient.
Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
The storage sync API operates on whole images when backing up or restoring, so we're free to repack the layer contents the way we want, which will most probably be done later.
* no proper file comparison
Currently, every layer contains an `Lsn` in its name, mapping the data it holds to a certain DB state.
Images with the same ids and different `Lsn`'s are then compared; files are considered equal if their local file paths are equal (for remote files, the "local file path" is their download destination).
No file content verification is done currently, but it should be.
AWS S3 returns file checksums during the `list` operation, so those could be used to verify backup consistency, but that needs further research, since the current pageserver implementation would also need to deal with layer file checksums.
For now, due to this, we consider local workdir files the source of truth: they are never removed, and remote files are adjusted instead when image files mismatch.
* no proper retry management
Now, the storage sync attempts to redo the upload/download operation for the image files that failed.
No proper task eviction or backpressure is implemented currently: the tasks will stay in the queue forever, reattempting the downloads.
This will be fixed when more details of the file consistency model are agreed on.
* sad rust-s3 api
rust-s3 is not very pleasant to use:
1. it returns `anyhow::Result`, so it's hard to distinguish "missing file" cases from "no connection" ones, for instance
2. at least one function in its API that we need (`get_object_stream`) has the `async` keyword yet blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
3. it's a prerelease library with unclear maintenance status
4. noisy on debug level
But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking.
* gc and branches are ignored
So far, we don't consider non-main images and don't adjust the remote storage based on GC thread loop results.
Only checkpointer loop affects the remote storage.
* more layers should be downloaded on demand
Since we download and load remote layers into pageserver, a need for those layers' ancestors may arise.
Most probably, a downloaded image's ancestors are not present locally either, but currently there's no logic for downloading such ancestors and their metadata,
so the pageserver is unable to respond properly to requests touching such ancestors.
To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and handle the state changes.
[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about initial state management updates needed.
* no IT tests
Automated S3 testing is currently lacking, since there's no convenient way to enable backups during the tests.
After it's fixed, benchmark runs should also be carried out to find bottlenecks.


@@ -1,11 +1,13 @@
//! Local filesystem relish storage.
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
//!
//! This storage is used in pageserver tests, but can also be used in cases when a certain persistent
//! volume is mounted to the local FS.
//! The page server already stores layer data locally when freezing it.
//! This storage provides a way to
//!
//! * test things locally in a simple way
//! * compare both binary sets
//! * help validate the relish storage API
use std::{
ffi::OsStr,
future::Future,
io::Write,
path::{Path, PathBuf},
@@ -14,31 +16,25 @@ use std::{
use anyhow::{bail, Context};
use tokio::{fs, io};
use tracing::*;
use super::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo};
use crate::layered_repository::METADATA_FILE_NAME;
use super::{strip_workspace_prefix, RelishStorage};
pub struct LocalFs {
pageserver_workdir: &'static Path,
root: PathBuf,
}
impl LocalFs {
/// Attempts to create local FS relish storage, along with the storage root directory.
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
/// Attempts to create local FS relish storage; also creates the directory provided, if it does not exist.
pub fn new(root: PathBuf) -> anyhow::Result<Self> {
if !root.exists() {
std::fs::create_dir_all(&root).with_context(|| {
format!(
"Failed to create all directories in the given root path '{}'",
"Failed to create all directories in the given root path {}",
root.display(),
)
})?;
}
Ok(Self {
pageserver_workdir,
root,
})
Ok(Self { root })
}
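A hedged construction example (the mount point is illustrative):

    // Local-FS storage rooted at a mounted persistent volume; the root
    // directory is created if it does not exist yet.
    let storage = LocalFs::new(PathBuf::from("/mnt/relish-backup"))?;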
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
@@ -59,29 +55,11 @@ impl LocalFs {
impl RelishStorage for LocalFs {
type RelishStoragePath = PathBuf;
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
Ok(self.root.join(
strip_path_prefix(self.pageserver_workdir, local_path)
.context("local path does not belong to this storage")?,
))
}
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
let is_metadata =
storage_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME);
let relative_path = strip_path_prefix(&self.root, storage_path)
.context("local path does not belong to this storage")?;
let download_destination = self.pageserver_workdir.join(relative_path);
let (tenant_id, timeline_id) = parse_ids_from_path(
relative_path.iter().filter_map(|segment| segment.to_str()),
&relative_path.display(),
)?;
Ok(RemoteRelishInfo {
tenant_id,
timeline_id,
download_destination,
is_metadata,
})
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath> {
Ok(strip_workspace_prefix(page_server_workdir, relish_local_path)?.to_path_buf())
}
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
@@ -94,7 +72,6 @@ impl RelishStorage for LocalFs {
mut to: std::io::BufWriter<W>,
) -> anyhow::Result<std::io::BufWriter<W>> {
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
let updated_buffer = tokio::task::spawn_blocking(move || {
let mut source = std::io::BufReader::new(
@@ -127,7 +104,7 @@ impl RelishStorage for LocalFs {
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
let file_path = self.resolve_in_storage(path)?;
if file_path.exists() && file_path.is_file() {
Ok(fs::remove_file(file_path).await?)
Ok(tokio::fs::remove_file(file_path).await?)
} else {
bail!(
"File '{}' either does not exist or is not a file",
@@ -175,12 +152,12 @@ where
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path = dir_entry.path();
if file_type.is_symlink() {
debug!("{:?} us a symlink, skipping", entry_path)
log::debug!("{:?} us a symlink, skipping", entry_path)
} else if file_type.is_dir() {
paths.extend(get_all_files(entry_path).await?.into_iter())
} else {
@@ -206,369 +183,7 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
),
};
if !target_dir.exists() {
fs::create_dir_all(target_dir).await?;
tokio::fs::create_dir_all(target_dir).await?;
}
Ok(())
}
#[cfg(test)]
mod pure_tests {
use crate::{
relish_storage::test_utils::{
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
use super::*;
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("storage_path_positive")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("relish_name");
let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
assert_eq!(
expected_path,
storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
"Relish paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
);
Ok(())
}
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
match storage.storage_path(mismatching_path) {
Ok(wrong_path) => panic!(
"Expected path '{}' to error, but got storage path: {:?}",
mismatching_path.display(),
wrong_path,
),
Err(e) => format!("{:?}", e),
}
}
let repo_harness = RepoHarness::create("storage_path_negatives")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root,
};
let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
assert!(error_string.contains("does not belong to this storage"));
assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
let mismatching_path_str = "/something/else";
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
assert!(
error_message.contains(mismatching_path_str),
"Error should mention wrong path"
);
assert!(
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
"Error should mention server workdir"
);
assert!(error_message.contains("does not belong to this storage"));
Ok(())
}
#[test]
fn info_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("info_positive")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let name = "not a metadata";
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: local_path.clone(),
is_metadata: false,
},
storage
.info(&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?))
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta relish"
);
let local_metadata_path = repo_harness
.timeline_path(&TIMELINE_ID)
.join(METADATA_FILE_NAME);
let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: local_metadata_path,
is_metadata: true,
},
storage
.info(&remote_metadata_path)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote metadata file"
);
Ok(())
}
#[test]
fn info_negatives() -> anyhow::Result<()> {
#[track_caller]
#[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.info` parameter requirements
fn storage_info_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
match storage.info(storage_path) {
Ok(wrong_info) => panic!(
"Expected storage path input {:?} to cause an error, but got relish info: {:?}",
storage_path, wrong_info,
),
Err(e) => format!("{:?}", e),
}
}
let repo_harness = RepoHarness::create("info_negatives")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let totally_wrong_path = "wrong_wrong_wrong";
let error_message = storage_info_error(&storage, &PathBuf::from(totally_wrong_path));
assert!(error_message.contains(totally_wrong_path));
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let relative_relish_path =
custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?
.join("wrong_tenant_id_name");
let wrong_tenant_id_path = storage_root.join(&relative_relish_path);
let error_message = storage_info_error(&storage, &wrong_tenant_id_path);
assert!(
error_message.contains(relative_relish_path.to_str().unwrap()),
"Error message '{}' does not contain the expected substring",
error_message
);
let relative_relish_path =
custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?
.join("wrong_timeline_id_name");
let wrong_timeline_id_path = storage_root.join(&relative_relish_path);
let error_message = storage_info_error(&storage, &wrong_timeline_id_path);
assert!(
error_message.contains(relative_relish_path.to_str().unwrap()),
"Error message '{}' does not contain the expected substring",
error_message
);
Ok(())
}
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
let storage_root = PathBuf::from("somewhere").join("else");
let dummy_storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root,
};
let storage_path = dummy_storage.storage_path(&original_path)?;
let download_destination = dummy_storage.info(&storage_path)?.download_destination;
assert_eq!(
original_path, download_destination,
"'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
);
Ok(())
}
}
#[cfg(test)]
mod fs_tests {
use crate::{
relish_storage::test_utils::relative_timeline_path, repository::repo_harness::RepoHarness,
};
use super::*;
use tempfile::tempdir;
#[tokio::test]
async fn upload_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("upload_relish")?;
let storage = create_storage()?;
let mut source = create_file_for_upload(
&storage.pageserver_workdir.join("whatever"),
"whatever_contents",
)
.await?;
let target_path = PathBuf::from("/").join("somewhere").join("else");
match storage.upload_relish(&mut source, &target_path).await {
Ok(()) => panic!("Should not allow storing files with wrong target path"),
Err(e) => {
let message = format!("{:?}", e);
assert!(message.contains(&target_path.display().to_string()));
assert!(message.contains("does not belong to the current storage"));
}
}
assert!(storage.list_relishes().await?.is_empty());
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
assert_eq!(
storage.list_relishes().await?,
vec![target_path_1.clone()],
"Should list a single file after first upload"
);
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
assert_eq!(
list_relishes_sorted(&storage).await?,
vec![target_path_1.clone(), target_path_2.clone()],
"Should list a two different files after second upload"
);
// match storage.upload_relish(&mut source, &target_path_1).await {
// Ok(()) => panic!("Should not allow reuploading storage files"),
// Err(e) => {
// let message = format!("{:?}", e);
// assert!(message.contains(&target_path_1.display().to_string()));
// assert!(message.contains("File exists"));
// }
// }
assert_eq!(
list_relishes_sorted(&storage).await?,
vec![target_path_1, target_path_2],
"Should list a two different files after all upload attempts"
);
Ok(())
}
fn create_storage() -> anyhow::Result<LocalFs> {
let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
Ok(storage)
}
#[tokio::test]
async fn download_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_relish")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
let contents_bytes = storage
.download_relish(&upload_target, std::io::BufWriter::new(Vec::new()))
.await?
.into_inner()?;
let contents = String::from_utf8(contents_bytes)?;
assert_eq!(
dummy_contents(upload_name),
contents,
"We should upload and download the same contents"
);
let non_existing_path = PathBuf::from("somewhere").join("else");
match storage
.download_relish(&non_existing_path, std::io::BufWriter::new(Vec::new()))
.await
{
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&non_existing_path.display().to_string()));
}
}
Ok(())
}
#[tokio::test]
async fn delete_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("delete_relish")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
storage.delete_relish(&upload_target).await?;
assert!(storage.list_relishes().await?.is_empty());
match storage.delete_relish(&upload_target).await {
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&upload_target.display().to_string()));
}
}
Ok(())
}
async fn upload_dummy_file(
harness: &RepoHarness,
storage: &LocalFs,
name: &str,
) -> anyhow::Result<PathBuf> {
let storage_path = storage
.root
.join(relative_timeline_path(harness)?)
.join(name);
storage
.upload_relish(
&mut create_file_for_upload(
&storage.pageserver_workdir.join(name),
&dummy_contents(name),
)
.await?,
&storage_path,
)
.await?;
Ok(storage_path)
}
async fn create_file_for_upload(
path: &Path,
contents: &str,
) -> anyhow::Result<io::BufReader<fs::File>> {
std::fs::create_dir_all(path.parent().unwrap())?;
let mut file_for_writing = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(path)?;
write!(file_for_writing, "{}", contents)?;
drop(file_for_writing);
Ok(io::BufReader::new(
fs::OpenOptions::new().read(true).open(&path).await?,
))
}
fn dummy_contents(name: &str) -> String {
format!("contents for {}", name)
}
async fn list_relishes_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
let mut relishes = storage.list_relishes().await?;
relishes.sort();
Ok(relishes)
}
}

View File

@@ -1,45 +1,35 @@
//! AWS S3 relish storage wrapper around `rust_s3` library.
//! Currently does not allow multiple pageservers to use the same bucket concurrently: relishes are
//! placed in the root of the bucket.
//! A wrapper around the AWS S3 client library `rust_s3`, to be used as a relish storage.
use std::{
io::Write,
path::{Path, PathBuf},
};
use std::io::Write;
use std::path::Path;
use anyhow::Context;
use s3::{bucket::Bucket, creds::Credentials, region::Region};
use crate::{
layered_repository::METADATA_FILE_NAME,
relish_storage::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo},
relish_storage::{strip_workspace_prefix, RelishStorage},
S3Config,
};
const S3_FILE_SEPARATOR: char = '/';
#[derive(Debug, Eq, PartialEq)]
#[derive(Debug)]
pub struct S3ObjectKey(String);
impl S3ObjectKey {
fn key(&self) -> &str {
&self.0
}
fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
}
}
/// AWS S3 relish storage.
pub struct S3 {
pageserver_workdir: &'static Path,
pub struct RustS3 {
bucket: Bucket,
}
impl S3 {
impl RustS3 {
/// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
let region = aws_config
.bucket_region
.parse::<Region>()
@@ -59,17 +49,19 @@ impl S3 {
credentials,
)
.context("Failed to create the s3 bucket")?,
pageserver_workdir,
})
}
}
#[async_trait::async_trait]
impl RelishStorage for S3 {
impl RelishStorage for RustS3 {
type RelishStoragePath = S3ObjectKey;
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath> {
let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
let mut key = String::new();
for segment in relative_path {
key.push(S3_FILE_SEPARATOR);
@@ -78,21 +70,6 @@ impl RelishStorage for S3 {
Ok(S3ObjectKey(key))
}
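Because the loop pushes a separator before every segment, keys always start with `/`; a hedged example with illustrative paths:

    // workdir "/data/ps", relish at "/data/ps/tenants/t1/layer" => key "/tenants/t1/layer"
    let key = RustS3::derive_destination(
        Path::new("/data/ps"),
        Path::new("/data/ps/tenants/t1/layer"),
    )?;
    assert_eq!(key.key(), "/tenants/t1/layer");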
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
let storage_path_key = &storage_path.0;
let is_metadata =
storage_path_key.ends_with(&format!("{}{}", S3_FILE_SEPARATOR, METADATA_FILE_NAME));
let download_destination = storage_path.download_destination(self.pageserver_workdir);
let (tenant_id, timeline_id) =
parse_ids_from_path(storage_path_key.split(S3_FILE_SEPARATOR), storage_path_key)?;
Ok(RemoteRelishInfo {
tenant_id,
timeline_id,
download_destination,
is_metadata,
})
}
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
let list_response = self
.bucket
@@ -124,11 +101,11 @@ impl RelishStorage for S3 {
))
} else {
tokio::task::spawn_blocking(move || {
to.flush().context("Failed to flush the download buffer")?;
to.flush().context("Failed to fluch the downoad buffer")?;
Ok::<_, anyhow::Error>(to)
})
.await
.context("Failed to join the download buffer flush task")?
.context("Failed to joim the download buffer flush task")?
}
}
@@ -170,226 +147,3 @@ impl RelishStorage for S3 {
}
}
}
#[cfg(test)]
mod tests {
use crate::{
relish_storage::test_utils::{
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
use super::*;
#[test]
fn download_destination() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination")?;
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
let key = S3ObjectKey(format!(
"{}{}",
S3_FILE_SEPARATOR,
relative_path
.iter()
.map(|segment| segment.to_str().unwrap())
.collect::<Vec<_>>()
.join(&S3_FILE_SEPARATOR.to_string()),
));
assert_eq!(
local_path,
key.download_destination(&repo_harness.conf.workdir),
"Download destination should consist of s3 path joined with the pageserver workdir prefix"
);
Ok(())
}
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("storage_path_positive")?;
let segment_1 = "matching";
let segment_2 = "relish";
let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
let expected_key = S3ObjectKey(format!(
"{SEPARATOR}{}{SEPARATOR}{}",
segment_1,
segment_2,
SEPARATOR = S3_FILE_SEPARATOR,
));
let actual_key = dummy_storage(&repo_harness.conf.workdir)
.storage_path(local_path)
.expect("Matching path should map to S3 path normally");
assert_eq!(
expected_key,
actual_key,
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
);
Ok(())
}
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
match storage.storage_path(mismatching_path) {
Ok(wrong_key) => panic!(
"Expected path '{}' to error, but got S3 key: {:?}",
mismatching_path.display(),
wrong_key,
),
Err(e) => e.to_string(),
}
}
let repo_harness = RepoHarness::create("storage_path_negatives")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
assert!(
error_message.contains("Prefix and the path are equal"),
"Message '{}' does not contain the required string",
error_message
);
let mismatching_path = PathBuf::from("somewhere").join("else");
let error_message = storage_path_error(&storage, &mismatching_path);
assert!(
error_message.contains(mismatching_path.to_str().unwrap()),
"Error should mention wrong path"
);
assert!(
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
"Error should mention server workdir"
);
assert!(
error_message.contains("is not prefixed with"),
"Message '{}' does not contain a required string",
error_message
);
Ok(())
}
#[test]
fn info_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("info_positive")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
is_metadata: false,
},
storage
.info(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta relish"
);
let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
is_metadata: true,
},
storage
.info(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote metadata file"
);
Ok(())
}
#[test]
fn info_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_info_error(storage: &S3, s3_key: &S3ObjectKey) -> String {
match storage.info(s3_key) {
Ok(wrong_info) => panic!(
"Expected key {:?} to error, but got relish info: {:?}",
s3_key, wrong_info,
),
Err(e) => e.to_string(),
}
}
let repo_harness = RepoHarness::create("info_negatives")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let totally_wrong_path = "wrong_wrong_wrong";
let error_message =
storage_info_error(&storage, &S3ObjectKey(totally_wrong_path.to_string()));
assert!(error_message.contains(totally_wrong_path));
let wrong_tenant_id = create_s3_key(
&custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?.join("name"),
);
let error_message = storage_info_error(&storage, &wrong_tenant_id);
assert!(error_message.contains(&wrong_tenant_id.0));
let wrong_timeline_id = create_s3_key(
&custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?.join("name"),
);
let error_message = storage_info_error(&storage, &wrong_timeline_id);
assert!(error_message.contains(&wrong_timeline_id.0));
Ok(())
}
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
let key = dummy_storage.storage_path(&original_path)?;
let download_destination = dummy_storage.info(&key)?.download_destination;
assert_eq!(
original_path, download_destination,
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
);
Ok(())
}
fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
S3 {
pageserver_workdir,
bucket: Bucket::new(
"dummy-bucket",
"us-east-1".parse().unwrap(),
Credentials::anonymous().unwrap(),
)
.unwrap(),
}
}
fn create_s3_key(relative_relish_path: &Path) -> S3ObjectKey {
S3ObjectKey(
relative_relish_path
.iter()
.fold(String::new(), |mut path_string, segment| {
path_string.push(S3_FILE_SEPARATOR);
path_string.push_str(segment.to_str().unwrap());
path_string
}),
)
}
}

File diff suppressed because it is too large


@@ -0,0 +1,57 @@
use std::time::Duration;
use std::{collections::BinaryHeap, sync::Mutex, thread};
use crate::tenant_mgr;
use crate::{relish_storage::RelishStorage, PageServerConf};
lazy_static::lazy_static! {
static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
}
pub fn schedule_timeline_upload(_local_timeline: ()) {
// UPLOAD_QUEUE
// .lock()
// .unwrap()
// .push(SyncTask::Upload(local_timeline))
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
enum SyncTask {}
pub fn run_storage_sync_thread<
P: std::fmt::Debug,
S: 'static + RelishStorage<RelishStoragePath = P>,
>(
config: &'static PageServerConf,
relish_storage: S,
max_concurrent_sync: usize,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let handle = thread::Builder::new()
.name("Queue based relish storage sync".to_string())
.spawn(move || {
while !tenant_mgr::shutdown_requested() {
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
log::debug!("Upload queue length: {}", queue_accessor.len());
let next_task = queue_accessor.pop();
drop(queue_accessor);
match next_task {
Some(task) => runtime.block_on(async {
// suppress warnings
let _ = (config, task, &relish_storage, max_concurrent_sync);
todo!("omitted for brevity")
}),
None => {
thread::sleep(Duration::from_secs(1));
continue;
}
}
}
log::debug!("Queue based relish storage sync thread shut down");
Ok(())
})?;
Ok(Some(handle))
}
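`BinaryHeap` is a max-heap, so once `SyncTask` (still the empty enum above) gains variants, the derived `Ord` decides scheduling: later-declared variants compare greater and are popped first. A hedged sketch of what that could look like (the variants are assumptions, not in this diff):

    // With derive(Ord), Upload compares greater than Download because it is
    // declared later, so pending uploads are popped before downloads.
    #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
    enum SyncTask {
        Download(std::path::PathBuf),
        Upload(std::path::PathBuf),
    }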


@@ -1,5 +1,4 @@
use crate::relish::*;
use crate::CheckpointConfig;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use serde::{Deserialize, Serialize};
@@ -25,9 +24,9 @@ pub trait Repository: Send + Sync {
/// Branch a timeline
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
/// perform one garbage collection iteration, removing old data files from disk.
/// this function is periodically called by the gc thread.
/// also it can be explicitly requested through page server api 'do_gc' command.
/// perform one garbage collection iteration.
/// garbage collection is periodically performed by the gc thread,
/// but it can be explicitly requested through page server api.
///
/// 'timelineid' specifies the timeline to GC, or None for all.
/// `horizon` specifies the delta from the last LSN within which all object versions are preserved (the PITR interval).
@@ -40,54 +39,32 @@ pub trait Repository: Send + Sync {
horizon: u64,
checkpoint_before_gc: bool,
) -> Result<GcResult>;
/// perform one checkpoint iteration, flushing in-memory data on disk.
/// this function is periodically called by the checkpointer thread.
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
}
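
As a hedged sketch of how a caller might drive one pass of gc_iteration (the repo handle, timeline id, and horizon value are assumptions for illustration; Repository and ZTimelineId come from the surrounding module):

fn run_gc_once(repo: &dyn Repository, timeline_id: ZTimelineId) -> anyhow::Result<()> {
    // horizon = 1 GiB: preserve all object versions within 1 GiB of WAL
    // behind the last LSN (the PITR window); `false` skips the extra
    // checkpoint before collecting.
    let result = repo.gc_iteration(Some(timeline_id), 1024 * 1024 * 1024, false)?;
    println!("GC removed {} page versions in {:?}", result.pages_removed, result.elapsed);
    Ok(())
}
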
///
/// Result of performing GC
///
#[derive(Default)]
#[derive(Default, Debug)]
pub struct GcResult {
pub ondisk_relfiles_total: u64,
pub ondisk_relfiles_needed_by_cutoff: u64,
pub ondisk_relfiles_needed_by_branches: u64,
pub ondisk_relfiles_not_updated: u64,
pub ondisk_relfiles_needed_as_tombstone: u64,
pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped
pub meta_removed: u64, // removed versions beyond PITR interval for which new page image exists
pub meta_dropped: u64, // removed versions beyond PITR interval of dropped relations
pub meta_total: u64, // total number of metaobject version histories
pub ondisk_nonrelfiles_total: u64,
pub ondisk_nonrelfiles_needed_by_cutoff: u64,
pub ondisk_nonrelfiles_needed_by_branches: u64,
pub ondisk_nonrelfiles_not_updated: u64,
pub ondisk_nonrelfiles_needed_as_tombstone: u64,
pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped
pub pages_removed: u64, // removed versions beyond PITR interval for which new page image exists
pub pages_dropped: u64, // removed versions beyond PITR interval of dropped relations
pub pages_total: u64, // total number of page version histories
pub elapsed: Duration,
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.ondisk_relfiles_total += other.ondisk_relfiles_total;
self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;
self.ondisk_nonrelfiles_total += other.ondisk_nonrelfiles_total;
self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone;
self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;
self.meta_total += other.meta_total;
self.meta_removed += other.meta_removed;
self.meta_dropped += other.meta_dropped;
self.pages_total += other.pages_total;
self.pages_removed += other.pages_removed;
self.pages_dropped += other.pages_dropped;
self.elapsed += other.elapsed;
}
}
@@ -116,14 +93,18 @@ pub trait Timeline: Send + Sync {
/// Get a list of all existing relations
/// Pass RelTag to get relation objects or None to get nonrels.
fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashSet<RelishTag>>;
/// Get a list of all existing relations in given tablespace and database.
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelishTag>>;
/// Get a list of all existing non-relational objects
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
///
/// Export data as delta and image layers between 'start_lsn' and 'end_lsn'. The
/// start is inclusive, and end is exclusive.
///
fn export_timeline(&self, start_lsn: Lsn, end_lsn: Lsn) -> Result<()>;
/// Get the LSN where this branch was created
fn get_ancestor_lsn(&self) -> Lsn;
@@ -139,7 +120,6 @@ pub trait Timeline: Send + Sync {
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
fn get_disk_consistent_lsn(&self) -> Lsn;
/// Mutate the timeline with a [`TimelineWriter`].
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
@@ -149,7 +129,7 @@ pub trait Timeline: Send + Sync {
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
fn checkpoint(&self) -> Result<()>;
/// Retrieve current logical size of the timeline
///
@@ -187,6 +167,16 @@ pub trait TimelineWriter: Deref<Target = dyn Timeline> {
/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
/// Previous last record LSN is stored alongside the latest and can be read.
fn advance_last_record_lsn(&self, lsn: Lsn);
///
/// Complete all delayed commits and advance disk_consistent_lsn
///
fn checkpoint(&self) -> Result<()>;
///
/// Import data from layer files
///
fn import_timeline(&self, snapshot_lsn: Lsn) -> Result<()>;
}
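
Taken together, a hedged sketch of the intended writer flow — mirroring how import_timeline_from_postgres_datadir (further down in this diff) drives it; the function and variable names here are assumed:

fn ingest_up_to(timeline: &dyn Timeline, lsn: Lsn) -> anyhow::Result<()> {
    let writer = timeline.writer();
    // ... put_wal_record / put_page_image calls happen here ...
    writer.advance_last_record_lsn(lsn); // wakes wait_lsn() callers
    writer.checkpoint()?; // complete delayed commits, advance disk_consistent_lsn
    Ok(())
}
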
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -219,102 +209,37 @@ impl WALRecord {
}
}
#[cfg(test)]
pub mod repo_harness {
use std::{fs, path::PathBuf};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PageVersion {
/// an 8kb page image
Page(Bytes),
/// WAL record to get from previous page version to this one.
Wal(WALRecord),
}
use crate::{
layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
walredo::{WalRedoError, WalRedoManager},
PageServerConf,
};
///
/// Data needed to reconstruct a page version
///
/// 'page_img' is the old base image of the page to start the WAL replay with.
/// It can be None if the first WAL record initializes the page (will_init).
/// 'records' contains the records to apply over the base image.
///
pub struct PageReconstructData {
pub records: Vec<(Lsn, WALRecord)>,
pub page_img: Option<Bytes>,
}
use super::*;
use hex_literal::hex;
use zenith_utils::zid::ZTenantId;
pub const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
pub const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
pub fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(8192, 0);
buf.freeze()
}
pub struct RepoHarness {
pub conf: &'static PageServerConf,
pub tenant_id: ZTenantId,
}
impl RepoHarness {
pub fn create(test_name: &'static str) -> Result<Self> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
Ok(Self { conf, tenant_id })
}
pub fn load(&self) -> Box<dyn Repository> {
let walredo_mgr = Arc::new(TestRedoManager);
Box::new(LayeredRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,
false,
))
}
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
}
// Mock WAL redo manager that doesn't do much
struct TestRedoManager;
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} blk {} to get to {}, with {} and {} records",
rel,
blknum,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
}
/// Return value from Layer::get_page_reconstruct_data
pub enum PageReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue(Lsn),
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing(Lsn),
}
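
A hedged sketch of the lookup protocol these variants imply — the layer lookup helper and the exact method signature are assumptions, not taken from this diff:

// Walk backwards through the layer chain until one layer has everything.
let mut lsn = request_lsn;
loop {
    let layer = layer_map.find(blknum, lsn); // assumed lookup helper
    match layer.get_page_reconstruct_data(blknum, lsn, &mut data) {
        PageReconstructResult::Complete => break, // have base image + records
        PageReconstructResult::Continue(prev_lsn) => lsn = prev_lsn, // collect more from predecessor
        PageReconstructResult::Missing(at) => anyhow::bail!("no data for page at {}", at),
    }
}
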
///
@@ -323,10 +248,21 @@ pub mod repo_harness {
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use super::repo_harness::*;
use super::*;
use crate::layered_repository::METADATA_FILE_NAME;
use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
use crate::buffered_repository::{BufferedRepository, METADATA_FILE_NAME};
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use hex_literal::hex;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
use std::fs;
use std::path::PathBuf;
use zenith_utils::zid::ZTenantId;
const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
/// Arbitrary relation tag, for testing.
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
@@ -342,6 +278,16 @@ mod tests {
forknum: 0,
});
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(8192, 0);
buf.freeze()
}
fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
let incremental = timeline.get_current_logical_size();
let non_incremental = timeline
@@ -353,6 +299,45 @@ mod tests {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
struct RepoHarness {
conf: &'static PageServerConf,
tenant_id: ZTenantId,
}
impl RepoHarness {
fn create(test_name: &'static str) -> Result<Self> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
Ok(Self { conf, tenant_id })
}
fn load(&self) -> Box<dyn Repository> {
let walredo_mgr = Arc::new(TestRedoManager);
Box::new(BufferedRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,
false,
))
}
fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
}
#[test]
fn test_relsize() -> Result<()> {
let repo = RepoHarness::create("test_relsize")?.load();
@@ -719,7 +704,7 @@ mod tests {
.contains(&TESTREL_A));
// Run checkpoint and garbage collection and check that it's still not visible
newtline.checkpoint(CheckpointConfig::Forced)?;
newtline.checkpoint()?;
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
assert!(!newtline
@@ -864,4 +849,33 @@ mod tests {
Ok(())
}
// Mock WAL redo manager that doesn't do much
struct TestRedoManager;
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} blk {} to get to {}, with {} and {} records",
rel,
blknum,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
}
}

View File

@@ -11,7 +11,7 @@ use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use anyhow::{anyhow, bail, Result};
use bytes::{Buf, Bytes};
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use crate::relish::*;
@@ -126,7 +126,6 @@ pub fn import_timeline_from_postgres_datadir(
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
}
// TODO: Scan pg_tblspc
writer.advance_last_record_lsn(lsn);
// Import WAL. This is needed even when starting from a shutdown checkpoint, because
@@ -140,6 +139,7 @@ pub fn import_timeline_from_postgres_datadir(
lsn,
&mut pg_control.checkPointCopy.clone(),
)?;
writer.checkpoint()?;
Ok(())
}
@@ -416,7 +416,6 @@ pub fn save_decoded_record(
if checkpoint.update_next_xid(decoded.xl_xid) {
*checkpoint_modified = true;
}
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
@@ -426,14 +425,38 @@ pub fn save_decoded_record(
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
});
if blk.apply_image
&& blk.has_image
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
{
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
let img_offs = blk.bimg_offset as usize;
let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize);
image.extend_from_slice(&recdata[img_offs..img_offs + img_len]);
let rec = WALRecord {
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
// Compression of WAL is not yet supported
assert!((blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0);
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
if blk.hole_length != 0 {
let tail = image.split_off(blk.hole_offset as usize);
image.resize(image.len() + blk.hole_length as usize, 0u8);
image.unsplit(tail);
}
image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?;
} else {
let rec = WALRecord {
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
}
}
let mut buf = decoded.record.clone();
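
The hole handling above stitches the zero-filled "hole" back into the middle of a full-page image that was stored without it. A standalone sketch of that BytesMut splice, with toy sizes instead of BLCKSZ:

use bytes::BytesMut;

fn main() {
    // Image stored without its hole: bytes before + bytes after.
    let mut image = BytesMut::from(&b"headtail"[..]);
    let (hole_offset, hole_length) = (4usize, 3usize);
    let tail = image.split_off(hole_offset);    // image = "head", tail = "tail"
    image.resize(image.len() + hole_length, 0); // re-insert the zeroed hole
    image.unsplit(tail);                        // image = "head\0\0\0tail"
    assert_eq!(&image[..], b"head\x00\x00\x00tail");
}
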

View File

@@ -2,21 +2,20 @@
//! page server.
use crate::branches;
use crate::layered_repository::LayeredRepository;
use crate::buffered_repository::BufferedRepository;
use crate::repository::{Repository, Timeline};
use crate::tenant_threads;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use anyhow::{anyhow, bail, Context, Result};
use lazy_static::lazy_static;
use log::{debug, info};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::fs;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, MutexGuard};
use std::thread::JoinHandle;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
lazy_static! {
@@ -28,8 +27,8 @@ struct Tenant {
repo: Option<Arc<dyn Repository>>,
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
pub enum TenantState {
#[derive(Debug)]
enum TenantState {
// This tenant only exists in cloud storage. It cannot be accessed.
CloudOnly,
// This tenant exists in cloud storage, and we are currently downloading it to local disk.
@@ -41,12 +40,10 @@ pub enum TenantState {
// This tenant exists on local disk, and the layer map has been loaded into memory.
// The local disk might have some newer files that don't exist in cloud storage yet.
Active,
// Tenant is active, but there is no walreceiver connection.
Idle,
// This tenant exists on local disk, and the layer map has been loaded into memory.
// The local disk might have some newer files that don't exist in cloud storage yet.
// The tenant cannot be accessed anymore for any reason, but graceful shutdown.
Stopping,
//Stopping,
}
impl fmt::Display for TenantState {
@@ -55,8 +52,6 @@ impl fmt::Display for TenantState {
TenantState::CloudOnly => f.write_str("CloudOnly"),
TenantState::Downloading => f.write_str("Downloading"),
TenantState::Active => f.write_str("Active"),
TenantState::Idle => f.write_str("Idle"),
TenantState::Stopping => f.write_str("Stopping"),
}
}
}
@@ -65,6 +60,19 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
TENANTS.lock().unwrap()
}
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
uploader_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
}
// Logically these handles belong to Repository,
// but it's just simpler to store them separately
lazy_static! {
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
Mutex::new(HashMap::new());
}
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
pub fn init(conf: &'static PageServerConf) {
@@ -91,22 +99,34 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
// Set up an object repository, for actual data storage.
let repo = Arc::new(LayeredRepository::new(
let repo = Arc::new(BufferedRepository::new(
conf,
Arc::new(walredo_mgr),
tenant_id,
true,
));
let checkpointer_handle = BufferedRepository::launch_checkpointer_thread(conf, repo.clone());
let gc_handle = BufferedRepository::launch_gc_thread(conf, repo.clone());
let uploader_handle = BufferedRepository::launch_upload_thread(conf, repo.clone());
let mut handles = TENANT_HANDLES.lock().unwrap();
let h = TenantHandleEntry {
checkpointer_handle: Some(checkpointer_handle),
gc_handle: Some(gc_handle),
uploader_handle: Some(uploader_handle),
};
handles.insert(tenant_id, h);
let mut m = access_tenants();
let tenant = m.get_mut(&tenant_id).unwrap();
tenant.repo = Some(repo);
tenant.state = TenantState::Active;
// TODO Start these threads only if tenant actively receives some WAL
tenant_threads::start_tenant_threads(conf, tenant_id);
}
// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
// Relevant PR: https://github.com/zenithdb/zenith/pull/686
pub fn register_relish_download(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
@@ -120,20 +140,14 @@ pub fn register_relish_download(
{
let mut m = access_tenants();
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
state: TenantState::Downloading,
repo: None,
});
let mut tenant = m.get_mut(&tenant_id).unwrap();
tenant.state = TenantState::Downloading;
match &tenant.repo {
Some(repo) => {
init_timeline(repo.as_ref(), timeline_id);
tenant.state = TenantState::Active;
return;
Some(repo) => init_timeline(repo.as_ref(), timeline_id),
None => {
log::info!("Initialize new repo");
}
None => log::warn!("Initialize new repo"),
}
tenant.state = TenantState::Active;
}
// init repo updates Tenant state
@@ -154,23 +168,29 @@ pub fn shutdown_requested() -> bool {
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
}
pub fn stop_tenant_threads(tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
debug!("checkpointer for tenant {} has stopped", tenantid);
h.uploader_handle.take().map(JoinHandle::join);
debug!("uploader for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
debug!("gc for tenant {} has stopped", tenantid);
}
}
pub fn shutdown_all_tenants() -> Result<()> {
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
let tenantids = list_tenantids()?;
for tenantid in &tenantids {
set_tenant_state(*tenantid, TenantState::Stopping)?;
}
for tenantid in tenantids {
// Wait for checkpointer and GC to finish their job
tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
stop_tenant_threads(tenantid);
let repo = get_repository_for_tenant(tenantid)?;
debug!("shutdown tenant {}", tenantid);
repo.shutdown()?;
}
Ok(())
}
@@ -202,40 +222,13 @@ pub fn create_repository_for_tenant(
Ok(())
}
// If tenant is not found in the repository, return CloudOnly state
pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
let m = access_tenants();
match m.get(&tenantid) {
Some(tenant) => tenant.state,
None => TenantState::CloudOnly,
}
}
pub fn set_tenant_state(tenantid: ZTenantId, state: TenantState) -> Result<TenantState> {
let mut m = access_tenants();
let tenant = m.get_mut(&tenantid);
match tenant {
Some(tenant) => {
if state == TenantState::Idle && tenant.state != TenantState::Active {
// Only Active tenant can become Idle
return Ok(tenant.state);
}
info!("set_tenant_state: {} -> {}", tenant.state, state);
tenant.state = state;
Ok(tenant.state)
}
None => bail!("Tenant not found for tenant {}", tenantid),
}
}
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
let m = access_tenants();
let tenant = m
.get(&tenantid)
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?;
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid));
match &tenant.repo {
match &tenant.unwrap().repo {
Some(repo) => Ok(Arc::clone(repo)),
None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
}
@@ -259,23 +252,3 @@ fn list_tenantids() -> Result<Vec<ZTenantId>> {
})
.collect()
}
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
#[serde(with = "hex")]
pub id: ZTenantId,
pub state: TenantState,
}
pub fn list_tenants() -> Result<Vec<TenantInfo>> {
let m = access_tenants();
m.iter()
.map(|v| {
let (id, tenant) = v;
Ok(TenantInfo {
id: *id,
state: tenant.state,
})
})
.collect()
}

View File

@@ -1,125 +0,0 @@
//! This module contains functions to serve per-tenant background processes,
//! such as checkpointer and GC
use crate::tenant_mgr;
use crate::tenant_mgr::TenantState;
use crate::CheckpointConfig;
use crate::PageServerConf;
use anyhow::Result;
use lazy_static::lazy_static;
use std::collections::HashMap;
use std::sync::Mutex;
use std::thread::JoinHandle;
use std::time::Duration;
use tracing::*;
use zenith_utils::zid::ZTenantId;
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
}
// Preserve handles to wait for thread completion
// at shutdown
lazy_static! {
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
Mutex::new(HashMap::new());
}
pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
// ensure that old threads are stopped
wait_for_tenant_threads_to_stop(tenantid);
let checkpointer_handle = std::thread::Builder::new()
.name("Checkpointer thread".into())
.spawn(move || {
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
})
.ok();
let gc_handle = std::thread::Builder::new()
.name("GC thread".into())
.spawn(move || {
gc_loop(tenantid, conf).expect("GC thread died");
})
.ok();
// TODO handle thread errors if any
let mut handles = TENANT_HANDLES.lock().unwrap();
let h = TenantHandleEntry {
checkpointer_handle,
gc_handle,
};
handles.insert(tenantid, h);
}
pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
trace!("checkpointer for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
trace!("gc for tenant {} has stopped", tenantid);
}
handles.remove(&tenantid);
}
///
/// Checkpointer thread's main loop
///
fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
break;
}
std::thread::sleep(conf.checkpoint_period);
trace!("checkpointer thread for tenant {} waking up", tenantid);
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
// bytes of WAL since last checkpoint.
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
}
trace!(
"checkpointer thread stopped for tenant {} state is {}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}
///
/// GC thread's main loop
///
fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
break;
}
trace!("gc thread for tenant {} waking up", tenantid);
// Garbage collect old files that are not needed for PITR anymore
if conf.gc_horizon > 0 {
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.gc_iteration(None, conf.gc_horizon, false).unwrap();
}
// TODO Write it in more adequate way using
// condvar.wait_timeout() or something
let mut sleep_time = conf.gc_period.as_secs();
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == TenantState::Active {
sleep_time -= 1;
std::thread::sleep(Duration::from_secs(1));
}
}
trace!(
"GC thread stopped for tenant {} state is {}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}

View File

@@ -0,0 +1,258 @@
use anyhow::{anyhow, Result};
use lz4_flex;
use std::convert::TryInto;
use std::ops::{Bound, RangeBounds};
use std::path::Path;
use yakv::storage::{Key, Storage, StorageConfig, StorageIterator, Transaction, Value};
const TOAST_SEGMENT_SIZE: usize = 2 * 1024;
const CACHE_SIZE: usize = 32 * 1024; // 256MB
//const CACHE_SIZE: usize = 128 * 1024; // 1GB
///
/// Toast storage consists of two KV databases: one for storing the main index
/// and a second for storing sliced BLOBs (values larger than 2kb).
/// BLOBs and main data are stored in different databases to improve
/// data locality and reduce key size for TOAST segments.
///
pub struct ToastStore {
db: Storage, // key-value database
}
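
Concretely, every user key gets a 4-byte big-endian suffix: n_segments (u16) followed by segno (u16), with [0u8; 4] marking a small, non-toasted value. A sketch of encoding and decoding that suffix, matching what put() and the iterators below do:

use std::convert::TryInto;

fn encode_suffix(mut key: Vec<u8>, n_segments: u16, segno: u16) -> Vec<u8> {
    key.extend_from_slice(&n_segments.to_be_bytes());
    key.extend_from_slice(&segno.to_be_bytes());
    key
}

fn decode_suffix(key: &[u8]) -> (u16, u16) {
    let n = key.len();
    let n_segments = u16::from_be_bytes(key[n - 4..n - 2].try_into().unwrap());
    let segno = u16::from_be_bytes(key[n - 2..].try_into().unwrap());
    (n_segments, segno)
}

fn main() {
    let key = encode_suffix(b"page/42".to_vec(), 3, 1);
    assert_eq!(decode_suffix(&key), (3, 1));
    // Big-endian segment numbers keep all segments of one value adjacent
    // and ordered in the key space, which the iterators rely on.
}
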
pub struct ToastIterator<'a> {
iter: StorageIterator<'a>,
}
#[derive(Clone, Copy)]
pub struct PageData {
data: [u8; 8192],
}
impl PageData {
pub fn find_first_zero_bit(&self, offs: usize) -> usize {
let bytes = self.data;
for i in offs..8192 {
if bytes[i] != 0xFFu8 {
return i * 8 + bytes[i].trailing_ones() as usize;
}
}
usize::MAX
}
}
impl<'a> Iterator for ToastIterator<'a> {
type Item = Result<(Key, Value)>;
fn next(&mut self) -> Option<Self::Item> {
let mut toast: Option<Vec<u8>> = None;
let mut next_segno = 0u16;
for elem in &mut self.iter {
if let Ok((key, value)) = elem {
let key_len = key.len();
let n_segments =
u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
let key = key[..key_len - 4].to_vec();
if n_segments != 0 {
// TOAST
assert_eq!(segno, next_segno);
if next_segno == 0 {
toast = Some(Vec::with_capacity(n_segments as usize * TOAST_SEGMENT_SIZE))
}
toast.as_mut().unwrap().extend_from_slice(&value);
next_segno = segno + 1;
if next_segno == n_segments {
let res = lz4_flex::decompress_size_prepended(&toast.unwrap());
return Some(if let Ok(decompressed_data) = res {
Ok((key, decompressed_data))
} else {
Err(anyhow!(res.unwrap_err()))
});
}
} else {
return Some(Ok((key, value)));
}
} else {
return Some(elem);
}
}
assert_eq!(next_segno, 0);
None
}
}
impl<'a> DoubleEndedIterator for ToastIterator<'a> {
fn next_back(&mut self) -> Option<Self::Item> {
let mut toast: Option<Vec<u8>> = None;
let mut next_segno = 0u16;
while let Some(elem) = self.iter.next_back() {
if let Ok((key, value)) = elem {
assert!(!value.is_empty());
let key_len = key.len();
let n_segments =
u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
let key = key[..key_len - 4].to_vec();
if n_segments != 0 {
// TOAST
assert!(segno + 1 == next_segno || next_segno == 0);
if next_segno == 0 {
let len = (n_segments - 1) as usize * TOAST_SEGMENT_SIZE + value.len();
let mut vec = vec![0u8; len];
vec[len - value.len()..].copy_from_slice(&value);
toast = Some(vec);
} else {
toast.as_mut().unwrap()[segno as usize * TOAST_SEGMENT_SIZE
..(segno + 1) as usize * TOAST_SEGMENT_SIZE]
.copy_from_slice(&value);
}
next_segno = segno;
if next_segno == 0 {
let toast = toast.unwrap();
assert!(!toast.is_empty());
let res = lz4_flex::decompress_size_prepended(&toast);
return Some(if let Ok(decompressed_data) = res {
Ok((key, decompressed_data))
} else {
Err(anyhow!(res.unwrap_err()))
});
}
} else {
return Some(Ok((key, value)));
}
} else {
return Some(elem);
}
}
assert_eq!(next_segno, 0);
None
}
}
//
// FIXME-KK: not using WAL now. Implement asynchronous or delayed commit.
//
impl ToastStore {
pub fn new(path: &Path) -> Result<ToastStore> {
Ok(ToastStore {
db: Storage::open(
&path.join("pageserver.db"),
StorageConfig {
cache_size: CACHE_SIZE,
nosync: false,
mursiw: true,
},
)?,
})
}
pub fn put(&self, key: Key, value: Value) -> Result<()> {
let mut tx = self.db.start_transaction();
self.tx_remove(&mut tx, &key)?;
let value_len = value.len();
let mut key = key;
if value_len >= TOAST_SEGMENT_SIZE {
let compressed_data = lz4_flex::compress_prepend_size(&value);
let compressed_data_len = compressed_data.len();
let mut offs: usize = 0;
let mut segno = 0u16;
let n_segments =
((compressed_data_len + TOAST_SEGMENT_SIZE - 1) / TOAST_SEGMENT_SIZE) as u16;
assert!(n_segments != 0);
key.extend_from_slice(&n_segments.to_be_bytes());
key.extend_from_slice(&[0u8; 2]);
let key_len = key.len();
while offs + TOAST_SEGMENT_SIZE < compressed_data_len {
key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
tx.put(
&key,
&compressed_data[offs..offs + TOAST_SEGMENT_SIZE].to_vec(),
)?;
offs += TOAST_SEGMENT_SIZE;
segno += 1;
}
key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
tx.put(&key, &compressed_data[offs..].to_vec())?;
} else {
key.extend_from_slice(&[0u8; 4]);
tx.put(&key, &value)?;
}
tx.delay()?;
Ok(())
}
pub fn commit(&mut self) -> Result<()> {
let tx = self.db.start_transaction();
tx.commit()?;
Ok(())
}
pub fn iter(&self) -> ToastIterator<'_> {
self.range(..)
}
pub fn range<R: RangeBounds<Key>>(&self, range: R) -> ToastIterator<'_> {
let from = match range.start_bound() {
Bound::Included(key) => {
let mut key = key.clone();
key.extend_from_slice(&[0u8; 4]);
Bound::Included(key)
}
Bound::Excluded(key) => {
let mut key = key.clone();
key.extend_from_slice(&[0u8; 4]);
Bound::Excluded(key)
}
_ => Bound::Unbounded,
};
let till = match range.end_bound() {
Bound::Included(key) => {
let mut key = key.clone();
key.extend_from_slice(&[0xFFu8; 4]);
Bound::Included(key)
}
Bound::Excluded(key) => {
let mut key = key.clone();
key.extend_from_slice(&[0xFFu8; 4]);
Bound::Excluded(key)
}
_ => Bound::Unbounded,
};
ToastIterator {
iter: self.db.range((from, till)),
}
}
pub fn remove(&self, key: Key) -> Result<()> {
let mut tx = self.db.start_transaction();
self.tx_remove(&mut tx, &key)?;
tx.delay()
}
pub fn tx_remove(&self, tx: &mut Transaction, key: &[u8]) -> Result<()> {
let mut min_key = key.to_vec();
let mut max_key = key.to_vec();
min_key.extend_from_slice(&[0u8; 4]);
max_key.extend_from_slice(&[0xFFu8; 4]);
let mut iter = tx.range(&min_key..&max_key);
if let Some(entry) = iter.next() {
let mut key = entry?.0;
let key_len = key.len();
let n_segments = u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
if n_segments != 0 {
// TOAST
for i in 0..n_segments {
key[key_len - 2..].copy_from_slice(&i.to_be_bytes());
tx.remove(&key)?;
}
} else {
tx.remove(&key)?;
}
}
Ok(())
}
pub fn size(&self) -> u64 {
self.db.get_database_info().db_used
}
}
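
A hedged usage sketch of the store (the path and keys are illustrative): put() compresses and slices large values under delayed commit, and an explicit commit() makes the whole batch durable at once:

use std::path::Path;

fn demo() -> anyhow::Result<()> {
    let mut store = ToastStore::new(Path::new("/tmp/toast-demo"))?;
    store.put(b"page/42".to_vec(), vec![1u8; 10_000])?; // > 2kb: compressed and sliced
    store.put(b"meta/1".to_vec(), vec![2u8; 100])?;     // small: stored inline
    store.commit()?; // flush all delayed transactions
    for entry in store.iter() {
        let (key, value) = entry?; // TOAST segments are reassembled transparently
        println!("{} bytes under {:?}", value.len(), key);
    }
    Ok(())
}
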

View File

@@ -229,17 +229,18 @@ pub struct DecodedBkpBlock {
pub blkno: u32,
/* copy of the fork_flags field from the XLogRecordBlockHeader */
flags: u8,
pub flags: u8,
/* Information on full-page image, if any */
has_image: bool, /* has image, even for consistency checking */
pub has_image: bool, /* has image, even for consistency checking */
pub apply_image: bool, /* has image that should be restored */
pub will_init: bool, /* record doesn't need previous page version to apply */
//char *bkp_image;
hole_offset: u16,
hole_length: u16,
bimg_len: u16,
bimg_info: u8,
pub hole_offset: u16,
pub hole_length: u16,
pub bimg_offset: u32,
pub bimg_len: u16,
pub bimg_info: u8,
/* Buffer holding the rmgr-specific data associated with this block */
has_data: bool,
@@ -859,8 +860,19 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
}
// 3. Decode blocks.
let mut ptr = record.len() - buf.remaining();
for blk in blocks.iter_mut() {
if blk.has_image {
blk.bimg_offset = ptr as u32;
ptr += blk.bimg_len as usize;
}
if blk.has_data {
ptr += blk.data_len as usize;
}
}
// We don't need them, so just skip blocks_total_len bytes
buf.advance(blocks_total_len as usize);
assert_eq!(ptr, record.len() - buf.remaining());
let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;

View File

@@ -284,14 +284,12 @@ fn walreceiver_main(
if let Some(last_lsn) = status_update {
// TODO: More thought should go into what values are sent here.
let last_lsn = PgLsn::from(u64::from(last_lsn));
// We are using the disk consistent LSN as `write_lsn`, i.e. the LSN at which the page
// server can guarantee persistence of all received data. The safekeeper is then free to
// remove WAL preceding `write_lsn`: it will not be requested again by this page server.
let write_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
let write_lsn = last_lsn;
let flush_lsn = last_lsn;
let apply_lsn = PgLsn::from(0);
let ts = SystemTime::now();
const NO_REPLY: u8 = 0;
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
}

View File

@@ -22,6 +22,7 @@ use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use rand::Rng;
use serde::Serialize;
use std::fs;
use std::fs::OpenOptions;
@@ -53,6 +54,8 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
use postgres_ffi::pg_constants;
use postgres_ffi::XLogRecord;
const WAL_REDO_WORKERS: usize = 1;
///
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
///
@@ -140,7 +143,7 @@ pub struct PostgresRedoManager {
conf: &'static PageServerConf,
runtime: tokio::runtime::Runtime,
process: Mutex<Option<PostgresRedoProcess>>,
workers: [Mutex<Option<PostgresRedoProcess>>; WAL_REDO_WORKERS],
}
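
A note on the workers field: [Mutex::new(None); WAL_REDO_WORKERS] would not compile because Mutex is not Copy, so the code maps over a [(); N] unit array to construct each slot independently (array::map is stable since Rust 1.55, the toolchain pinned in CI). A minimal sketch with an illustrative size:

use std::sync::Mutex;

const WAL_REDO_WORKERS: usize = 4; // illustrative size

fn main() {
    // Mapping a `[(); N]` array builds each non-Copy element separately.
    let workers: [Mutex<Option<u32>>; WAL_REDO_WORKERS] =
        [(); WAL_REDO_WORKERS].map(|_| Mutex::new(None));
    *workers[1].lock().unwrap() = Some(7);
    assert_eq!(*workers[1].lock().unwrap(), Some(7));
}
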
#[derive(Debug)]
@@ -153,13 +156,6 @@ struct WalRedoRequest {
records: Vec<(Lsn, WALRecord)>,
}
impl WalRedoRequest {
// Can this request be served by zenith redo functions,
// or does it need to be passed to the wal-redo postgres process?
fn can_apply_in_zenith(&self) -> bool {
!matches!(self.rel, RelishTag::Relation(_))
}
}
/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
@@ -168,8 +164,6 @@ pub enum WalRedoError {
#[error("cannot perform WAL redo now")]
InvalidState,
#[error("cannot perform WAL redo for this request")]
InvalidRequest,
}
///
@@ -191,6 +185,7 @@ impl WalRedoManager for PostgresRedoManager {
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let start_time;
let lock_time;
let end_time;
let request = WalRedoRequest {
@@ -202,16 +197,11 @@ impl WalRedoManager for PostgresRedoManager {
};
start_time = Instant::now();
let result;
if request.can_apply_in_zenith() {
result = self.handle_apply_request_zenith(&request);
end_time = Instant::now();
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
} else {
let mut process_guard = self.process.lock().unwrap();
let lock_time = Instant::now();
let result = {
let mut process_guard = self.workers[rand::thread_rng().gen_range(0..WAL_REDO_WORKERS)]
.lock()
.unwrap();
lock_time = Instant::now();
// launch the WAL redo process on first use
if process_guard.is_none() {
@@ -222,14 +212,13 @@ impl WalRedoManager for PostgresRedoManager {
}
let process = process_guard.as_mut().unwrap();
result = self
.runtime
.block_on(self.handle_apply_request_postgres(process, &request));
self.runtime
.block_on(self.handle_apply_request(process, &request))
};
end_time = Instant::now();
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
end_time = Instant::now();
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
}
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
result
}
@@ -253,52 +242,18 @@ impl PostgresRedoManager {
runtime,
tenantid,
conf,
process: Mutex::new(None),
workers: [(); WAL_REDO_WORKERS].map(|_| Mutex::new(None)),
}
}
///
/// Process one request for WAL redo using wal-redo postgres
/// Process one request for WAL redo.
///
async fn handle_apply_request_postgres(
async fn handle_apply_request(
&self,
process: &mut PostgresRedoProcess,
request: &WalRedoRequest,
) -> Result<Bytes, WalRedoError> {
let blknum = request.blknum;
let lsn = request.lsn;
let base_img = request.base_img.clone();
let records = &request.records;
let nrecords = records.len();
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
if let RelishTag::Relation(rel) = request.rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
let duration = start.elapsed();
debug!(
"postgres applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
apply_result.map_err(WalRedoError::IoError)
} else {
Err(WalRedoError::InvalidRequest)
}
}
///
/// Process one request for WAL redo using custom zenith code
///
fn handle_apply_request_zenith(&self, request: &WalRedoRequest) -> Result<Bytes, WalRedoError> {
let rel = request.rel;
let blknum = request.blknum;
let lsn = request.lsn;
@@ -310,158 +265,178 @@ impl PostgresRedoManager {
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
// If full-page image is provided, then use it...
page.extend_from_slice(&fpi[..]);
if let RelishTag::Relation(rel) = rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
} else {
// otherwise initialize page with zeros
page.extend_from_slice(&ZERO_PAGE);
}
// Apply all collected WAL records
for (_lsn, record) in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
// If full-page image is provided, then use it...
page.extend_from_slice(&fpi[..]);
} else {
// otherwise initialize page with zeros
page.extend_from_slice(&ZERO_PAGE);
}
// Apply all collected WAL records
for (_lsn, record) in records {
let mut buf = record.rec.clone();
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
assert!(
slru == SlruKind::Clog,
"Not valid XACT relish tag {:?}",
rel
);
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
}
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
}
}
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multixact operations
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if let RelishTag::Slru {
slru,
segno: rec_segno,
} = rel
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
assert!(
slru == SlruKind::Clog,
"Not valid XACT relish tag {:?}",
rel
);
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
if slru == SlruKind::MultiXactMembers {
for i in 0..xlrec.nmembers {
let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if segno == rec_segno && rpageno == blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &=
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
}
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multixact operations
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if let RelishTag::Slru {
slru,
segno: rec_segno,
} = rel
{
if slru == SlruKind::MultiXactMembers {
for i in 0..xlrec.nmembers {
let pageno =
i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if segno == rec_segno && rpageno == blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &= !(((1
<< pg_constants::MXACT_MEMBER_BITS_PER_XACT)
- 1)
<< bshift);
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
}
}
} else {
// Multixact offsets SLRU
let offs = (xlrec.mid
% pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
}
} else {
// Multixact offsets SLRU
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
panic!();
}
} else {
panic!();
}
} else {
panic!();
}
}
}
apply_result = Ok::<Bytes, Error>(page.freeze());
apply_result = Ok::<Bytes, Error>(page.freeze());
}
let duration = start.elapsed();
let result: Result<Bytes, WalRedoError>;
debug!(
"zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}",
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
apply_result.map_err(WalRedoError::IoError)
if let Err(e) = apply_result {
error!("could not apply WAL records: {:#}", e);
result = Err(WalRedoError::IoError(e));
} else {
let img = apply_result.unwrap();
result = Ok(img);
}
// The caller is responsible for sending the response
result
}
}

View File

@@ -12,14 +12,7 @@ pub struct DatabaseInfo {
pub port: u16,
pub dbname: String,
pub user: String,
pub password: Option<String>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct ProxyAuthResult {
pub ready: bool,
pub error: Option<String>,
pub conn_info: Option<DatabaseInfo>,
pub password: String,
}
impl DatabaseInfo {
@@ -31,23 +24,12 @@ impl DatabaseInfo {
.next()
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
}
}
impl From<DatabaseInfo> for tokio_postgres::Config {
fn from(db_info: DatabaseInfo) -> Self {
let mut config = tokio_postgres::Config::new();
config
.host(&db_info.host)
.port(db_info.port)
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(password) = db_info.password {
config.password(password);
}
config
pub fn conn_string(&self) -> String {
format!(
"dbname={} user={} password={}",
self.dbname, self.user, self.password
)
}
}
@@ -62,25 +44,22 @@ impl CPlaneApi {
database: &str,
md5_response: &[u8],
salt: &[u8; 4],
psql_session_id: &str,
) -> Result<ProxyAuthResult> {
) -> Result<DatabaseInfo> {
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
url.query_pairs_mut()
.append_pair("login", user)
.append_pair("database", database)
.append_pair("md5response", std::str::from_utf8(md5_response)?)
.append_pair("salt", &hex::encode(salt))
.append_pair("psql_session_id", psql_session_id);
.append_pair("salt", &hex::encode(salt));
println!("cplane request: {}", url.as_str());
let resp = reqwest::blocking::get(url)?;
if resp.status().is_success() {
let auth_info: ProxyAuthResult = serde_json::from_str(resp.text()?.as_str())?;
println!("got auth info: #{:?}", auth_info);
Ok(auth_info)
let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
println!("got conn info: #{:?}", conn_info);
Ok(conn_info)
} else {
bail!("Auth failed")
}

View File

@@ -145,18 +145,18 @@ fn main() -> anyhow::Result<()> {
println!("Starting mgmt on {}", state.conf.mgmt_address);
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
let threads = [
let threads = vec![
// Spawn a thread to listen for connections. It will spawn further threads
// for each connection.
thread::Builder::new()
.name("Listener thread".into())
.name("Proxy thread".into())
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
thread::Builder::new()
.name("Mgmt thread".into())
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
];
for t in threads {
for t in threads.into_iter() {
t.join().unwrap()?;
}

View File

@@ -6,6 +6,7 @@ use anyhow::bail;
use tokio_postgres::NoTls;
use rand::Rng;
use std::io::Write;
use std::{io, sync::mpsc::channel, thread};
use zenith_utils::postgres_backend::Stream;
use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
@@ -27,13 +28,11 @@ pub fn thread_main(
println!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
thread::Builder::new()
.name("Proxy thread".into())
.spawn(move || {
if let Err(err) = proxy_conn_main(state, socket) {
println!("error: {}", err);
}
})?;
thread::spawn(move || {
if let Err(err) = proxy_conn_main(state, socket) {
println!("error: {}", err);
}
});
}
}
@@ -75,12 +74,8 @@ pub fn proxy_conn_main(
// This will set conn.existing_user and we can decide on next actions
conn.handle_startup()?;
let mut psql_session_id_buf = [0u8; 8];
rand::thread_rng().fill(&mut psql_session_id_buf);
conn.psql_session_id = hex::encode(psql_session_id_buf);
// both scenarios here should end up producing a database connection string
let conn_info = if conn.is_existing_user() {
let db_info = if conn.is_existing_user() {
conn.handle_existing_user()?
} else {
conn.handle_new_user()?
@@ -88,7 +83,7 @@ pub fn proxy_conn_main(
// XXX: move that inside handle_new_user/handle_existing_user to be able to
// report wrong connection error.
proxy_pass(conn.pgb, conn_info)
proxy_pass(conn.pgb, db_info)
}
impl ProxyConnection {
@@ -160,25 +155,9 @@ impl ProxyConnection {
Ok(())
}
// Wait for a proxy kick from the console with conninfo
fn wait_for_conninfo(&mut self) -> anyhow::Result<DatabaseInfo> {
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
let _ = self
.state
.waiters
.lock()
.unwrap()
.insert(self.psql_session_id.clone(), tx);
// Wait for web console response
// TODO: respond with error to client
rx.recv()?
}
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
// ask password
rand::thread_rng().fill(&mut self.md5_salt);
self.pgb
.write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
self.pgb.state = ProtoState::Authentication; // XXX
@@ -201,41 +180,14 @@ impl ProxyConnection {
self.database.as_str(),
md5_response,
&self.md5_salt,
&self.psql_session_id,
) {
Err(e) => {
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
"cannot authenticate proxy: {}",
e
)))?;
self.pgb
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
bail!("auth failed: {}", e);
}
Ok(auth_info) => {
let conn_info = if auth_info.ready {
// Cluster is ready, so just take `conn_info` and respond to the client.
auth_info
.conn_info
.expect("conn_info should be provided with ready cluster")
} else {
match auth_info.error {
Some(e) => {
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
"cannot authenticate proxy: {}",
e
)))?;
bail!("auth failed: {}", e);
}
None => {
// Cluster exists, but isn't active, await its start and proxy kick
// with `conn_info`.
self.wait_for_conninfo()?
}
}
};
Ok(conn_info) => {
self.pgb
.write_message_noflush(&BeMessage::AuthenticationOk)?;
self.pgb
@@ -251,6 +203,10 @@ impl ProxyConnection {
}
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
let mut psql_session_id_buf = [0u8; 8];
rand::thread_rng().fill(&mut psql_session_id_buf);
self.psql_session_id = hex::encode(psql_session_id_buf);
let hello_message = format!("☀️ Welcome to Zenith!
To proceed with database creation, open the following link:
@@ -269,83 +225,76 @@ databases without opening the browser.
self.pgb
.write_message(&BeMessage::NoticeResponse(hello_message))?;
// We requested the DB creation from the console. Now wait for conninfo
let conn_info = self.wait_for_conninfo()?;
// await for database creation
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
let _ = self
.state
.waiters
.lock()
.unwrap()
.insert(self.psql_session_id.clone(), tx);
// Wait for web console response
// XXX: respond with error to client
let dbinfo = rx.recv()??;
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
"Connecting to database.".to_string(),
))?;
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
Ok(conn_info)
Ok(dbinfo)
}
}
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
let config = tokio_postgres::Config::from(db_info);
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
let _ = config.connect_raw(&mut socket, NoTls).await?;
Ok(socket)
}
/// Concurrently proxy both directions of the client and server connections
fn proxy(
(client_read, client_write): (ReadStream, WriteStream),
(server_read, server_write): (ReadStream, WriteStream),
client_read: ReadStream,
client_write: WriteStream,
server_read: ReadStream,
server_write: WriteStream,
) -> anyhow::Result<()> {
fn do_proxy(mut reader: impl io::Read, mut writer: WriteStream) -> io::Result<u64> {
/// FlushWriter will make sure that every message is sent as soon as possible
struct FlushWriter<W>(W);
impl<W: io::Write> io::Write for FlushWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
// `std::io::copy` is guaranteed to exit if we return an error,
// so we can afford to lose `res` in case `flush` fails
let res = self.0.write(buf);
if res.is_ok() {
self.0.flush()?;
}
res
}
fn flush(&mut self) -> io::Result<()> {
self.0.flush()
}
}
let res = std::io::copy(&mut reader, &mut FlushWriter(&mut writer));
writer.shutdown(std::net::Shutdown::Both)?;
res
fn do_proxy(mut reader: ReadStream, mut writer: WriteStream) -> io::Result<()> {
std::io::copy(&mut reader, &mut writer)?;
writer.flush()?;
writer.shutdown(std::net::Shutdown::Both)
}
let client_to_server_jh = thread::spawn(move || do_proxy(client_read, server_write));
do_proxy(server_read, client_write)?;
client_to_server_jh.join().unwrap()?;
let res1 = do_proxy(server_read, client_write);
let res2 = client_to_server_jh.join().unwrap();
res1?;
res2?;
Ok(())
}
/// Proxy a client connection to a postgres database
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
let db_stream = {
// We'll get rid of this once migration to async is complete
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let db_stream = runtime.block_on(connect_to_db(db_info))?;
let db_stream = db_stream.into_std()?;
db_stream.set_nonblocking(false)?;
let stream = runtime.block_on(connect_to_db(db_info))?.into_std()?;
stream.set_nonblocking(false)?;
stream
};
let db_stream = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
let (db_read, db_write) = db_stream.split();
let db = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
let client = match pgb.into_stream() {
let stream = match pgb.into_stream() {
Stream::Bidirectional(bidi_stream) => bidi_stream,
_ => bail!("invalid stream"),
};
proxy(client.split(), db.split())
let (client_read, client_write) = stream.split();
proxy(client_read, client_write, db_read, db_write)
}

View File

@@ -14,16 +14,12 @@ asyncpg = "*"
cached-property = "*"
[dev-packages]
flake8 = "*"
mypy = "*"
# Behavior may change slightly between versions. These are run continuously,
# so we pin exact versions to avoid surprising breaks. Update if comfortable.
yapf = "==0.31.0"
mypy = "==0.910"
# Non-pinned packages follow.
pipenv = "*"
flake8 = "*"
types-requests = "*"
types-psycopg2 = "*"
[requires]
# we need at least 3.7, but pipenv doesn't allow us to say this directly
# we need at least 3.6, but pipenv doesn't allow us to say this directly
python_version = "3"

156
test_runner/Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "63b72760ef37375186a638066ba0ad5804dbace99ddc503ea654e9749070ab24"
"sha256": "3645ae8d2dcf55bd2a54963c44cfeedf577f3b289d1077365214a80a7f36e643"
},
"pipfile-spec": 6,
"requires": {
@@ -162,14 +162,6 @@
"markers": "python_version >= '3'",
"version": "==3.3"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
],
"markers": "python_version < '3.8'",
"version": "==4.8.1"
},
"iniconfig": {
"hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -299,47 +291,9 @@
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.7"
},
"zipp": {
"hashes": [
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
],
"markers": "python_version >= '3.6'",
"version": "==3.6.0"
}
},
"develop": {
"backports.entry-points-selectable": {
"hashes": [
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
],
"markers": "python_version >= '2.7'",
"version": "==1.1.0"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.10.8"
},
"distlib": {
"hashes": [
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
"sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
],
"version": "==0.3.3"
},
"filelock": {
"hashes": [
"sha256:2b5eb3589e7fdda14599e7eb1a50e09b4cc14f34ed98b8ba56d33bfaafcbef2f",
"sha256:34a9f35f95c441e7b38209775d6e0337f9a3759f3565f6c5798f19618527c76f"
],
"markers": "python_version >= '3.6'",
"version": "==3.3.1"
},
"flake8": {
"hashes": [
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
@@ -348,14 +302,6 @@
"index": "pypi",
"version": "==4.0.1"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
],
"markers": "python_version < '3.8'",
"version": "==4.8.1"
},
"mccabe": {
"hashes": [
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
@@ -399,22 +345,6 @@
],
"version": "==0.4.3"
},
"pipenv": {
"hashes": [
"sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
"sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
],
"index": "pypi",
"version": "==2021.5.29"
},
"platformdirs": {
"hashes": [
"sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
"sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
],
"markers": "python_version >= '3.6'",
"version": "==2.4.0"
},
"pycodestyle": {
"hashes": [
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
@@ -431,14 +361,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.4.0"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
@@ -447,58 +369,6 @@
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
},
"typed-ast": {
"hashes": [
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
],
"markers": "python_version < '3.8'",
"version": "==1.4.3"
},
"types-psycopg2": {
"hashes": [
"sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
"sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
],
"index": "pypi",
"version": "==2.9.1"
},
"types-requests": {
"hashes": [
"sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
"sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
],
"index": "pypi",
"version": "==2.25.11"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
@@ -508,22 +378,6 @@
"index": "pypi",
"version": "==3.10.0.2"
},
"virtualenv": {
"hashes": [
"sha256:10062e34c204b5e4ec5f62e6ef2473f8ba76513a9a617e873f1f8fb4a519d300",
"sha256:bcc17f0b3a29670dd777d6f0755a4c04f28815395bca279cdcb213b97199a6b8"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==20.8.1"
},
"virtualenv-clone": {
"hashes": [
"sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
"sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.5.7"
},
"yapf": {
"hashes": [
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
@@ -531,14 +385,6 @@
],
"index": "pypi",
"version": "==0.31.0"
},
"zipp": {
"hashes": [
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
],
"markers": "python_version >= '3.6'",
"version": "==3.6.0"
}
}
}

View File

@@ -3,13 +3,10 @@
This directory contains integration tests.
Prerequisites:
- Python 3.7 or later
- Development headers may also be needed to build `psycopg2` from source.
- Python 3.7 is recommended if you want to update tests.
- Python 3.6 or later
- Dependencies: install them via `pipenv install`. Note that Debian/Ubuntu
packages are stale, as commonly happens, so manual installation is not
recommended.
The exact version of `pipenv` is not important unless you change dependencies.
Run `pipenv shell` to activate the venv or use `pipenv run` to run a single
command in the venv, e.g. `pipenv run pytest`.
- Zenith and Postgres binaries
@@ -65,87 +62,46 @@ Exit after the first test failure:
`pytest -x ...`
(there are many more pytest options; run `pytest -h` to see them.)
### Writing a test
Every test needs a Zenith Environment, or ZenithEnv, to operate in. A Zenith Environment
is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and
compute Postgres nodes. The connections between them can be configured to use JWT
authentication tokens, and some other configuration options can be tweaked too.
### Building new tests
The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env`
fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
or make other destructive changes in that environment. Also don't assume that
there are no tenants or branches or data in the cluster. For convenience, there is a
branch called `empty`, though. The convention is to create a test-specific branch of
that and load any test data there, instead of the 'main' branch.
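As a minimal sketch of that convention (the test and table names here are hypothetical; `zenith_simple_env` and the `env` helpers are the ones described above):
```python
from contextlib import closing

from fixtures.zenith_fixtures import ZenithEnv


def test_my_feature(zenith_simple_env: ZenithEnv):
    env = zenith_simple_env

    # Branch off 'empty' instead of writing to the shared 'main' branch.
    env.zenith_cli(["branch", "test_my_feature", "empty"])
    pg = env.postgres.create_start('test_my_feature')

    # Load the test data into the private branch.
    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute('CREATE TABLE t(key int primary key, value text)')
            cur.execute("INSERT INTO t values (1, 'payload')")
            cur.execute('SELECT count(*) FROM t')
            assert cur.fetchone() == (1, )
```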
The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env`
fixture:
Essentially, this means that each time you see a fixture named as an input parameter, the fixture function with that name will be run and its result passed to the test function as that parameter.
So this code:
```python
def test_foobar(zenith_env_builder: ZenithEnvBuilder):
# Prescribe the environment.
# We want to have 3 safekeeper nodes, and use JWT authentication in the
# connections to the page server
zenith_env_builder.num_safekeepers = 3
zenith_env_builder.set_pageserver_auth(True)
# Now create the environment. This initializes the repository, and starts
# up the page server and the safekeepers
env = zenith_env_builder.init()
# Run the test
...
def test_something(zenith_cli, pg_bin):
pass
```
For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
At the end of a test, all the nodes in the environment are automatically stopped, so you
don't need to worry about cleaning up. Logs and test data are preserved for analysis,
in a directory under `../test_output/<testname>`
Fixtures can't be imported using the normal python syntax. Instead, use this:
### Before submitting a patch
#### Obligatory checks
Install dev dependencies via `pipenv --python 3.7 install --dev` (better)
or `pipenv install --dev` (if you don't have Python 3.7 and don't need to change dependencies).
We force code formatting via yapf and type hints via mypy.
Run the following commands in the `test_runner/` directory:
```bash
pipenv run yapf -ri . # All code is reformatted
pipenv run mypy . # Ensure there are no typing errors
```python
pytest_plugins = ("fixtures.something")
```
#### Advisable actions
That will make all the fixtures in the `fixtures/something.py` file available.
Anything that's likely to be used in multiple tests should be built into a fixture.
Note that fixtures can clean up after themselves if they use the `yield` syntax.
Cleanup will happen even if the test fails (raises an unhandled exception).
Python destructors, e.g. `__del__()`, aren't recommended for cleanup.
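A minimal sketch of such a fixture (the fixture name and payload are hypothetical; only the `yield` pattern matters):
```python
import shutil
import tempfile

import pytest


@pytest.fixture
def scratch_dir():
    # Everything before the yield runs as setup.
    path = tempfile.mkdtemp()
    yield path
    # Everything after the yield runs as cleanup; pytest executes it
    # even if the test raised an unhandled exception.
    shutil.rmtree(path)


def test_uses_scratch_dir(scratch_dir):
    # The test receives whatever the fixture yielded.
    assert scratch_dir
```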
### Code quality
We force code formatting via yapf:
1. Install `yapf` and other tools (`flake8`, `mypy`) with `pipenv install --dev`.
1. Reformat all your code by running `pipenv run yapf -ri .` in the `test_runner/` directory.
Before submitting a patch, please consider:
* Writing a couple of docstrings to clarify the reasoning behind a new test.
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
* Adding more type hints to your code to avoid `Any` (see the sketch after this list), especially:
* For fixture parameters, they are not automatically deduced.
* For function arguments and return values.
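A small illustration of both points (the helper below is hypothetical; `ZenithPageserver` is one of the fixture types imported in the tests above):
```python
from fixtures.zenith_fixtures import ZenithPageserver


# Annotate fixture parameters: mypy cannot deduce their types on its own.
def test_example(pageserver: ZenithPageserver) -> None:
    assert count_rows(pageserver, 'status') >= 0


# Annotate arguments and return values instead of leaving them as Any.
def count_rows(ps: ZenithPageserver, query: str) -> int:
    return len(ps.safe_psql(query))
```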
#### Changing dependencies
You have to update `Pipfile.lock` if you have changed `Pipfile`:
```bash
pipenv --python 3.7 install --dev # Re-create venv for Python 3.7 and install recent pipenv inside
pipenv run pipenv --version # Should be at least 2021.5.29
pipenv run pipenv lock # Regenerate Pipfile.lock
```
As the minimum supported version is Python 3.7 and we use it in CI,
you have to use a Python 3.7 environment when updating `Pipfile.lock`.
Otherwise some back-compatibility packages will be missing.
It is also important to run a recent `pipenv`.
Older versions remove markers from `Pipfile.lock`.
If you don't have Python 3.7, you should install it and its headers (for `psycopg2`)
separately, e.g.:
```bash
# In Ubuntu
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.7 python3.7-dev
```
* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.

View File

@@ -2,21 +2,18 @@ from contextlib import closing
from typing import Iterator
from uuid import uuid4
import psycopg2
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin
import pytest
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.pageserver_auth_enabled = True
env = zenith_env_builder.init()
def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
ps = pageserver_auth_enabled
ps = env.pageserver
tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex)
management_token = env.auth_keys.generate_management_token()
tenant_token = ps.auth_keys.generate_tenant_token(ps.initial_tenant)
invalid_tenant_token = ps.auth_keys.generate_tenant_token(uuid4().hex)
management_token = ps.auth_keys.generate_management_token()
# this does not invoke auth check and only decodes jwt and checks it for validity
# check both tokens
@@ -24,13 +21,13 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
ps.safe_psql("status", password=management_token)
# tenant can create branches
ps.safe_psql(f"branch_create {env.initial_tenant} new1 main", password=tenant_token)
ps.safe_psql(f"branch_create {ps.initial_tenant} new1 main", password=tenant_token)
# console can create branches for tenant
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=management_token)
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=management_token)
# fail to create branch using token with different tenantid
with pytest.raises(psycopg2.DatabaseError, match='Tenant id mismatch. Permission denied'):
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=invalid_tenant_token)
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=invalid_tenant_token)
# create tenant using management token
ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token)
@@ -43,22 +40,40 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
zenith_env_builder.pageserver_auth_enabled = True
if with_wal_acceptors:
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
def test_compute_auth_to_pageserver(
zenith_cli: ZenithCli,
wa_factory,
pageserver_auth_enabled: ZenithPageserver,
repo_dir: str,
with_wal_acceptors: bool,
pg_bin: PgBin,
port_distributor: PortDistributor,
):
ps = pageserver_auth_enabled
# since we are in the process of refactoring protocols between compute, safekeeper, and page server
# use hardcoded management token in safekeeper
management_token = ps.auth_keys.generate_management_token()
branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
env.zenith_cli(["branch", branch, "main"])
zenith_cli.run(["branch", branch, "empty"])
if with_wal_acceptors:
wa_factory.start_n_new(3, management_token)
pg = env.postgres.create_start(branch)
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
assert cur.fetchone() == (5000050000, )
with Postgres(
zenith_cli=zenith_cli,
repo_dir=repo_dir,
pg_bin=pg_bin,
tenant_id=ps.initial_tenant,
port=port_distributor.get_port(),
).create_start(
branch,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
) as pg:
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
assert cur.fetchone() == (5000050000, )

View File

@@ -1,5 +1,5 @@
import subprocess
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -8,12 +8,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Create a couple of branches off the main branch, at a historical point in time.
#
def test_branch_behind(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
# Branch at the point where only 100 rows were inserted
env.zenith_cli(["branch", "test_branch_behind", "empty"])
zenith_cli.run(["branch", "test_branch_behind", "empty"])
pgmain = env.postgres.create_start('test_branch_behind')
pgmain = postgres.create_start('test_branch_behind')
log.info("postgres is running on 'test_branch_behind' branch")
main_pg_conn = pgmain.connect()
@@ -41,7 +40,7 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
log.info(f'LSN after 200100 rows: {lsn_b}')
# Branch at the point where only 100 rows were inserted
env.zenith_cli(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute('''
@@ -56,10 +55,10 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
log.info(f'LSN after 400100 rows: {lsn_c}')
# Branch at the point where only 200100 rows were inserted
env.zenith_cli(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
pg_more = env.postgres.create_start("test_branch_behind_more")
pg_hundred = postgres.create_start("test_branch_behind_hundred")
pg_more = postgres.create_start("test_branch_behind_more")
# On the 'hundred' branch, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -80,8 +79,8 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
# Check bad lsn's for branching
# branch at segment boundary
env.zenith_cli(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = env.postgres.create_start("test_branch_segment_boundary")
zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = postgres.create_start("test_branch_segment_boundary")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
@@ -90,7 +89,7 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
#
# FIXME: This works currently, but probably shouldn't be allowed
try:
env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
# FIXME: assert false, "branch with invalid LSN should have failed"
except subprocess.CalledProcessError:
log.info("Branch creation with pre-initdb LSN failed (as expected)")

View File

@@ -3,7 +3,7 @@ import os
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -12,10 +12,9 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test compute node start after clog truncation
#
def test_clog_truncate(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
# Create a branch for us
env.zenith_cli(["branch", "test_clog_truncate", "empty"])
zenith_cli.run(["branch", "test_clog_truncate", "empty"])
# set aggressive autovacuum to make sure that truncation will happen
config = [
@@ -28,7 +27,7 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv):
'autovacuum_freeze_max_age=100000'
]
pg = env.postgres.create_start('test_clog_truncate', config_lines=config)
pg = postgres.create_start('test_clog_truncate', config_lines=config)
log.info('postgres is running on test_clog_truncate branch')
# Install extension containing function needed for test
@@ -65,10 +64,10 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv):
# create new branch after clog truncation and start a compute node on it
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
env.zenith_cli(
zenith_cli.run(
["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])
pg2 = env.postgres.create_start('test_clog_truncate_new')
pg2 = postgres.create_start('test_clog_truncate_new')
log.info('postgres is running on test_clog_truncate_new branch')
# check that new node doesn't contain truncated segment

View File

@@ -1,6 +1,6 @@
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,13 +9,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test starting Postgres with custom options
#
def test_config(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
# Create a branch for us
env.zenith_cli(["branch", "test_config", "empty"])
zenith_cli.run(["branch", "test_config", "empty"])
# change config
pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
log.info('postgres is running on test_config branch')
with closing(pg.connect()) as conn:

View File

@@ -2,7 +2,7 @@ import os
import pathlib
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -11,11 +11,15 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE DATABASE when there have been relmapper changes
#
def test_createdb(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_createdb", "empty"])
def test_createdb(
zenith_cli: ZenithCli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
):
zenith_cli.run(["branch", "test_createdb", "empty"])
pg = env.postgres.create_start('test_createdb')
pg = postgres.create_start('test_createdb')
log.info("postgres is running on 'test_createdb' branch")
with closing(pg.connect()) as conn:
@@ -29,9 +33,9 @@ def test_createdb(zenith_simple_env: ZenithEnv):
lsn = cur.fetchone()[0]
# Create a branch
env.zenith_cli(["branch", "test_createdb2", "test_createdb@" + lsn])
zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
pg2 = env.postgres.create_start('test_createdb2')
pg2 = postgres.create_start('test_createdb2')
# Test that you can connect to the new database on both branches
for db in (pg, pg2):
@@ -41,11 +45,16 @@ def test_createdb(zenith_simple_env: ZenithEnv):
#
# Test DROP DATABASE
#
def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
env.zenith_cli(["branch", "test_dropdb", "empty"])
def test_dropdb(
zenith_cli: ZenithCli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
test_output_dir,
):
zenith_cli.run(["branch", "test_dropdb", "empty"])
pg = env.postgres.create_start('test_dropdb')
pg = postgres.create_start('test_dropdb')
log.info("postgres is running on 'test_dropdb' branch")
with closing(pg.connect()) as conn:
@@ -68,28 +77,26 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
lsn_after_drop = cur.fetchone()[0]
# Create two branches before and after database drop.
env.zenith_cli(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
pg_before = env.postgres.create_start('test_before_dropdb')
zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
pg_before = postgres.create_start('test_before_dropdb')
env.zenith_cli(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
pg_after = env.postgres.create_start('test_after_dropdb')
zenith_cli.run(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
pg_after = postgres.create_start('test_after_dropdb')
# Test that database exists on the branch before drop
pg_before.connect(dbname='foodb').close()
# Test that database subdir exists on the branch before drop
assert pg_before.pgdata_dir
dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
log.info(dbpath)
assert os.path.isdir(dbpath) == True
# Test that database subdir doesn't exist on the branch after drop
assert pg_after.pgdata_dir
dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
log.info(dbpath)
assert os.path.isdir(dbpath) == False
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(test_output_dir, env, pg)
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)

View File

@@ -1,6 +1,6 @@
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,11 +9,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE USER to check shared catalog restore
#
def test_createuser(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_createuser", "empty"])
def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_createuser", "empty"])
pg = env.postgres.create_start('test_createuser')
pg = postgres.create_start('test_createuser')
log.info("postgres is running on 'test_createuser' branch")
with closing(pg.connect()) as conn:
@@ -27,9 +26,9 @@ def test_createuser(zenith_simple_env: ZenithEnv):
lsn = cur.fetchone()[0]
# Create a branch
env.zenith_cli(["branch", "test_createuser2", "test_createuser@" + lsn])
zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])
pg2 = env.postgres.create_start('test_createuser2')
pg2 = postgres.create_start('test_createuser2')
# Test that you can connect to new branch as a new user
assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]

View File

@@ -1,4 +1,4 @@
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -10,11 +10,15 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# it only checks next_multixact_id field in restored pg_control,
# since we don't have functions to check multixact internals.
#
def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
def test_multixact(pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
zenith_cli,
base_dir,
test_output_dir):
# Create a branch for us
env.zenith_cli(["branch", "test_multixact", "empty"])
pg = env.postgres.create_start('test_multixact')
zenith_cli.run(["branch", "test_multixact", "empty"])
pg = postgres.create_start('test_multixact')
log.info("postgres is running on 'test_multixact' branch")
pg_conn = pg.connect()
@@ -53,8 +57,8 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
assert int(next_multixact_id) > int(next_multixact_id_old)
# Branch at this point
env.zenith_cli(["branch", "test_multixact_new", "test_multixact@" + lsn])
pg_new = env.postgres.create_start('test_multixact_new')
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
pg_new = postgres.create_start('test_multixact_new')
log.info("postgres is running on 'test_multixact_new' branch")
pg_new_conn = pg_new.connect()
@@ -67,4 +71,4 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
assert next_multixact_id_new == next_multixact_id
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(test_output_dir, env, pg_new)
check_restored_datadir_content(zenith_cli, test_output_dir, pg_new, pageserver.service_port.pg)

View File

@@ -1,6 +1,6 @@
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -16,11 +16,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# just a hint that the page hasn't been modified since that LSN, and the page
# server should return the latest page version regardless of the LSN.
#
def test_old_request_lsn(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
def test_old_request_lsn(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin):
# Create a branch for us
env.zenith_cli(["branch", "test_old_request_lsn", "empty"])
pg = env.postgres.create_start('test_old_request_lsn')
zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
pg = postgres.create_start('test_old_request_lsn')
log.info('postgres is running on test_old_request_lsn branch')
pg_conn = pg.connect()
@@ -30,7 +32,7 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
psconn = env.pageserver.connect()
psconn = pageserver.connect()
pscur = psconn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
@@ -57,7 +59,7 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
# Make a lot of updates on a single row, generating a lot of WAL. Trigger
# garbage collections so that the page server will remove old page versions.
for i in range(10):
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
for j in range(100):
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')

View File

@@ -3,28 +3,25 @@ from uuid import uuid4
import pytest
import psycopg2
import requests
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from typing import cast
from fixtures.zenith_fixtures import ZenithPageserver, ZenithPageserverHttpClient
pytest_plugins = ("fixtures.zenith_fixtures")
def test_status_psql(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
assert env.pageserver.safe_psql('status') == [
def test_status_psql(pageserver):
assert pageserver.safe_psql('status') == [
('hello world', ),
]
def test_branch_list_psql(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
# Create a branch for us
env.zenith_cli(["branch", "test_branch_list_main", "empty"])
zenith_cli.run(["branch", "test_branch_list_main", "empty"])
conn = env.pageserver.connect()
conn = pageserver.connect()
cur = conn.cursor()
cur.execute(f'branch_list {env.initial_tenant}')
cur.execute(f'branch_list {pageserver.initial_tenant}')
branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
branches = [x for x in branches if x['name'].startswith('test_branch_list')]
@@ -37,10 +34,10 @@ def test_branch_list_psql(zenith_simple_env: ZenithEnv):
assert 'ancestor_lsn' in branches[0]
# Create another branch, and start Postgres on it
env.zenith_cli(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
env.zenith_cli(['pg', 'create', 'test_branch_list_experimental'])
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
cur.execute(f'branch_list {env.initial_tenant}')
cur.execute(f'branch_list {pageserver.initial_tenant}')
new_branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
@@ -56,22 +53,19 @@ def test_branch_list_psql(zenith_simple_env: ZenithEnv):
conn.close()
def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
# don't use zenith_simple_env, because there might be other tenants there,
# left over from other tests.
env = zenith_env_builder.init()
res = env.zenith_cli(["tenant", "list"])
def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
res = zenith_cli.run(["tenant", "list"])
res.check_returncode()
tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert tenants == [env.initial_tenant]
tenants = res.stdout.splitlines()
assert tenants == [pageserver.initial_tenant]
conn = env.pageserver.connect()
conn = pageserver.connect()
cur = conn.cursor()
# check same tenant cannot be created twice
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {env.initial_tenant} already exists'):
cur.execute(f'tenant_create {env.initial_tenant}')
with pytest.raises(psycopg2.DatabaseError,
match=f'tenant {pageserver.initial_tenant} already exists'):
cur.execute(f'tenant_create {pageserver.initial_tenant}')
# create one more tenant
tenant1 = uuid4().hex
@@ -80,20 +74,20 @@ def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
cur.execute('tenant_list')
# compare tenants list
new_tenants = sorted(map(lambda t: cast(str, t['id']), json.loads(cur.fetchone()[0])))
assert sorted([env.initial_tenant, tenant1]) == new_tenants
new_tenants = sorted(json.loads(cur.fetchone()[0]))
assert sorted([pageserver.initial_tenant, tenant1]) == new_tenants
def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
client.check_status()
# check initial tenant is there
assert initial_tenant in {t['id'] for t in client.tenant_list()}
assert initial_tenant in set(client.tenant_list())
# create new tenant and check it is also there
tenant_id = uuid4()
client.tenant_create(tenant_id)
assert tenant_id.hex in {t['id'] for t in client.tenant_list()}
assert tenant_id.hex in set(client.tenant_list())
# create branch
branch_name = uuid4().hex
@@ -103,17 +97,12 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
client = env.pageserver.http_client()
check_client(client, env.initial_tenant)
def test_pageserver_http_api_client(pageserver: ZenithPageserver):
client = pageserver.http_client()
check_client(client, pageserver.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.pageserver_auth_enabled = True
env = zenith_env_builder.init()
management_token = env.auth_keys.generate_management_token()
client = env.pageserver.http_client(auth_token=management_token)
check_client(client, env.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver):
client = pageserver_auth_enabled.http_client(
auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
check_client(client, pageserver_auth_enabled.initial_tenant)

View File

@@ -4,7 +4,7 @@ import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -13,13 +13,16 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
# times, with fault_probability chance of getting a wal acceptor down or up
# along the way. 2 of 3 are always alive, so the work keeps going.
def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
# One safekeeper is enough for this test.
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init()
def test_pageserver_restart(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
wa_factory: WalAcceptorFactory):
env.zenith_cli(["branch", "test_pageserver_restart", "main"])
pg = env.postgres.create_start('test_pageserver_restart')
# One safekeeper is enough for this test.
wa_factory.start_n_new(1)
zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
pg = postgres.create_start('test_pageserver_restart', wal_acceptors=wa_factory.get_connstrs())
pg_conn = pg.connect()
cur = pg_conn.cursor()
@@ -47,8 +50,8 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
# Stop and restart pageserver. This is a more or less graceful shutdown, although
# the page server doesn't currently have a shutdown routine so there's no difference
# between stopping and crashing.
env.pageserver.stop()
env.pageserver.start()
pageserver.stop()
pageserver.start()
# Stopping the pageserver breaks the connection from the postgres backend to
# the page server, and causes the next query on the connection to fail. Start a new
@@ -62,5 +65,5 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
assert cur.fetchone() == (100000, )
# Stop the page server by force, and restart it
env.pageserver.stop()
env.pageserver.start()
pageserver.stop()
pageserver.start()

View File

@@ -1,15 +1,14 @@
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin):
env = zenith_simple_env
def test_pgbench(postgres: PostgresFactory, pg_bin, zenith_cli):
# Create a branch for us
env.zenith_cli(["branch", "test_pgbench", "empty"])
zenith_cli.run(["branch", "test_pgbench", "empty"])
pg = env.postgres.create_start('test_pgbench')
pg = postgres.create_start('test_pgbench')
log.info("postgres is running on 'test_pgbench' branch")
connstr = pg.connstr()

View File

@@ -1,5 +1,5 @@
import subprocess
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -10,11 +10,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# This is very similar to the 'test_branch_behind' test, but instead of
# creating branches, creates read-only nodes.
#
def test_readonly_node(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_readonly_node", "empty"])
def test_readonly_node(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_readonly_node", "empty"])
pgmain = env.postgres.create_start('test_readonly_node')
pgmain = postgres.create_start('test_readonly_node')
print("postgres is running on 'test_readonly_node' branch")
main_pg_conn = pgmain.connect()
@@ -53,12 +52,11 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
print('LSN after 400100 rows: ' + lsn_c)
# Create first read-only node at the point where only 100 rows were inserted
pg_hundred = env.postgres.create_start("test_readonly_node_hundred",
branch=f'test_readonly_node@{lsn_a}')
pg_hundred = postgres.create_start("test_readonly_node_hundred",
branch=f'test_readonly_node@{lsn_a}')
# And another at the point where 200100 rows were inserted
pg_more = env.postgres.create_start("test_readonly_node_more",
branch=f'test_readonly_node@{lsn_b}')
pg_more = postgres.create_start("test_readonly_node_more", branch=f'test_readonly_node@{lsn_b}')
# On the 'hundred' node, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -77,15 +75,15 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
assert main_cur.fetchone() == (400100, )
# Check creating a node at segment boundary
pg = env.postgres.create_start("test_branch_segment_boundary",
branch="test_readonly_node@0/3000000")
pg = postgres.create_start("test_branch_segment_boundary",
branch="test_readonly_node@0/3000000")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# Create node at pre-initdb lsn
try:
env.zenith_cli(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"])
assert False, "compute node startup with invalid LSN should have failed"
zenith_cli.run(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"])
assert False, "compute node startup with invalid LSN should have failed"
except Exception:
print("Node creation with pre-initdb LSN failed (as expected)")

View File

@@ -1,7 +1,7 @@
import pytest
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -11,15 +11,22 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# Test restarting and recreating a postgres instance
#
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
zenith_env_builder.pageserver_auth_enabled = True
def test_restart_compute(
zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
wa_factory,
with_wal_acceptors: bool,
):
wal_acceptor_connstrs = None
zenith_cli.run(["branch", "test_restart_compute", "empty"])
if with_wal_acceptors:
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
wa_factory.start_n_new(3)
wal_acceptor_connstrs = wa_factory.get_connstrs()
env.zenith_cli(["branch", "test_restart_compute", "main"])
pg = env.postgres.create_start('test_restart_compute')
pg = postgres.create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
log.info("postgres is running on 'test_restart_compute' branch")
with closing(pg.connect()) as conn:
@@ -32,7 +39,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor
log.info(f"res = {r}")
# Remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute')
pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -51,7 +58,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor
log.info(f"res = {r}")
# Again remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute')
pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
# That SELECT causes lots of FPIs and increases the probability of wal acceptors
# lagging behind after query completion
@@ -65,7 +72,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor
log.info(f"res = {r}")
# And again remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute')
pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
with closing(pg.connect()) as conn:
with conn.cursor() as cur:

View File

@@ -1,7 +1,6 @@
from contextlib import closing
import psycopg2.extras
import time
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -23,14 +22,13 @@ def print_gc_result(row):
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_layerfiles_gc", "empty"])
pg = env.postgres.create_start('test_layerfiles_gc')
def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_layerfiles_gc", "empty"])
pg = postgres.create_start('test_layerfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(env.pageserver.connect()) as psconn:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -59,7 +57,7 @@ def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
cur.execute("DELETE FROM foo")
log.info("Running GC before test")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
# remember the number of files
@@ -72,7 +70,7 @@ def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
# removing the old image and delta layer.
log.info("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
@@ -86,7 +84,7 @@ def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
@@ -98,7 +96,7 @@ def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
@@ -107,7 +105,7 @@ def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
# Run GC again, with no changes in the database. Should not remove anything.
log.info("Run GC again, with nothing to do")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain
@@ -120,7 +118,7 @@ def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
log.info("Drop table and run GC again")
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)

View File

@@ -2,41 +2,51 @@ from contextlib import closing
import pytest
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.zenith_fixtures import (
TenantFactory,
ZenithCli,
PostgresFactory,
)
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
if with_wal_acceptors:
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
def test_tenants_normal_work(
zenith_cli: ZenithCli,
tenant_factory: TenantFactory,
postgres: PostgresFactory,
wa_factory,
with_wal_acceptors: bool,
):
"""Tests tenants with and without wal acceptors"""
tenant_1 = env.create_tenant()
tenant_2 = env.create_tenant()
tenant_1 = tenant_factory.create()
tenant_2 = tenant_factory.create()
env.zenith_cli([
zenith_cli.run([
"branch",
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
f"--tenantid={tenant_1}"
])
env.zenith_cli([
zenith_cli.run([
"branch",
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
f"--tenantid={tenant_2}"
])
if with_wal_acceptors:
wa_factory.start_n_new(3)
pg_tenant1 = env.postgres.create_start(
pg_tenant1 = postgres.create_start(
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_1,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
)
pg_tenant2 = env.postgres.create_start(
pg_tenant2 = postgres.create_start(
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_2,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
)
for pg in [pg_tenant1, pg_tenant2]:

View File

@@ -1,20 +1,19 @@
from contextlib import closing
from uuid import UUID
import psycopg2.extras
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
def test_timeline_size(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
def test_timeline_size(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
# Branch at the point where only 100 rows were inserted
env.zenith_cli(["branch", "test_timeline_size", "empty"])
zenith_cli.run(["branch", "test_timeline_size", "empty"])
client = env.pageserver.http_client()
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
client = pageserver.http_client()
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
pgmain = env.postgres.create_start("test_timeline_size")
pgmain = postgres.create_start("test_timeline_size")
log.info("postgres is running on 'test_timeline_size' branch")
with closing(pgmain.connect()) as conn:
@@ -29,9 +28,9 @@ def test_timeline_size(zenith_simple_env: ZenithEnv):
FROM generate_series(1, 10) g
""")
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
cur.execute("TRUNCATE foo")
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]

View File

@@ -1,6 +1,6 @@
import os
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, PgBin
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,11 +9,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test branching when a transaction is in prepared state
#
def test_twophase(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_twophase", "empty"])
def test_twophase(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin: PgBin):
zenith_cli.run(["branch", "test_twophase", "empty"])
pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
log.info("postgres is running on 'test_twophase' branch")
conn = pg.connect()
@@ -58,10 +60,10 @@ def test_twophase(zenith_simple_env: ZenithEnv):
assert len(twophase_files) == 2
# Create a branch with the transaction in prepared state
env.zenith_cli(["branch", "test_twophase_prepared", "test_twophase"])
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
# Start compute on the new branch
pg2 = env.postgres.create_start(
pg2 = postgres.create_start(
'test_twophase_prepared',
config_lines=['max_prepared_transactions=5'],
)

View File

@@ -1,4 +1,4 @@
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -8,12 +8,14 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# Test that the VM bit is cleared correctly at a HEAP_DELETE and
# HEAP_UPDATE record.
#
def test_vm_bit_clear(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
def test_vm_bit_clear(pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
zenith_cli,
base_dir):
# Create a branch for us
env.zenith_cli(["branch", "test_vm_bit_clear", "empty"])
pg = env.postgres.create_start('test_vm_bit_clear')
zenith_cli.run(["branch", "test_vm_bit_clear", "empty"])
pg = postgres.create_start('test_vm_bit_clear')
log.info("postgres is running on 'test_vm_bit_clear' branch")
pg_conn = pg.connect()
@@ -36,7 +38,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv):
cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1')
# Branch at this point, to test that later
env.zenith_cli(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])
zenith_cli.run(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])
# Clear the buffer cache, to force the VM page to be re-fetched from
# the page server
@@ -64,7 +66,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv):
# a dirty VM page is evicted. If the VM bit was not correctly cleared by the
# earlier WAL record, the full-page image hides the problem. Starting a new
# server at the right point-in-time avoids that full-page image.
pg_new = env.postgres.create_start('test_vm_bit_clear_new')
pg_new = postgres.create_start('test_vm_bit_clear_new')
log.info("postgres is running on 'test_vm_bit_clear_new' branch")
pg_new_conn = pg_new.connect()

View File

@@ -7,7 +7,7 @@ import uuid
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import PgBin, ZenithEnv, ZenithEnvBuilder
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin
from fixtures.utils import lsn_to_hex, mkdir_if_needed
from fixtures.log_helper import log
@@ -16,13 +16,14 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# basic test, write something in setup with wal acceptors, ensure that commits
# succeed and data is written
def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
env.zenith_cli(["branch", "test_wal_acceptors_normal_work", "main"])
pg = env.postgres.create_start('test_wal_acceptors_normal_work')
def test_normal_work(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
wa_factory):
zenith_cli.run(["branch", "test_wal_acceptors_normal_work", "empty"])
wa_factory.start_n_new(3)
pg = postgres.create_start('test_wal_acceptors_normal_work',
wal_acceptors=wa_factory.get_connstrs())
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -36,19 +37,21 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
# Run page server and multiple acceptors, and multiple compute nodes running
# against different timelines.
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
def test_many_timelines(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
wa_factory):
n_timelines = 2
wa_factory.start_n_new(3)
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
# start postgres on each timeline
pgs = []
for branch in branches:
env.zenith_cli(["branch", branch, "main"])
pgs.append(env.postgres.create_start(branch))
zenith_cli.run(["branch", branch, "empty"])
pgs.append(postgres.create_start(branch, wal_acceptors=wa_factory.get_connstrs()))
# Do everything in different loops to have actions on different timelines
# interleaved.
@@ -69,16 +72,19 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
# times, with fault_probability chance of getting a wal acceptor down or up
# along the way. 2 of 3 are always alive, so the work keeps going.
def test_restarts(zenith_env_builder: ZenithEnvBuilder):
def test_restarts(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
wa_factory: WalAcceptorFactory):
fault_probability = 0.01
n_inserts = 1000
n_acceptors = 3
zenith_env_builder.num_safekeepers = n_acceptors
env = zenith_env_builder.init()
wa_factory.start_n_new(n_acceptors)
env.zenith_cli(["branch", "test_wal_acceptors_restarts", "main"])
pg = env.postgres.create_start('test_wal_acceptors_restarts')
zenith_cli.run(["branch", "test_wal_acceptors_restarts", "empty"])
pg = postgres.create_start('test_wal_acceptors_restarts',
wal_acceptors=wa_factory.get_connstrs())
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -92,7 +98,7 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder):
if random.random() <= fault_probability:
if failed_node is None:
failed_node = env.safekeepers[random.randrange(0, n_acceptors)]
failed_node = wa_factory.instances[random.randrange(0, n_acceptors)]
failed_node.stop()
else:
failed_node.start()
@@ -110,12 +116,12 @@ def delayed_wal_acceptor_start(wa):
# When majority of acceptors is offline, commits are expected to be frozen
def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 2
env = zenith_env_builder.init()
def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
wa_factory.start_n_new(2)
env.zenith_cli(["branch", "test_wal_acceptors_unavailability", "main"])
pg = env.postgres.create_start('test_wal_acceptors_unavailability')
zenith_cli.run(["branch", "test_wal_acceptors_unavailability", "empty"])
pg = postgres.create_start('test_wal_acceptors_unavailability',
wal_acceptors=wa_factory.get_connstrs())
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -127,9 +133,9 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
cur.execute("INSERT INTO t values (1, 'payload')")
# shutdown one of two acceptors, that is, majority
env.safekeepers[0].stop()
wa_factory.instances[0].stop()
proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[0], ))
proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[0], ))
proc.start()
start = time.time()
@@ -139,9 +145,9 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
proc.join()
# for the world's balance, do the same with second acceptor
env.safekeepers[1].stop()
wa_factory.instances[1].stop()
proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[1], ))
proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[1], ))
proc.start()
start = time.time()
@@ -180,13 +186,17 @@ def stop_value():
# do inserts while concurrently bringing subsets of acceptors up and down
def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
def test_race_conditions(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
wa_factory,
stop_value):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
wa_factory.start_n_new(3)
env.zenith_cli(["branch", "test_wal_acceptors_race_conditions", "main"])
pg = env.postgres.create_start('test_wal_acceptors_race_conditions')
zenith_cli.run(["branch", "test_wal_acceptors_race_conditions", "empty"])
pg = postgres.create_start('test_wal_acceptors_race_conditions',
wal_acceptors=wa_factory.get_connstrs())
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -195,7 +205,7 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
cur.execute('CREATE TABLE t(key int primary key, value text)')
proc = Process(target=xmas_garland, args=(env.safekeepers, stop_value))
proc = Process(target=xmas_garland, args=(wa_factory.instances, stop_value))
proc.start()
for i in range(1000):
@@ -210,8 +220,7 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
class ProposerPostgres:
"""Object for running safekeepers sync with walproposer"""
def __init__(self, env: ZenithEnv, pgdata_dir: str, pg_bin, timeline_id: str, tenant_id: str):
self.env = env
def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str):
self.pgdata_dir: str = pgdata_dir
self.pg_bin: PgBin = pg_bin
self.timeline_id: str = timeline_id
@@ -257,20 +266,16 @@ class ProposerPostgres:
# insert wal in all safekeepers and run sync on proposer
def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):
# We don't really need the full environment for this test, just the
# safekeepers would be enough.
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorFactory):
wa_factory.start_n_new(3)
timeline_id = uuid.uuid4().hex
tenant_id = uuid.uuid4().hex
# write config for proposer
pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata")
pg = ProposerPostgres(env, pgdata_dir, pg_bin, timeline_id, tenant_id)
pg.create_dir_config(env.get_safekeeper_connstrs())
pgdata_dir = os.path.join(repo_dir, "proposer_pgdata")
pg = ProposerPostgres(pgdata_dir, pg_bin, timeline_id, tenant_id)
pg.create_dir_config(wa_factory.get_connstrs())
# valid lsn, which is not in the segment start, nor in zero segment
epoch_start_lsn = 0x16B9188 # 0/16B9188
@@ -279,7 +284,7 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):
# append and commit WAL
lsn_after_append = []
for i in range(3):
res = env.safekeepers[i].append_logical_message(
res = wa_factory.instances[i].append_logical_message(
tenant_id,
timeline_id,
{
@@ -303,15 +308,13 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):
assert all(lsn_after_sync == lsn for lsn in lsn_after_append)
def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):
def test_timeline_status(zenith_cli, pageserver, postgres, wa_factory: WalAcceptorFactory):
wa_factory.start_n_new(1)
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_timeline_status", "empty"])
pg = postgres.create_start('test_timeline_status', wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_timeline_status", "main"])
pg = env.postgres.create_start('test_timeline_status')
wa = env.safekeepers[0]
wa = wa_factory.instances[0]
wa_http_cli = wa.http_client()
wa_http_cli.check_status()

View File

@@ -1,11 +1,9 @@
import asyncio
import asyncpg
import random
import time
from fixtures.zenith_fixtures import ZenithEnvBuilder, Postgres, Safekeeper
from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres
from fixtures.log_helper import getLogger
from fixtures.utils import lsn_from_hex, lsn_to_hex
from typing import List
log = getLogger('root.wal_acceptor_async')
@@ -104,43 +102,11 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou
await pg_conn.close()
async def wait_for_lsn(safekeeper: Safekeeper,
tenant_id: str,
timeline_id: str,
wait_lsn: str,
polling_interval=1,
timeout=600):
"""
Poll flush_lsn from safekeeper until it is greater than or equal to the
provided wait_lsn. To do that, timeline_status is fetched from
safekeeper every polling_interval seconds.
"""
started_at = time.time()
client = safekeeper.http_client()
flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn
log.info(
f'Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}'
)
while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn):
elapsed = time.time() - started_at
if elapsed > timeout:
raise RuntimeError(
f"timed out waiting for safekeeper at port {safekeeper.port.pg} to reach {wait_lsn}, current lsn is {flush_lsn}"
)
await asyncio.sleep(polling_interval)
flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn
log.debug(f'safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}')
# This test will run several iterations and check progress in each of them.
# On each iteration 1 acceptor is stopped, and 2 others should allow
# background workers to execute transactions. In the end, the state should remain
# consistent.
async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_workers=10):
async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_workers=10):
n_accounts = 100
init_amount = 100000
max_transfer = 100
@@ -148,9 +114,6 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w
iterations = 6
pg_conn = await pg.connect_async()
tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant")
timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline")
bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount)
# create tables and initial balances
await bank.initdb()
@@ -162,18 +125,14 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w
workers.append(asyncio.create_task(worker))
for it in range(iterations):
victim_idx = it % len(acceptors)
victim = acceptors[victim_idx]
victim = acceptors[it % len(acceptors)]
victim.stop()
flush_lsn = await pg_conn.fetchval('SELECT pg_current_wal_flush_lsn()')
flush_lsn = lsn_to_hex(flush_lsn)
log.info(f'Postgres flush_lsn {flush_lsn}')
# Wait until alive safekeepers catch up with postgres
for idx, safekeeper in enumerate(acceptors):
if idx != victim_idx:
await wait_for_lsn(safekeeper, tenant_id, timeline_id, flush_lsn)
# Wait till the previous victim recovers so it is ready for the next
# iteration, by running a writing xact.
conn = await pg.connect_async()
await conn.execute('UPDATE bank_accs SET amount = amount WHERE uid = 1', timeout=120)
await conn.close()
stats.reset()
await asyncio.sleep(period_time)
@@ -192,14 +151,18 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w
# restart acceptors one by one, while executing and validating bank transactions
def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
def test_restarts_under_load(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
wa_factory: WalAcceptorFactory):
env.zenith_cli(["branch", "test_wal_acceptors_restarts_under_load", "main"])
pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load')
wa_factory.start_n_new(3)
asyncio.run(run_restarts_under_load(pg, env.safekeepers))
zenith_cli.run(["branch", "test_wal_acceptors_restarts_under_load", "empty"])
pg = postgres.create_start('test_wal_acceptors_restarts_under_load',
wal_acceptors=wa_factory.get_connstrs())
asyncio.run(run_restarts_under_load(pg, wa_factory.instances))
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
pg.stop()

View File

@@ -1,30 +1,27 @@
import json
import uuid
from psycopg2.extensions import cursor as PgCursor
from fixtures.zenith_fixtures import ZenithEnv
from typing import cast
from fixtures.zenith_fixtures import ZenithCli, ZenithPageserver
pytest_plugins = ("fixtures.zenith_fixtures")
def helper_compare_branch_list(page_server_cur: PgCursor, env: ZenithEnv, initial_tenant: str):
def helper_compare_branch_list(page_server_cur, zenith_cli, initial_tenant: str):
"""
Compare branches list returned by CLI and directly via API.
Filters out branches created by other tests.
"""
page_server_cur.execute(f'branch_list {initial_tenant}')
branches_api = sorted(
map(lambda b: cast(str, b['name']), json.loads(page_server_cur.fetchone()[0])))
branches_api = sorted(map(lambda b: b['name'], json.loads(page_server_cur.fetchone()[0])))
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]
res = env.zenith_cli(["branch"])
res = zenith_cli.run(["branch"])
res.check_returncode()
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
res = env.zenith_cli(["branch", f"--tenantid={initial_tenant}"])
res = zenith_cli.run(["branch", f"--tenantid={initial_tenant}"])
res.check_returncode()
branches_cli_with_tenant_arg = sorted(
map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
@@ -35,26 +32,25 @@ def helper_compare_branch_list(page_server_cur: PgCursor, env: ZenithEnv, initia
assert branches_api == branches_cli == branches_cli_with_tenant_arg
def test_cli_branch_list(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
page_server_conn = env.pageserver.connect()
def test_cli_branch_list(pageserver: ZenithPageserver, zenith_cli):
page_server_conn = pageserver.connect()
page_server_cur = page_server_conn.cursor()
# Initial sanity check
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
# Create a branch for us
res = env.zenith_cli(["branch", "test_cli_branch_list_main", "empty"])
res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"])
assert res.stderr == ''
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
# Create a nested branch
res = env.zenith_cli(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
assert res.stderr == ''
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
# Check that all new branches are visible via CLI
res = env.zenith_cli(["branch"])
res = zenith_cli.run(["branch"])
assert res.stderr == ''
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
@@ -62,46 +58,44 @@ def test_cli_branch_list(zenith_simple_env: ZenithEnv):
assert 'test_cli_branch_list_nested' in branches_cli
def helper_compare_tenant_list(page_server_cur: PgCursor, env: ZenithEnv):
def helper_compare_tenant_list(page_server_cur, zenith_cli: ZenithCli):
page_server_cur.execute(f'tenant_list')
tenants_api = sorted(
map(lambda t: cast(str, t['id']), json.loads(page_server_cur.fetchone()[0])))
tenants_api = sorted(json.loads(page_server_cur.fetchone()[0]))
res = env.zenith_cli(["tenant", "list"])
res = zenith_cli.run(["tenant", "list"])
assert res.stderr == ''
tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
tenants_cli = sorted(res.stdout.splitlines())
assert tenants_api == tenants_cli
def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
page_server_conn = env.pageserver.connect()
def test_cli_tenant_list(pageserver: ZenithPageserver, zenith_cli: ZenithCli):
page_server_conn = pageserver.connect()
page_server_cur = page_server_conn.cursor()
# Initial sanity check
helper_compare_tenant_list(page_server_cur, env)
helper_compare_tenant_list(page_server_cur, zenith_cli)
# Create new tenant
tenant1 = uuid.uuid4().hex
res = env.zenith_cli(["tenant", "create", tenant1])
res = zenith_cli.run(["tenant", "create", tenant1])
res.check_returncode()
# check tenant1 appeared
helper_compare_tenant_list(page_server_cur, env)
helper_compare_tenant_list(page_server_cur, zenith_cli)
# Create new tenant
tenant2 = uuid.uuid4().hex
res = env.zenith_cli(["tenant", "create", tenant2])
res = zenith_cli.run(["tenant", "create", tenant2])
res.check_returncode()
# check tenant2 appeared
helper_compare_tenant_list(page_server_cur, env)
helper_compare_tenant_list(page_server_cur, zenith_cli)
res = env.zenith_cli(["tenant", "list"])
res = zenith_cli.run(["tenant", "list"])
res.check_returncode()
tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
tenants = sorted(res.stdout.splitlines())
assert env.initial_tenant in tenants
assert pageserver.initial_tenant in tenants
assert tenant1 in tenants
assert tenant2 in tenants

View File

@@ -1,20 +1,26 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
pytest_plugins = ("fixtures.zenith_fixtures")
def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys):
env = zenith_simple_env
def test_isolation(pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
zenith_cli,
test_output_dir,
pg_distrib_dir,
base_dir,
capsys):
# Create a branch for us
env.zenith_cli(["branch", "test_isolation", "empty"])
zenith_cli.run(["branch", "test_isolation", "empty"])
# Connect to postgres and create a database called "regression".
# isolation tests use prepared transactions, so enable them
pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
pg = postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
pg.safe_psql('CREATE DATABASE isolation_regression')
# Create some local directories for pg_isolation_regress to run in.
@@ -38,7 +44,7 @@ def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys
'--schedule={}'.format(schedule),
]
env_vars = {
env = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -48,4 +54,4 @@ def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)
pg_bin.run(pg_isolation_regress_command, env=env, cwd=runpath)

View File

@@ -1,19 +1,25 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys):
env = zenith_simple_env
def test_pg_regress(pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
zenith_cli,
test_output_dir,
pg_distrib_dir,
base_dir,
capsys):
# Create a branch for us
env.zenith_cli(["branch", "test_pg_regress", "empty"])
zenith_cli.run(["branch", "test_pg_regress", "empty"])
# Connect to postgres and create a database called "regression".
pg = env.postgres.create_start('test_pg_regress')
pg = postgres.create_start('test_pg_regress')
pg.safe_psql('CREATE DATABASE regression')
# Create some local directories for pg_regress to run in.
@@ -38,7 +44,7 @@ def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin,
'--inputdir={}'.format(src_path),
]
env_vars = {
env = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -48,11 +54,11 @@ def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin,
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(test_output_dir, env, pg)
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)

View File

@@ -1,23 +1,26 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import (ZenithEnv,
check_restored_datadir_content,
base_dir,
pg_distrib_dir)
from fixtures.zenith_fixtures import PageserverPort, PostgresFactory, check_restored_datadir_content
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys):
env = zenith_simple_env
def test_zenith_regress(postgres: PostgresFactory,
pg_bin,
zenith_cli,
test_output_dir,
pg_distrib_dir,
base_dir,
capsys,
pageserver_port: PageserverPort):
# Create a branch for us
env.zenith_cli(["branch", "test_zenith_regress", "empty"])
zenith_cli.run(["branch", "test_zenith_regress", "empty"])
# Connect to postgres and create a database called "regression".
pg = env.postgres.create_start('test_zenith_regress')
pg = postgres.create_start('test_zenith_regress')
pg.safe_psql('CREATE DATABASE regression')
# Create some local directories for pg_regress to run in.
@@ -43,7 +46,7 @@ def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, c
]
log.info(pg_regress_command)
env_vars = {
env = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -53,11 +56,11 @@ def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, c
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(test_output_dir, env, pg)
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver_port.pg)

View File

@@ -31,11 +31,11 @@ To use, declare the 'zenbenchmark' fixture in the test function. Run the
benchmark, and then record the result by calling zenbenchmark.record. For example:
import timeit
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
def test_mybench(zenith_simple_env: env, zenbenchmark):
def test_mybench(postgres: PostgresFactory, pageserver: ZenithPageserver, zenbenchmark):
# Initialize the test
...
@@ -54,10 +54,16 @@ in the test initialization, or measure disk usage after the test query.
"""
# All the results are collected in this list, as a tuple:
# (test_name: str, metric_name: str, metric_value: float, unit: str)
#
# TODO: It would perhaps be better to store the results as additional
# properties in the pytest TestReport objects, to make them visible to
# other pytest tools.
global zenbenchmark_results
zenbenchmark_results = []
class ZenithBenchmarkResults:
""" An object for recording benchmark results. """
def __init__(self):
@@ -71,10 +77,6 @@ class ZenithBenchmarkResults:
self.results.append((test_name, metric_name, metric_value, unit))
# Will be recreated in each session.
zenbenchmark_results: ZenithBenchmarkResults = ZenithBenchmarkResults()
# Session scope fixture that initializes the results object
@pytest.fixture(autouse=True, scope='session')
def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
@@ -135,7 +137,6 @@ class ZenithBenchmarker:
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$',
all_metrics,
re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))
def get_peak_mem(self, pageserver) -> int:
@@ -146,7 +147,6 @@ class ZenithBenchmarker:
all_metrics = pageserver.http_client().get_metrics()
# See comment in get_io_writes()
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))
def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):

View File

@@ -63,9 +63,3 @@ def global_counter() -> int:
def lsn_to_hex(num: int) -> str:
""" Convert lsn from int to standard hex notation. """
return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
def lsn_from_hex(lsn_hex: str) -> int:
""" Convert lsn from hex notation to int. """
l, r = lsn_hex.split('/')
return (int(l, 16) << 32) + int(r, 16)
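
For context, these two helpers convert between a 64-bit integer LSN and PostgreSQL's high/low hex notation, where the high 32 bits go before the slash. A quick round-trip check, using the same 0/16B9188 value that appears in the sync-safekeepers test earlier in this diff (plain Python, matching the definitions above):

    assert lsn_to_hex(0x16B9188) == '0/16B9188'
    assert lsn_from_hex('0/16B9188') == 0x16B9188
    # values past 4 GiB populate the part before the slash
    assert lsn_to_hex((1 << 32) + 0x16B9188) == '1/16B9188'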

File diff suppressed because it is too large

View File

@@ -1,6 +1,6 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
@@ -16,17 +16,21 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
# 3. Disk space used
# 4. Peak memory usage
#
def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
def test_bulk_insert(postgres: PostgresFactory,
pageserver: ZenithPageserver,
pg_bin,
zenith_cli,
zenbenchmark,
repo_dir: str):
# Create a branch for us
env.zenith_cli(["branch", "test_bulk_insert", "empty"])
zenith_cli.run(["branch", "test_bulk_insert", "empty"])
pg = env.postgres.create_start('test_bulk_insert')
pg = postgres.create_start('test_bulk_insert')
log.info("postgres is running on 'test_bulk_insert' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = env.pageserver.connect()
psconn = pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -38,19 +42,19 @@ def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark):
cur.execute("create table huge (i int, j int);")
# Run INSERT, recording the time and I/O it takes
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('insert'):
cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline_size = zenbenchmark.get_timeline_size(repo_dir,
pageserver.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -1,7 +1,11 @@
import timeit
import pytest
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.zenith_fixtures import (
TenantFactory,
ZenithCli,
PostgresFactory,
)
pytest_plugins = ("fixtures.benchmark_fixture")
@@ -16,37 +20,37 @@ pytest_plugins = ("fixtures.benchmark_fixture")
@pytest.mark.parametrize('tenants_count', [1, 5, 10])
@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa'])
def test_bulk_tenant_create(
zenith_env_builder: ZenithEnvBuilder,
zenith_cli: ZenithCli,
tenant_factory: TenantFactory,
postgres: PostgresFactory,
wa_factory,
use_wal_acceptors: str,
tenants_count: int,
zenbenchmark,
):
"""Measure tenant creation time (with and without wal acceptors)"""
if use_wal_acceptors == 'with_wa':
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
time_slices = []
for i in range(tenants_count):
start = timeit.default_timer()
tenant = env.create_tenant()
env.zenith_cli([
tenant = tenant_factory.create()
zenith_cli.run([
"branch",
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
"main",
f"--tenantid={tenant}"
])
# FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now?
#if use_wal_acceptors == 'with_wa':
# wa_factory.start_n_new(3)
if use_wal_acceptors == 'with_wa':
wa_factory.start_n_new(3)
pg_tenant = env.postgres.create_start(
pg_tenant = postgres.create_start(
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
None, # branch name, None means same as node name
tenant,
wal_acceptors=wa_factory.get_connstrs() if use_wal_acceptors == 'with_wa' else None,
)
end = timeit.default_timer()

View File

@@ -1,6 +1,6 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
@@ -11,17 +11,21 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
# As of this writing, we duplicate those giant WAL records for each page,
# which makes the delta layer about 32x larger than it needs to be.
#
def test_gist_buffering_build(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
def test_gist_buffering_build(postgres: PostgresFactory,
pageserver: ZenithPageserver,
pg_bin,
zenith_cli,
zenbenchmark,
repo_dir: str):
# Create a branch for us
env.zenith_cli(["branch", "test_gist_buffering_build", "empty"])
zenith_cli.run(["branch", "test_gist_buffering_build", "empty"])
pg = env.postgres.create_start('test_gist_buffering_build')
pg = postgres.create_start('test_gist_buffering_build')
log.info("postgres is running on 'test_gist_buffering_build' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = env.pageserver.connect()
psconn = pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -37,7 +41,7 @@ def test_gist_buffering_build(zenith_simple_env: ZenithEnv, zenbenchmark):
)
# Build the index.
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('build'):
cur.execute(
"create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)"
@@ -45,13 +49,13 @@ def test_gist_buffering_build(zenith_simple_env: ZenithEnv, zenbenchmark):
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 1000000")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 1000000")
# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline_size = zenbenchmark.get_timeline_size(repo_dir,
pageserver.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -1,6 +1,6 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
@@ -15,17 +15,21 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
# 2. Time to run 5000 pgbench transactions
# 3. Disk space used
#
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark):
env = zenith_simple_env
def test_pgbench(postgres: PostgresFactory,
pageserver: ZenithPageserver,
pg_bin,
zenith_cli,
zenbenchmark,
repo_dir: str):
# Create a branch for us
env.zenith_cli(["branch", "test_pgbench_perf", "empty"])
zenith_cli.run(["branch", "test_pgbench_perf", "empty"])
pg = env.postgres.create_start('test_pgbench_perf')
pg = postgres.create_start('test_pgbench_perf')
log.info("postgres is running on 'test_pgbench_perf' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = env.pageserver.connect()
psconn = pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -37,13 +41,13 @@ def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark):
connstr = pg.connstr()
# Initialize pgbench database, recording the time and I/O it takes
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('init'):
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
# Run pgbench for 5000 transactions
with zenbenchmark.record_duration('5000_xacts'):
@@ -51,8 +55,8 @@ def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark):
# Flush the layers to disk again. This is *not* included in the reported time,
# though.
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline)
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -12,23 +12,27 @@
# Amplification problem at its finest.
import os
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
def test_write_amplification(postgres: PostgresFactory,
pageserver: ZenithPageserver,
pg_bin,
zenith_cli,
zenbenchmark,
repo_dir: str):
# Create a branch for us
env.zenith_cli(["branch", "test_write_amplification", "empty"])
zenith_cli.run(["branch", "test_write_amplification", "empty"])
pg = env.postgres.create_start('test_write_amplification')
pg = postgres.create_start('test_write_amplification')
log.info("postgres is running on 'test_write_amplification' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = env.pageserver.connect()
psconn = pageserver.connect()
pscur = psconn.cursor()
with closing(pg.connect()) as conn:
@@ -37,7 +41,7 @@ def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark):
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('run'):
# NOTE: Because each iteration updates every table already created,
@@ -70,10 +74,10 @@ def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark):
# slower, adding some delays in this loop. But forcing
# the checkpointing and GC makes the test go faster,
# with the same total I/O effect.
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline_size = zenbenchmark.get_timeline_size(repo_dir,
pageserver.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -2,4 +2,3 @@
minversion = 6.0
log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
log_date_format = %Y-%m-%d %H:%M:%S
log_cli = true
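
For reference, with the log_format and log_date_format configured above, a captured line renders along these lines (timestamp and source location are illustrative):

    2021-10-22 19:50:59.123 INFO [zenith_fixtures.py:100] postgres is running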

View File

@@ -22,11 +22,7 @@ disallow_untyped_decorators = false
disallow_untyped_defs = false
strict = true
[mypy-asyncpg.*]
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
ignore_missing_imports = true
[mypy-cached_property.*]
[mypy-psycopg2.*]
ignore_missing_imports = true
[mypy-pytest.*]

View File

@@ -1,7 +1,6 @@
import pytest
import os
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -20,13 +19,11 @@ run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None,
@run_broken
def test_broken(zenith_simple_env: ZenithEnv, pg_bin):
env = zenith_simple_env
def test_broken(zenith_cli, pageserver, postgres, pg_bin):
# Create a branch for us
env.zenith_cli(["branch", "test_broken", "empty"])
zenith_cli.run(["branch", "test_broken", "empty"])
env.postgres.create_start("test_broken")
postgres.create_start("test_broken")
log.info('postgres is running')
log.info('THIS NEXT COMMAND WILL FAIL:')

View File

@@ -17,7 +17,7 @@ use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
use walkeeper::http;
use walkeeper::s3_offload;
use walkeeper::wal_service;
use walkeeper::SafeKeeperConf;
use walkeeper::WalAcceptorConf;
fn main() -> Result<()> {
zenith_metrics::set_common_metrics_prefix("safekeeper");
@@ -54,7 +54,7 @@ fn main() -> Result<()> {
Arg::with_name("ttl")
.long("ttl")
.takes_value(true)
.help("interval for keeping WAL at safekeeper node, after which them will be uploaded to S3 and removed locally"),
.help("interval for keeping WAL as walkeeper node, after which them will be uploaded to S3 and removed locally"),
)
.arg(
Arg::with_name("recall")
@@ -78,11 +78,8 @@ fn main() -> Result<()> {
)
.get_matches();
let mut conf = SafeKeeperConf {
// Always set to './'. We will chdir into the directory specified on the
// command line, so that when the server is running, all paths are relative
// to that.
workdir: PathBuf::from("./"),
let mut conf = WalAcceptorConf {
data_dir: PathBuf::from("./"),
daemonize: false,
no_sync: false,
pageserver_addr: None,
@@ -94,8 +91,10 @@ fn main() -> Result<()> {
};
if let Some(dir) = arg_matches.value_of("datadir") {
conf.data_dir = PathBuf::from(dir);
// change into the data directory.
std::env::set_current_dir(PathBuf::from(dir))?;
std::env::set_current_dir(&conf.data_dir)?;
}
if arg_matches.is_present("no-sync") {
@@ -126,11 +125,12 @@ fn main() -> Result<()> {
conf.recall_period = Some(humantime::parse_duration(recall)?);
}
start_safekeeper(conf)
start_wal_acceptor(conf)
}
fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
let log_filename = conf.data_dir.join("safekeeper.log");
let log_file = logging::init(log_filename, conf.daemonize)?;
let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);

View File

@@ -10,7 +10,7 @@ use zenith_utils::lsn::Lsn;
use crate::safekeeper::AcceptorState;
use crate::timeline::CreateControlFile;
use crate::timeline::GlobalTimelines;
use crate::SafeKeeperConf;
use crate::WalAcceptorConf;
use zenith_utils::http::endpoint;
use zenith_utils::http::error::ApiError;
use zenith_utils::http::json::json_response;
@@ -22,9 +22,9 @@ async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
Ok(json_response(StatusCode::OK, "")?)
}
fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
fn get_conf(request: &Request<Body>) -> &WalAcceptorConf {
request
.data::<Arc<SafeKeeperConf>>()
.data::<Arc<WalAcceptorConf>>()
.expect("unknown state type")
.as_ref()
}
@@ -49,8 +49,6 @@ struct TimelineStatus {
commit_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
truncate_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
flush_lsn: Lsn,
}
/// Report info about timeline.
@@ -66,7 +64,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
)
.map_err(ApiError::from_err)?;
let sk_state = tli.get_info();
let (flush_lsn, _) = tli.get_end_of_wal();
let status = TimelineStatus {
tenant_id,
@@ -74,13 +71,12 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
acceptor_state: sk_state.acceptor_state,
commit_lsn: sk_state.commit_lsn,
truncate_lsn: sk_state.truncate_lsn,
flush_lsn,
};
Ok(json_response(StatusCode::OK, status)?)
}
/// Safekeeper http router.
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
pub fn make_router(conf: WalAcceptorConf) -> RouterBuilder<hyper::Body, ApiError> {
let router = endpoint::make_router();
router
.data(Arc::new(conf))

View File

@@ -23,15 +23,8 @@ pub mod defaults {
}
#[derive(Debug, Clone)]
pub struct SafeKeeperConf {
// Repository directory, relative to current working directory.
// Normally, the safekeeper changes the current working directory
// to the repository, and 'workdir' is always '.'. But we don't do
// that during unit testing, because the current directory is global
// to the process but different unit tests work on different
// data directories to avoid clashing with each other.
pub workdir: PathBuf,
pub struct WalAcceptorConf {
pub data_dir: PathBuf,
pub daemonize: bool,
pub no_sync: bool,
pub listen_pg_addr: String,

View File

@@ -16,7 +16,7 @@ use crate::safekeeper::ProposerAcceptorMessage;
use crate::send_wal::SendWalHandler;
use crate::timeline::TimelineTools;
use crate::SafeKeeperConf;
use crate::WalAcceptorConf;
use zenith_utils::connstring::connection_host_port;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{BeMessage, FeMessage};
@@ -33,7 +33,7 @@ pub struct ReceiveWalConn<'pg> {
/// Periodically request the pageserver to call back.
/// If the pageserver already has a replication channel, it will just ignore this request.
///
fn request_callback(conf: SafeKeeperConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
let ps_addr = conf.pageserver_addr.unwrap();
let ps_connstr = format!(
"postgresql://no_user:{}@{}/no_db",

View File

@@ -2,7 +2,7 @@
//! with the "START_REPLICATION" message.
use crate::send_wal::SendWalHandler;
use crate::timeline::{ReplicaState, Timeline, TimelineTools};
use crate::timeline::{Timeline, TimelineTools};
use anyhow::{anyhow, Context, Result};
use bytes::Bytes;
use log::*;
@@ -20,7 +20,7 @@ use std::{str, thread};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody};
use zenith_utils::pq_proto::{BeMessage, FeMessage, XLogDataBody};
use zenith_utils::sock_split::ReadStream;
pub const END_REPLICATION_MARKER: Lsn = Lsn::MAX;
@@ -32,7 +32,7 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
type FullTransactionId = u64;
/// Hot standby feedback received from replica
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct HotStandbyFeedback {
pub ts: TimestampTz,
pub xmin: FullTransactionId,
@@ -49,16 +49,6 @@ impl HotStandbyFeedback {
}
}
/// Standby status update
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StandbyReply {
pub write_lsn: Lsn, // disk consistent LSN
pub flush_lsn: Lsn, // LSN committed by quorum
pub apply_lsn: Lsn, // not used
pub reply_ts: TimestampTz,
pub reply_requested: bool,
}
/// A network connection that's speaking the replication protocol.
pub struct ReplicationConn {
/// This is an `Option` because we will spawn a background thread that will
@@ -66,15 +56,16 @@ pub struct ReplicationConn {
stream_in: Option<ReadStream>,
}
/// Scope guard to unregister replication connection from timeline
struct ReplicationConnGuard {
replica: usize, // replica internal ID assigned by timeline
timeline: Arc<Timeline>,
// TODO: move this to crate::timeline when there's more users
// TODO: design a proper Timeline mock api
trait HsFeedbackSubscriber {
fn add_hs_feedback(&self, _feedback: HotStandbyFeedback) {}
}
impl Drop for ReplicationConnGuard {
fn drop(&mut self) {
self.timeline.update_replica_state(self.replica, None);
impl HsFeedbackSubscriber for Arc<Timeline> {
#[inline(always)]
fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
Timeline::add_hs_feedback(self, feedback);
}
}
@@ -88,33 +79,26 @@ impl ReplicationConn {
/// Handle incoming messages from the network.
/// This is spawned into the background by `handle_start_replication`.
fn background_thread(mut stream_in: impl Read, timeline: Arc<Timeline>) -> Result<()> {
let mut state = ReplicaState::new();
let replica = timeline.add_replica(state);
let _guard = ReplicationConnGuard {
replica,
timeline: timeline.clone(),
};
fn background_thread(
mut stream_in: impl Read,
subscriber: impl HsFeedbackSubscriber,
) -> Result<()> {
// Wait for replica's feedback.
while let Some(msg) = FeMessage::read(&mut stream_in)? {
match &msg {
FeMessage::CopyData(m) => {
// There are two possible data messages that the client is supposed to send here:
// `HotStandbyFeedback` and `StandbyStatusUpdate`.
// `HotStandbyFeedback` and `StandbyStatusUpdate`. We only handle hot standby
// feedback.
match m.first().cloned() {
Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
// Note: deserializing is on m[1..] because we skip the tag byte.
state.hs_feedback = HotStandbyFeedback::des(&m[1..])
let feedback = HotStandbyFeedback::des(&m[1..])
.context("failed to deserialize HotStandbyFeedback")?;
timeline.update_replica_state(replica, Some(state));
}
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
let reply = StandbyReply::des(&m[1..])
.context("failed to deserialize StandbyReply")?;
state.disk_consistent_lsn = reply.write_lsn;
timeline.update_replica_state(replica, Some(state));
subscriber.add_hs_feedback(feedback);
}
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => (),
_ => warn!("unexpected message {:?}", msg),
}
}
@@ -203,7 +187,7 @@ impl ReplicationConn {
// switch to copy
pgb.write_message(&BeMessage::CopyBothResponse)?;
let mut end_pos = Lsn(0);
let mut end_pos: Lsn;
let mut wal_file: Option<File> = None;
loop {
@@ -218,18 +202,7 @@ impl ReplicationConn {
} else {
/* normal mode */
let timeline = swh.timeline.get();
if let Some(lsn) = timeline.wait_for_lsn(start_pos) {
end_pos = lsn
} else {
// timeout expired: request pageserver status
pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
sent_ptr: end_pos.0,
timestamp: get_current_timestamp(),
request_reply: true,
}))
.context("Failed to send KeepAlive message")?;
continue;
}
end_pos = timeline.wait_for_lsn(start_pos);
}
if end_pos == END_REPLICATION_MARKER {
break;
@@ -243,7 +216,7 @@ impl ReplicationConn {
let segno = start_pos.segment_number(wal_seg_size);
let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
let timeline_id = swh.timeline.get().timelineid.to_string();
let wal_file_path = swh.conf.workdir.join(timeline_id).join(wal_file_name);
let wal_file_path = swh.conf.data_dir.join(timeline_id).join(wal_file_name);
Self::open_wal_file(&wal_file_path)?
}
};
@@ -284,3 +257,18 @@ impl ReplicationConn {
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
// A no-op impl for tests
impl HsFeedbackSubscriber for () {}
#[test]
fn test_replication_conn_background_thread_eof() {
// Test that background_thread recognizes EOF
let stream: &[u8] = &[];
ReplicationConn::background_thread(stream, ()).unwrap();
}
}
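
For orientation, the CopyData payloads handled in background_thread above are tag-byte framed: the first byte selects the message type and the rest is the serialized struct, which is why deserialization runs on m[1..]. A rough Python sketch of that framing — the 'h' tag value and the exact LeSer layout (three consecutive little-endian 64-bit fields) are assumptions, since neither is spelled out in this diff:

    import struct

    HOT_STANDBY_FEEDBACK_TAG = ord('h')   # assumed value; the constant is not shown here
    STANDBY_STATUS_UPDATE_TAG = ord('r')  # matches STANDBY_STATUS_UPDATE_TAG_BYTE above

    def parse_copy_data(m: bytes):
        tag, body = m[0], m[1:]  # skip the tag byte, as the Rust code does with m[1..]
        if tag == HOT_STANDBY_FEEDBACK_TAG:
            # assumed layout: ts (i64), xmin (u64), catalog_xmin (u64), little-endian
            ts, xmin, catalog_xmin = struct.unpack('<qQQ', body[:24])
            return ('hs_feedback', ts, xmin, catalog_xmin)
        if tag == STANDBY_STATUS_UPDATE_TAG:
            return ('standby_status_update', body)
        return ('unknown', tag)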

View File

@@ -18,9 +18,9 @@ use tokio::runtime;
use tokio::time::sleep;
use walkdir::WalkDir;
use crate::SafeKeeperConf;
use crate::WalAcceptorConf;
pub fn thread_main(conf: SafeKeeperConf) {
pub fn thread_main(conf: WalAcceptorConf) {
// Create a new thread pool
//
// FIXME: keep it single-threaded for now, make it easier to debug with gdb,
@@ -42,7 +42,7 @@ async fn offload_files(
bucket: &Bucket,
listing: &HashSet<String>,
dir_path: &Path,
conf: &SafeKeeperConf,
conf: &WalAcceptorConf,
) -> Result<u64> {
let horizon = SystemTime::now() - conf.ttl.unwrap();
let mut n: u64 = 0;
@@ -54,7 +54,7 @@ async fn offload_files(
&& IsXLogFileName(entry.file_name().to_str().unwrap())
&& entry.metadata().unwrap().created().unwrap() <= horizon
{
let relpath = path.strip_prefix(&conf.workdir).unwrap();
let relpath = path.strip_prefix(&conf.data_dir).unwrap();
let s3path = String::from("walarchive/") + relpath.to_str().unwrap();
if !listing.contains(&s3path) {
let mut file = File::open(&path)?;
@@ -70,7 +70,7 @@ async fn offload_files(
Ok(n)
}
async fn main_loop(conf: &SafeKeeperConf) -> Result<()> {
async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
let region = Region::Custom {
region: env::var("S3_REGION").unwrap(),
endpoint: env::var("S3_ENDPOINT").unwrap(),
@@ -97,7 +97,7 @@ async fn main_loop(conf: &SafeKeeperConf) -> Result<()> {
.flat_map(|b| b.contents.iter().map(|o| o.key.clone()))
.collect();
let n = offload_files(&bucket, &listing, &conf.workdir, conf).await?;
let n = offload_files(&bucket, &listing, &conf.data_dir, conf).await?;
info!("Offload {} files to S3", n);
sleep(conf.ttl.unwrap()).await;
}

View File

@@ -19,10 +19,7 @@ use lazy_static::lazy_static;
use crate::replication::HotStandbyFeedback;
use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
use zenith_metrics::{
register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec,
DISK_WRITE_SECONDS_BUCKETS,
};
use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec};
use zenith_utils::bin_ser::LeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::pq_proto::SystemId;
@@ -194,8 +191,6 @@ pub struct AppendResponse {
// We report back how much WAL we know to be committed, as this is
// a criterion for walproposer --sync mode exit
pub commit_lsn: Lsn,
// Min disk consistent lsn of pageservers (portion of WAL applied and written to the disk by pageservers)
pub disk_consistent_lsn: Lsn,
pub hs_feedback: HotStandbyFeedback,
}
@@ -302,27 +297,11 @@ lazy_static! {
&["ztli"]
)
.expect("Failed to register safekeeper_commit_lsn gauge vec");
static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!(
"safekeeper_write_wal_bytes",
"Bytes written to WAL in a single request, grouped by timeline",
&["timeline_id"],
vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0]
)
.expect("Failed to register safekeeper_write_wal_bytes histogram vec");
static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_write_wal_seconds",
"Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline",
&["timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_write_wal_seconds histogram vec");
}
struct SafeKeeperMetrics {
flush_lsn: Gauge,
commit_lsn: Gauge,
write_wal_bytes: Histogram,
write_wal_seconds: Histogram,
}
impl SafeKeeperMetrics {
@@ -331,8 +310,6 @@ impl SafeKeeperMetrics {
SafeKeeperMetrics {
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&ztli_str]),
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&ztli_str]),
write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&ztli_str]),
write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&ztli_str]),
}
}
@@ -340,8 +317,6 @@ impl SafeKeeperMetrics {
SafeKeeperMetrics {
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&["n/a"]),
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&["n/a"]),
write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&["n/a"]),
write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&["n/a"]),
}
}
}
@@ -483,7 +458,6 @@ where
epoch: self.s.acceptor_state.epoch,
commit_lsn: Lsn(0),
flush_lsn: Lsn(0),
disk_consistent_lsn: Lsn(0),
hs_feedback: HotStandbyFeedback::empty(),
};
return Ok(AcceptorProposerMessage::AppendResponse(resp));
@@ -495,14 +469,8 @@ where
// do the job
let mut last_rec_lsn = Lsn(0);
if !msg.wal_data.is_empty() {
self.metrics
.write_wal_bytes
.observe(msg.wal_data.len() as f64);
{
let _timer = self.metrics.write_wal_seconds.start_timer();
self.storage
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
}
self.storage
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
// figure out last record's end lsn for reporting (if we got the
// whole record)
@@ -599,7 +567,6 @@ where
epoch: self.s.acceptor_state.epoch,
flush_lsn: self.flush_lsn,
commit_lsn: self.s.commit_lsn,
disk_consistent_lsn: Lsn(0),
// will be filled by caller code to avoid bothering safekeeper
hs_feedback: HotStandbyFeedback::empty(),
};

View File

@@ -1,4 +1,4 @@
//! Part of Safekeeper pretending to be Postgres, streaming xlog to
//! Part of WAL acceptor pretending to be Postgres, streaming xlog to
//! pageserver/any other consumer.
//!
@@ -6,7 +6,7 @@ use crate::json_ctrl::handle_json_ctrl;
use crate::receive_wal::ReceiveWalConn;
use crate::replication::ReplicationConn;
use crate::timeline::{Timeline, TimelineTools};
use crate::SafeKeeperConf;
use crate::WalAcceptorConf;
use anyhow::{anyhow, bail, Result};
use bytes::Bytes;
use std::str::FromStr;
@@ -20,7 +20,7 @@ use crate::timeline::CreateControlFile;
/// Handler for streaming WAL from acceptor
pub struct SendWalHandler {
pub conf: SafeKeeperConf,
pub conf: WalAcceptorConf,
/// assigned application name
pub appname: Option<String>,
pub tenantid: Option<ZTenantId>,
@@ -85,7 +85,7 @@ impl postgres_backend::Handler for SendWalHandler {
}
impl SendWalHandler {
pub fn new(conf: SafeKeeperConf) -> Self {
pub fn new(conf: WalAcceptorConf) -> Self {
SendWalHandler {
conf,
appname: None,

View File

@@ -11,10 +11,9 @@ use std::collections::HashMap;
use std::fs::{self, File, OpenOptions};
use std::io::{Seek, SeekFrom, Write};
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;
use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};
use zenith_utils::bin_ser::LeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
@@ -22,39 +21,10 @@ use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo,
Storage, SK_FORMAT_VERSION, SK_MAGIC,
};
use crate::SafeKeeperConf;
use crate::WalAcceptorConf;
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
const CONTROL_FILE_NAME: &str = "safekeeper.control";
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
/// Replica status: hot standby feedback + disk consistent lsn
#[derive(Debug, Clone, Copy)]
pub struct ReplicaState {
/// combined disk_consistent_lsn of pageservers
pub disk_consistent_lsn: Lsn,
/// combined hot standby feedback from all replicas
pub hs_feedback: HotStandbyFeedback,
}
impl Default for ReplicaState {
fn default() -> Self {
Self::new()
}
}
impl ReplicaState {
pub fn new() -> ReplicaState {
ReplicaState {
disk_consistent_lsn: Lsn(u64::MAX),
hs_feedback: HotStandbyFeedback {
ts: 0,
xmin: u64::MAX,
catalog_xmin: u64::MAX,
},
}
}
}
/// Shared state associated with database instance (tenant)
struct SharedState {
@@ -63,8 +33,8 @@ struct SharedState {
/// For receiving-sending wal cooperation
/// quorum commit LSN we've notified walsenders about
notified_commit_lsn: Lsn,
/// State of replicas
replicas: Vec<Option<ReplicaState>>,
/// combined hot standby feedback from all replicas
hs_feedback: HotStandbyFeedback,
}
// A named boolean.
@@ -74,70 +44,23 @@ pub enum CreateControlFile {
False,
}
lazy_static! {
static ref PERSIST_SYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_persist_sync_control_file_seconds",
"Seconds to persist and sync control file, grouped by timeline",
&["timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_persist_sync_control_file_seconds histogram vec");
static ref PERSIST_NOSYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_persist_nosync_control_file_seconds",
"Seconds to persist and sync control file, grouped by timeline",
&["timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_persist_nosync_control_file_seconds histogram vec");
}
impl SharedState {
/// Get the combined state of all alive replicas
pub fn get_replicas_state(&self) -> ReplicaState {
let mut acc = ReplicaState::new();
for state in self.replicas.iter().flatten() {
acc.hs_feedback.ts = max(acc.hs_feedback.ts, state.hs_feedback.ts);
acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, state.hs_feedback.xmin);
acc.hs_feedback.catalog_xmin =
min(acc.hs_feedback.catalog_xmin, state.hs_feedback.catalog_xmin);
acc.disk_consistent_lsn = Lsn::min(acc.disk_consistent_lsn, state.disk_consistent_lsn);
}
acc
}
/// Assign a new replica ID. We choose the first empty cell in the replicas vector
/// or extend the vector if there are no free items.
pub fn add_replica(&mut self, state: ReplicaState) -> usize {
if let Some(pos) = self.replicas.iter().position(|r| r.is_none()) {
self.replicas[pos] = Some(state);
return pos;
}
let pos = self.replicas.len();
self.replicas.push(Some(state));
pos
}
/// Restore SharedState from control file. Locks the control file along the
/// way to prevent running more than one instance of safekeeper on the same
/// data dir.
/// If create=false and file doesn't exist, bails out.
fn create_restore(
conf: &SafeKeeperConf,
conf: &WalAcceptorConf,
timelineid: ZTimelineId,
create: CreateControlFile,
) -> Result<Self> {
let (cf, state) = SharedState::load_control_file(conf, timelineid, create)?;
let timelineid_str = format!("{}", timelineid);
let storage = FileStorage {
control_file: cf,
conf: conf.clone(),
persist_sync_control_file_seconds: PERSIST_SYNC_CONTROL_FILE_SECONDS
.with_label_values(&[&timelineid_str]),
persist_nosync_control_file_seconds: PERSIST_NOSYNC_CONTROL_FILE_SECONDS
.with_label_values(&[&timelineid_str]),
};
let (flush_lsn, tli) = if state.server.wal_seg_size != 0 {
let wal_dir = conf.workdir.join(format!("{}", timelineid));
let wal_dir = conf.data_dir.join(format!("{}", timelineid));
find_end_of_wal(
&wal_dir,
state.server.wal_seg_size as usize,
@@ -151,19 +74,30 @@ impl SharedState {
Ok(Self {
notified_commit_lsn: Lsn(0),
sk: SafeKeeper::new(Lsn(flush_lsn), tli, storage, state),
replicas: Vec::new(),
hs_feedback: HotStandbyFeedback {
ts: 0,
xmin: u64::MAX,
catalog_xmin: u64::MAX,
},
})
}
/// Accumulate hot standby feedback from replicas
pub fn add_hs_feedback(&mut self, feedback: HotStandbyFeedback) {
self.hs_feedback.xmin = min(self.hs_feedback.xmin, feedback.xmin);
self.hs_feedback.catalog_xmin = min(self.hs_feedback.catalog_xmin, feedback.catalog_xmin);
self.hs_feedback.ts = max(self.hs_feedback.ts, feedback.ts);
}
/// Fetch and lock control file (prevent running more than one instance of safekeeper)
/// If create=false and file doesn't exist, bails out.
fn load_control_file(
conf: &SafeKeeperConf,
conf: &WalAcceptorConf,
timelineid: ZTimelineId,
create: CreateControlFile,
) -> Result<(File, SafeKeeperState)> {
let control_file_path = conf
.workdir
.data_dir
.join(timelineid.to_string())
.join(CONTROL_FILE_NAME);
info!(
@@ -244,27 +178,20 @@ impl Timeline {
}
}
/// Timed wait for an LSN to be committed.
/// Wait for an LSN to be committed.
///
/// Returns the last committed LSN, which will be at least
/// as high as the LSN waited for, or None if timeout expired.
/// as high as the LSN waited for.
///
pub fn wait_for_lsn(&self, lsn: Lsn) -> Option<Lsn> {
pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
let mut shared_state = self.mutex.lock().unwrap();
loop {
let commit_lsn = shared_state.notified_commit_lsn;
// This must be `>`, not `>=`.
if commit_lsn > lsn {
return Some(commit_lsn);
return commit_lsn;
}
let result = self
.cond
.wait_timeout(shared_state, POLL_STATE_TIMEOUT)
.unwrap();
if result.1.timed_out() {
return None;
}
shared_state = result.0
shared_state = self.cond.wait(shared_state).unwrap();
}
}
@@ -292,11 +219,9 @@ impl Timeline {
// commit_lsn if we are catching up safekeeper.
commit_lsn = shared_state.sk.commit_lsn;
// if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
// if this is AppendResponse, fill in proper hot standby feedback
if let AcceptorProposerMessage::AppendResponse(ref mut resp) = rmsg {
let state = shared_state.get_replicas_state();
resp.hs_feedback = state.hs_feedback;
resp.disk_consistent_lsn = state.disk_consistent_lsn;
resp.hs_feedback = shared_state.hs_feedback.clone();
}
}
// Ping wal sender that new data might be available.
@@ -308,14 +233,15 @@ impl Timeline {
self.mutex.lock().unwrap().sk.s.clone()
}
pub fn add_replica(&self, state: ReplicaState) -> usize {
// Accumulate hot standby feedback from replicas
pub fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
let mut shared_state = self.mutex.lock().unwrap();
shared_state.add_replica(state)
shared_state.add_hs_feedback(feedback);
}
pub fn update_replica_state(&self, id: usize, state: Option<ReplicaState>) {
let mut shared_state = self.mutex.lock().unwrap();
shared_state.replicas[id] = state;
pub fn get_hs_feedback(&self) -> HotStandbyFeedback {
let shared_state = self.mutex.lock().unwrap();
shared_state.hs_feedback.clone()
}
pub fn get_end_of_wal(&self) -> (Lsn, u32) {
@@ -328,7 +254,7 @@ impl Timeline {
pub trait TimelineTools {
fn set(
&mut self,
conf: &SafeKeeperConf,
conf: &WalAcceptorConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
create: CreateControlFile,
@@ -340,7 +266,7 @@ pub trait TimelineTools {
impl TimelineTools for Option<Arc<Timeline>> {
fn set(
&mut self,
conf: &SafeKeeperConf,
conf: &WalAcceptorConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
create: CreateControlFile,
@@ -369,7 +295,7 @@ impl GlobalTimelines {
/// Get a timeline with control file loaded from the global TIMELINES map.
/// If control file doesn't exist and create=false, bails out.
pub fn get(
conf: &SafeKeeperConf,
conf: &WalAcceptorConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
create: CreateControlFile,
@@ -398,19 +324,11 @@ impl GlobalTimelines {
#[derive(Debug)]
struct FileStorage {
control_file: File,
conf: SafeKeeperConf,
persist_sync_control_file_seconds: Histogram,
persist_nosync_control_file_seconds: Histogram,
conf: WalAcceptorConf,
}
impl Storage for FileStorage {
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()> {
let _timer = if sync {
&self.persist_sync_control_file_seconds
} else {
&self.persist_nosync_control_file_seconds
}
.start_timer();
self.control_file.seek(SeekFrom::Start(0))?;
s.ser_into(&mut self.control_file)?;
if sync {
@@ -450,12 +368,12 @@ impl Storage for FileStorage {
let wal_file_name = XLogFileName(server.tli, segno, wal_seg_size);
let wal_file_path = self
.conf
.workdir
.data_dir
.join(ztli.to_string())
.join(wal_file_name.clone());
let wal_file_partial_path = self
.conf
.workdir
.data_dir
.join(ztli.to_string())
.join(wal_file_name.clone() + ".partial");

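One side of the timeline.rs diff above keeps a vector of per-replica states and folds them into a single aggregate: hot-standby feedback timestamps combine with max, xmin and catalog_xmin with min, and disk_consistent_lsn with min, starting from the identity values (0 for the max-fold, u64::MAX for the min-folds). A minimal standalone sketch of that fold, with simplified stand-in types in place of the real Lsn and HotStandbyFeedback:

use std::cmp::{max, min};

#[derive(Clone, Copy)]
struct HotStandbyFeedback { ts: u64, xmin: u64, catalog_xmin: u64 }

#[derive(Clone, Copy)]
struct ReplicaState { disk_consistent_lsn: u64, hs_feedback: HotStandbyFeedback }

/// Fold the states of all live replicas (None marks a free slot) into one value.
fn combine(replicas: &[Option<ReplicaState>]) -> ReplicaState {
    // Identity element: 0 for the max-fold, u64::MAX for the min-folds,
    // so an empty replica set passes through unchanged.
    let mut acc = ReplicaState {
        disk_consistent_lsn: u64::MAX,
        hs_feedback: HotStandbyFeedback { ts: 0, xmin: u64::MAX, catalog_xmin: u64::MAX },
    };
    for s in replicas.iter().flatten() {
        acc.hs_feedback.ts = max(acc.hs_feedback.ts, s.hs_feedback.ts);
        acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, s.hs_feedback.xmin);
        acc.hs_feedback.catalog_xmin = min(acc.hs_feedback.catalog_xmin, s.hs_feedback.catalog_xmin);
        acc.disk_consistent_lsn = min(acc.disk_consistent_lsn, s.disk_consistent_lsn);
    }
    acc
}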
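The wait_for_lsn hunk contrasts two shapes of the same condition-variable loop: one waits with POLL_STATE_TIMEOUT and returns Option<Lsn> (None when the timeout fires first), the other blocks indefinitely and returns Lsn. A reduced sketch of both, assuming a bare u64 commit LSN in place of the real SharedState; note the strict `>` comparison the comment above insists on:

use std::sync::{Condvar, Mutex};
use std::time::Duration;

struct Waiter {
    commit_lsn: Mutex<u64>,
    cond: Condvar,
}

impl Waiter {
    /// Timed shape: None means the timeout expired before the LSN was committed.
    fn wait_for_lsn_timed(&self, lsn: u64, timeout: Duration) -> Option<u64> {
        let mut guard = self.commit_lsn.lock().unwrap();
        loop {
            if *guard > lsn {
                return Some(*guard);
            }
            let (g, res) = self.cond.wait_timeout(guard, timeout).unwrap();
            if res.timed_out() {
                return None;
            }
            guard = g;
        }
    }

    /// Unbounded shape: block until the LSN is committed, however long it takes.
    fn wait_for_lsn(&self, lsn: u64) -> u64 {
        let mut guard = self.commit_lsn.lock().unwrap();
        while *guard <= lsn {
            guard = self.cond.wait(guard).unwrap();
        }
        *guard
    }
}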
View File

@@ -8,11 +8,11 @@ use std::net::{TcpListener, TcpStream};
use std::thread;
use crate::send_wal::SendWalHandler;
use crate::SafeKeeperConf;
use crate::WalAcceptorConf;
use zenith_utils::postgres_backend::{AuthType, PostgresBackend};
/// Accept incoming TCP connections and spawn a background thread for each.
pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
pub fn thread_main(conf: WalAcceptorConf, listener: TcpListener) -> Result<()> {
loop {
match listener.accept() {
Ok((socket, peer_addr)) => {
@@ -31,7 +31,7 @@ pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
/// This is run by `thread_main` above, inside a background thread.
///
fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> {
fn handle_socket(socket: TcpStream, conf: WalAcceptorConf) -> Result<()> {
socket.set_nodelay(true)?;
let mut conn_handler = SendWalHandler::new(conf);

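The wal_service.rs accept loop above follows the common one-thread-per-connection shape. A self-contained sketch of that pattern (the handler body here is a placeholder; the real one builds a SendWalHandler and runs the postgres protocol over the socket):

use std::net::{TcpListener, TcpStream};
use std::thread;

fn serve(listener: TcpListener) -> std::io::Result<()> {
    loop {
        let (socket, peer_addr) = listener.accept()?;
        println!("accepted connection from {}", peer_addr);
        // One background thread per connection, as in thread_main above.
        thread::spawn(move || {
            if let Err(e) = handle_socket(socket) {
                eprintln!("connection error: {}", e);
            }
        });
    }
}

fn handle_socket(socket: TcpStream) -> std::io::Result<()> {
    // Disable Nagle's algorithm, matching the handler above.
    socket.set_nodelay(true)?;
    // ... per-connection protocol handling would go here ...
    Ok(())
}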
View File

@@ -15,7 +15,6 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbf
# FIXME: 'pageserver' is needed for BranchInfo. Refactor
pageserver = { path = "../pageserver" }
control_plane = { path = "../control_plane" }
walkeeper = { path = "../walkeeper" }
postgres_ffi = { path = "../postgres_ffi" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }

View File

@@ -1,54 +1,19 @@
use anyhow::{anyhow, bail};
use anyhow::anyhow;
use anyhow::{Context, Result};
use clap::{App, AppSettings, Arg, ArgMatches, SubCommand};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env;
use control_plane::local_env::LocalEnv;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage::PageServerNode;
use pageserver::defaults::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use pageserver::defaults::{DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_PORT};
use std::collections::HashMap;
use std::process::exit;
use std::str::FromStr;
use walkeeper::defaults::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
use zenith_utils::auth::{Claims, Scope};
use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use pageserver::branches::BranchInfo;
// Default name of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_NAME: &str = "single";
fn default_conf() -> String {
format!(
r#"
# Default built-in configuration, defined in main.rs
[pageserver]
pg_port = {pageserver_pg_port}
http_port = {pageserver_http_port}
auth_type = '{pageserver_auth_type}'
[[safekeepers]]
name = '{safekeeper_name}'
pg_port = {safekeeper_pg_port}
http_port = {safekeeper_http_port}
"#,
pageserver_pg_port = DEFAULT_PAGESERVER_PG_PORT,
pageserver_http_port = DEFAULT_PAGESERVER_HTTP_PORT,
pageserver_auth_type = AuthType::Trust,
safekeeper_name = DEFAULT_SAFEKEEPER_NAME,
safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT,
safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT,
)
}
///
/// Branches tree element used as a value in the HashMap.
///
@@ -67,16 +32,11 @@ struct BranchTreeEl {
// * Providing a CLI API to the pageserver
// * TODO: export/import to/from usual postgres
fn main() -> Result<()> {
let pg_node_arg = Arg::with_name("node")
let node_arg = Arg::with_name("node")
.index(1)
.help("Node name")
.required(true);
let safekeeper_node_arg = Arg::with_name("node")
.index(1)
.help("Node name")
.required(false);
let timeline_arg = Arg::with_name("timeline")
.index(2)
.help("Branch name or a point-in time specification")
@@ -93,25 +53,29 @@ fn main() -> Result<()> {
.required(false)
.value_name("port");
let stop_mode_arg = Arg::with_name("stop-mode")
.short("m")
.takes_value(true)
.possible_values(&["fast", "immediate"])
.help("If 'immediate', don't flush repository data at shutdown")
.required(false)
.value_name("stop-mode");
let matches = App::new("Zenith CLI")
.setting(AppSettings::ArgRequiredElseHelp)
.subcommand(
SubCommand::with_name("init")
.about("Initialize a new Zenith repository")
.arg(
Arg::with_name("config")
.long("config")
Arg::with_name("pageserver-pg-port")
.long("pageserver-pg-port")
.required(false)
.value_name("config"),
.value_name("pageserver-pg-port"),
)
.arg(
Arg::with_name("pageserver-http-port")
.long("pageserver-http-port")
.required(false)
.value_name("pageserver-http-port"),
)
.arg(
Arg::with_name("enable-auth")
.long("enable-auth")
.takes_value(false)
.help("Enable authentication using ZenithJWT")
),
)
.subcommand(
SubCommand::with_name("branch")
@@ -126,35 +90,15 @@ fn main() -> Result<()> {
.subcommand(SubCommand::with_name("list"))
.subcommand(SubCommand::with_name("create").arg(Arg::with_name("tenantid").required(false).index(1)))
)
.subcommand(
SubCommand::with_name("pageserver")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage pageserver")
.subcommand(SubCommand::with_name("status"))
.subcommand(SubCommand::with_name("start").about("Start local pageserver"))
.subcommand(SubCommand::with_name("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
.subcommand(SubCommand::with_name("restart").about("Restart local pageserver"))
)
.subcommand(
SubCommand::with_name("safekeeper")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage safekeepers")
.subcommand(SubCommand::with_name("start")
.about("Start local safekeeper")
.arg(safekeeper_node_arg.clone())
)
.subcommand(SubCommand::with_name("stop")
.about("Stop local safekeeper")
.arg(safekeeper_node_arg.clone())
.arg(stop_mode_arg.clone())
)
.subcommand(SubCommand::with_name("restart")
.about("Restart local safekeeper")
.arg(safekeeper_node_arg.clone())
.arg(stop_mode_arg.clone())
)
.subcommand(SubCommand::with_name("status"))
.subcommand(SubCommand::with_name("start").about("Start local pageserver"))
.subcommand(SubCommand::with_name("stop").about("Stop local pageserver")
.arg(Arg::with_name("immediate")
.help("Don't flush repository data at shutdown")
.required(false)
)
)
.subcommand(SubCommand::with_name("restart").about("Restart local pageserver"))
.subcommand(
SubCommand::with_name("pg")
.setting(AppSettings::ArgRequiredElseHelp)
@@ -162,7 +106,7 @@ fn main() -> Result<()> {
.subcommand(SubCommand::with_name("list").arg(tenantid_arg.clone()))
.subcommand(SubCommand::with_name("create")
.about("Create a postgres compute node")
.arg(pg_node_arg.clone())
.arg(node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(port_arg.clone())
@@ -174,13 +118,13 @@ fn main() -> Result<()> {
))
.subcommand(SubCommand::with_name("start")
.about("Start a postgres compute node.\n This command actually creates a new node from scratch, but preserves existing config files")
.arg(pg_node_arg.clone())
.arg(node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(port_arg.clone()))
.subcommand(
SubCommand::with_name("stop")
.arg(pg_node_arg.clone())
.arg(node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(
@@ -192,36 +136,37 @@ fn main() -> Result<()> {
)
)
.subcommand(
SubCommand::with_name("start")
.about("Start page server and safekeepers")
)
.subcommand(
SubCommand::with_name("stop")
.about("Stop page server and safekeepers")
.arg(stop_mode_arg.clone())
)
.get_matches();
// Create config file
if let ("init", Some(init_match)) = matches.subcommand() {
let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
// load and parse the file
std::fs::read_to_string(std::path::Path::new(config_path))
.with_context(|| format!("Could not read configuration file \"{}\"", config_path))?
} else {
// Built-in default config
default_conf()
let tenantid = ZTenantId::generate();
let pageserver_pg_port = match init_match.value_of("pageserver-pg-port") {
Some(v) => v.parse()?,
None => DEFAULT_PG_LISTEN_PORT,
};
let pageserver_http_port = match init_match.value_of("pageserver-http-port") {
Some(v) => v.parse()?,
None => DEFAULT_HTTP_LISTEN_PORT,
};
let mut env = LocalEnv::create_config(&toml_file)
.with_context(|| "Failed to create zenith configuration")?;
env.init()
.with_context(|| "Failed to initialize zenith repository")?;
let auth_type = if init_match.is_present("enable-auth") {
AuthType::ZenithJWT
} else {
AuthType::Trust
};
local_env::init(
pageserver_pg_port,
pageserver_http_port,
tenantid,
auth_type,
)
.with_context(|| "Failed to create config file")?;
}
// all other commands would need config
let env = match LocalEnv::load_config() {
let env = match local_env::load_config() {
Ok(conf) => conf,
Err(e) => {
eprintln!("Error loading config: {}", e);
@@ -230,12 +175,11 @@ fn main() -> Result<()> {
};
match matches.subcommand() {
("init", Some(_sub_m)) => {
// The options were handled above already
("init", Some(init_match)) => {
let pageserver = PageServerNode::from_env(&env);
if let Err(e) = pageserver.init(
// default_tenantid was generated by the `env.init()` call above
Some(&env.default_tenantid.unwrap().to_string()),
Some(&env.tenantid.to_string()),
init_match.is_present("enable-auth"),
) {
eprintln!("pageserver init failed: {}", e);
exit(1);
@@ -255,27 +199,43 @@ fn main() -> Result<()> {
}
}
("start", Some(sub_match)) => {
if let Err(e) = handle_start_all(sub_match, &env) {
eprintln!("start command failed: {}", e);
("start", Some(_sub_m)) => {
let pageserver = PageServerNode::from_env(&env);
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
("stop", Some(sub_match)) => {
if let Err(e) = handle_stop_all(sub_match, &env) {
eprintln!("stop command failed: {}", e);
("stop", Some(stop_match)) => {
let pageserver = PageServerNode::from_env(&env);
let immediate = stop_match.is_present("immediate");
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
}
("pageserver", Some(sub_match)) => {
if let Err(e) = handle_pageserver(sub_match, &env) {
eprintln!("pageserver command failed: {}", e);
("restart", Some(_sub_m)) => {
let pageserver = PageServerNode::from_env(&env);
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
("status", Some(_sub_m)) => {}
("pg", Some(pg_match)) => {
if let Err(e) = handle_pg(pg_match, &env) {
eprintln!("pg operation failed: {:?}", e);
@@ -283,13 +243,6 @@ fn main() -> Result<()> {
}
}
("safekeeper", Some(sub_match)) => {
if let Err(e) = handle_safekeeper(sub_match, &env) {
eprintln!("safekeeper command failed: {}", e);
exit(1);
}
}
_ => {}
};
@@ -427,23 +380,12 @@ fn get_branch_infos(
Ok(branch_infos)
}
// Helper function to parse --tenantid option, or get the default from config file
fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<ZTenantId> {
if let Some(tenantid_cmd) = sub_match.value_of("tenantid") {
Ok(ZTenantId::from_str(tenantid_cmd)?)
} else if let Some(tenantid_conf) = env.default_tenantid {
Ok(tenantid_conf)
} else {
bail!("No tenantid. Use --tenantid, or set 'default_tenantid' in the config file");
}
}
fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
match tenant_match.subcommand() {
("list", Some(_)) => {
for t in pageserver.tenant_list()? {
println!("{} {}", t.id, t.state);
for tenant in pageserver.tenant_list()? {
println!("{}", tenant);
}
}
("create", Some(create_match)) => {
@@ -463,18 +405,22 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
let tenantid = get_tenantid(branch_match, env)?;
if let Some(branchname) = branch_match.value_of("branchname") {
let startpoint_str = branch_match
.value_of("start-point")
.ok_or_else(|| anyhow!("Missing start-point"))?;
let tenantid: ZTenantId = branch_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let branch = pageserver.branch_create(branchname, startpoint_str, &tenantid)?;
println!(
"Created branch '{}' at {:?} for tenant: {}",
branch.name, branch.latest_valid_lsn, tenantid,
);
} else {
let tenantid: ZTenantId = branch_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
// No arguments, list branches for tenant
let branches = pageserver.branch_list(&tenantid)?;
print_branches_tree(branches)?;
@@ -488,7 +434,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match pg_match.subcommand() {
("list", Some(list_match)) => {
let tenantid = get_tenantid(list_match, env)?;
let tenantid: ZTenantId = list_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| {
eprintln!("Failed to load branch info: {}", e);
@@ -520,7 +468,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
}
}
("create", Some(create_match)) => {
let tenantid = get_tenantid(create_match, env)?;
let tenantid: ZTenantId = create_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let node_name = create_match.value_of("node").unwrap_or("main");
let timeline_name = create_match.value_of("timeline").unwrap_or(node_name);
@@ -531,7 +481,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
cplane.new_node(tenantid, node_name, timeline_name, port)?;
}
("start", Some(start_match)) => {
let tenantid = get_tenantid(start_match, env)?;
let tenantid: ZTenantId = start_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let node_name = start_match.value_of("node").unwrap_or("main");
let timeline_name = start_match.value_of("timeline");
@@ -542,10 +494,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let node = cplane.nodes.get(&(tenantid, node_name.to_owned()));
let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) {
let auth_token = if matches!(env.auth_type, AuthType::ZenithJWT) {
let claims = Claims::new(Some(tenantid), Scope::Tenant);
Some(env.generate_auth_token(&claims)?)
Some(encode_from_key_path(&claims, &env.private_key_path)?)
} else {
None
};
@@ -572,9 +523,11 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
}
}
("stop", Some(stop_match)) => {
let tenantid = get_tenantid(stop_match, env)?;
let node_name = stop_match.value_of("node").unwrap_or("main");
let destroy = stop_match.is_present("destroy");
let tenantid: ZTenantId = stop_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let node = cplane
.nodes
@@ -588,147 +541,3 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
Ok(())
}
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
match sub_match.subcommand() {
("start", Some(_sub_m)) => {
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
("stop", Some(stop_match)) => {
let immediate = stop_match.value_of("stop-mode") == Some("immediate");
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
}
("restart", Some(_sub_m)) => {
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
_ => {}
}
Ok(())
}
fn get_safekeeper(env: &local_env::LocalEnv, name: &str) -> Result<SafekeeperNode> {
if let Some(node) = env.safekeepers.iter().find(|node| node.name == name) {
Ok(SafekeeperNode::from_env(env, node))
} else {
bail!("could not find safekeeper '{}'", name)
}
}
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
("start", Some(sub_match)) => {
let node_name = sub_match
.value_of("node")
.unwrap_or(DEFAULT_SAFEKEEPER_NAME);
let safekeeper = get_safekeeper(env, node_name)?;
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
}
("stop", Some(sub_match)) => {
let node_name = sub_match
.value_of("node")
.unwrap_or(DEFAULT_SAFEKEEPER_NAME);
let immediate = sub_match.value_of("stop-mode") == Some("immediate");
let safekeeper = get_safekeeper(env, node_name)?;
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper stop failed: {}", e);
exit(1);
}
}
("restart", Some(sub_match)) => {
let node_name = sub_match
.value_of("node")
.unwrap_or(DEFAULT_SAFEKEEPER_NAME);
let safekeeper = get_safekeeper(env, node_name)?;
//TODO what shutdown strategy should we use here?
if let Err(e) = safekeeper.stop(false) {
eprintln!("safekeeper stop failed: {}", e);
exit(1);
}
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
}
_ => {}
}
Ok(())
}
fn handle_start_all(_sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
// Postgres nodes are not started automatically
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper '{}' start failed: {}", safekeeper.name, e);
exit(1);
}
}
Ok(())
}
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let immediate = sub_match.value_of("stop-mode") == Some("immediate");
let pageserver = PageServerNode::from_env(env);
// Stop all compute nodes
let cplane = ComputeControlPlane::load(env.clone())?;
for (_k, node) in cplane.nodes {
if let Err(e) = node.stop(false) {
eprintln!("postgres stop failed: {}", e);
}
}
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper '{}' stop failed: {}", safekeeper.name, e);
}
}
Ok(())
}

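The main.rs changes are easier to follow with the clap 2 builder pattern isolated: each Arg is built once, cloned into the subcommands that need it, and read back with value_of/is_present. A reduced reconstruction of just the stop-mode wiring (not the full command tree):

use clap::{App, AppSettings, Arg, SubCommand};

fn main() {
    let stop_mode_arg = Arg::with_name("stop-mode")
        .short("m")
        .takes_value(true)
        .possible_values(&["fast", "immediate"])
        .help("If 'immediate', don't flush repository data at shutdown")
        .required(false)
        .value_name("stop-mode");

    let matches = App::new("Zenith CLI")
        .setting(AppSettings::ArgRequiredElseHelp)
        .subcommand(
            SubCommand::with_name("pageserver")
                .subcommand(SubCommand::with_name("stop").arg(stop_mode_arg.clone())),
        )
        .get_matches();

    if let ("pageserver", Some(ps_match)) = matches.subcommand() {
        if let ("stop", Some(stop_match)) = ps_match.subcommand() {
            // Mirrors the handling above: --stop-mode=immediate skips the flush.
            let immediate = stop_match.value_of("stop-mode") == Some("immediate");
            println!("stopping pageserver (immediate = {})", immediate);
        }
    }
}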
View File

@@ -74,10 +74,6 @@ lazy_static! {
.expect("Failed to register maxrss_kb int gauge");
}
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];
// Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
// An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned.

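DISK_WRITE_SECONDS_BUCKETS above is consumed by the safekeeper histograms in the timeline.rs hunk, where the `let _timer = ... .start_timer()` guard records the elapsed seconds into the histogram when it is dropped at the end of persist(). A sketch of that idiom using only the names visible in this diff (the metric name here is made up for illustration):

use zenith_metrics::{register_histogram_vec, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};

lazy_static::lazy_static! {
    static ref WRITE_SECONDS: HistogramVec = register_histogram_vec!(
        "example_write_seconds",
        "Seconds spent in a timed section, grouped by timeline",
        &["timeline_id"],
        DISK_WRITE_SECONDS_BUCKETS.to_vec()
    )
    .expect("Failed to register example_write_seconds histogram vec");
}

fn timed_write(timeline_id: &str) {
    // The guard observes the elapsed time into the histogram when dropped.
    let _timer = WRITE_SECONDS.with_label_values(&[timeline_id]).start_timer();
    // ... do the write ...
} // _timer dropped here; the duration is recorded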
View File

@@ -11,6 +11,7 @@ byteorder = "1.4.3"
bytes = "1.0.1"
hyper = { version = "0.14.7", features = ["full"] }
lazy_static = "1.4.0"
log = "0.4.14"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
routerify = "2"
serde = { version = "1.0", features = ["derive"] }
@@ -18,7 +19,8 @@ serde_json = "1"
thiserror = "1.0"
tokio = "1.11"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tracing-log = "0.1"
tracing-subscriber = "0.2"
zenith_metrics = { path = "../zenith_metrics" }
workspace_hack = { path = "../workspace_hack" }

View File

@@ -104,8 +104,8 @@ impl JwtAuth {
}
pub fn from_key_path(key_path: &Path) -> Result<Self> {
let public_key = fs::read(key_path)?;
Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?))
let public_key = fs::read_to_string(key_path)?;
Ok(Self::new(DecodingKey::from_rsa_pem(public_key.as_bytes())?))
}
pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
@@ -114,7 +114,8 @@ impl JwtAuth {
}
// this function is used only for testing purposes in the CLI, e.g. to generate tokens during init
pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
let key = EncodingKey::from_rsa_pem(key_data)?;
pub fn encode_from_key_path(claims: &Claims, key_path: &Path) -> Result<String> {
let key_data = fs::read_to_string(key_path)?;
let key = EncodingKey::from_rsa_pem(key_data.as_bytes())?;
Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?)
}

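Both sides of the auth.rs change feed an RSA PEM into jsonwebtoken: fs::read yields Vec<u8> directly, while fs::read_to_string needs an extra as_bytes(), but the key material is identical. A round-trip sketch against the jsonwebtoken API, assuming RS256 as JWT_ALGORITHM (the Claims shape below is a stand-in for the real one in this module):

use jsonwebtoken::{decode, encode, Algorithm, DecodingKey, EncodingKey, Header, Validation};
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct Claims {
    sub: String,
    exp: usize, // jsonwebtoken validates `exp` by default
}

fn roundtrip(private_pem: &[u8], public_pem: &[u8]) -> jsonwebtoken::errors::Result<Claims> {
    let claims = Claims { sub: "tenant".into(), exp: 2_000_000_000 };
    // Encode with the private key, as encode_from_key_file / encode_from_key_path do.
    let token = encode(
        &Header::new(Algorithm::RS256),
        &claims,
        &EncodingKey::from_rsa_pem(private_pem)?,
    )?;
    // Decode with the public key, as JwtAuth::from_key_path + decode do.
    let data = decode::<Claims>(
        &token,
        &DecodingKey::from_rsa_pem(public_pem)?,
        &Validation::new(Algorithm::RS256),
    )?;
    Ok(data.claims)
}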
View File

@@ -9,7 +9,6 @@ use routerify::ext::RequestExt;
use routerify::RequestInfo;
use routerify::{Middleware, Router, RouterBuilder, RouterService};
use std::net::TcpListener;
use tracing::info;
use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter};
use zenith_metrics::{Encoder, TextEncoder};
@@ -33,7 +32,7 @@ lazy_static! {
}
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
log::info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
Ok(res)
}
@@ -164,7 +163,7 @@ pub fn serve_thread_main(
router_builder: RouterBuilder<hyper::Body, ApiError>,
listener: TcpListener,
) -> anyhow::Result<()> {
info!("Starting a http endpoint at {}", listener.local_addr()?);
log::info!("Starting a http endpoint at {}", listener.local_addr()?);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();

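The last two files go together: the http endpoint switches its logging calls between tracing::info! and log::info!, while the zenith_utils Cargo.toml swaps tracing-subscriber 0.3 with env-filter for tracing-log 0.1 plus tracing-subscriber 0.2. For log::info! records to reach a tracing subscriber, the log-to-tracing bridge must be installed alongside the subscriber; a minimal initialization sketch under those crate versions (the real binaries may wire this up differently):

fn init_logging() -> Result<(), Box<dyn std::error::Error>> {
    // Route `log` crate records (e.g. the log::info! calls above) into tracing.
    tracing_log::LogTracer::init()?;
    // Install a global subscriber that formats and prints the events.
    let subscriber = tracing_subscriber::fmt::Subscriber::builder().finish();
    tracing::subscriber::set_global_default(subscriber)?;
    Ok(())
}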
Some files were not shown because too many files have changed in this diff.