Merge remote-tracking branch 'origin/main' into HEAD

This commit is contained in:
Heikki Linnakangas
2022-03-14 10:26:05 +02:00
69 changed files with 4146 additions and 1788 deletions

View File

@@ -48,7 +48,7 @@ jobs:
echo Python
python3 --version
poetry run python3 --version
echo Pipenv
echo Poetry
poetry --version
echo Pgbench
$PG_BIN/pgbench --version

183
Cargo.lock generated
View File

@@ -260,6 +260,18 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
[[package]]
name = "bstr"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.9.1"
@@ -281,6 +293,15 @@ dependencies = [
"serde",
]
[[package]]
name = "cast"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a"
dependencies = [
"rustc_version",
]
[[package]]
name = "cc"
version = "1.0.72"
@@ -447,6 +468,76 @@ dependencies = [
"rustc_version",
]
[[package]]
name = "criterion"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10"
dependencies = [
"atty",
"cast",
"clap 2.34.0",
"criterion-plot",
"csv",
"itertools",
"lazy_static",
"num-traits",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_cbor",
"serde_derive",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57"
dependencies = [
"cast",
"itertools",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e"
dependencies = [
"cfg-if",
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9"
dependencies = [
"cfg-if",
"crossbeam-utils",
"lazy_static",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.7"
@@ -477,6 +568,28 @@ dependencies = [
"subtle",
]
[[package]]
name = "csv"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
dependencies = [
"bstr",
"csv-core",
"itoa 0.4.8",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "daemonize"
version = "0.4.1"
@@ -1260,6 +1373,12 @@ version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
[[package]]
name = "oorandom"
version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "opaque-debug"
version = "0.3.0"
@@ -1445,6 +1564,34 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "plotters"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c"
[[package]]
name = "plotters-svg"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9"
dependencies = [
"plotters-backend",
]
[[package]]
name = "postgres"
version = "0.19.1"
@@ -1665,6 +1812,31 @@ dependencies = [
"rand_core",
]
[[package]]
name = "rayon"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90"
dependencies = [
"autocfg",
"crossbeam-deque",
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e"
dependencies = [
"crossbeam-channel",
"crossbeam-deque",
"crossbeam-utils",
"lazy_static",
"num_cpus",
]
[[package]]
name = "rcgen"
version = "0.8.14"
@@ -2234,6 +2406,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "tinyvec"
version = "1.5.1"
@@ -2856,6 +3038,7 @@ dependencies = [
"bincode",
"byteorder",
"bytes",
"criterion",
"git-version",
"hex",
"hex-literal",

View File

@@ -57,12 +57,12 @@ pageserver init succeeded
Starting pageserver at 'localhost:64000' in '.zenith'
Pageserver started
initializing for single for 7676
Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
Safekeeper started
# start postgres compute node
> ./target/debug/zenith pg start main
Starting new postgres main on main...
Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
waiting for server to start.... done
@@ -70,8 +70,8 @@ server started
# check list of running postgres instances
> ./target/debug/zenith pg list
BRANCH ADDRESS LSN STATUS
main 127.0.0.1:55432 0/1609610 running
NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS
main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running
```
4. Now it is possible to connect to postgres and run some queries:
@@ -91,13 +91,13 @@ postgres=# select * from t;
5. And create branches and run postgres on them:
```sh
# create branch named migration_check
> ./target/debug/zenith branch migration_check main
Created branch 'migration_check' at 0/1609610
> ./target/debug/zenith timeline branch --branch-name migration_check
Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
# check branches tree
> ./target/debug/zenith branch
main
┗━ @0/1609610: migration_check
> ./target/debug/zenith timeline list
main [5b014a9e41b4b63ce1a1febc04503636]
┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
# start postgres on that branch
> ./target/debug/zenith pg start migration_check

View File

@@ -37,7 +37,7 @@ impl ComputeControlPlane {
// pgdatadirs
// |- tenants
// | |- <tenant_id>
// | | |- <branch name>
// | | |- <node name>
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
let pageserver = Arc::new(PageServerNode::from_env(&env));
@@ -52,7 +52,7 @@ impl ComputeControlPlane {
.with_context(|| format!("failed to list {}", tenant_dir.path().display()))?
{
let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?;
nodes.insert((node.tenantid, node.name.clone()), Arc::new(node));
nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node));
}
}
@@ -73,40 +73,14 @@ impl ComputeControlPlane {
.unwrap_or(self.base_port)
}
// FIXME: see also parse_point_in_time in branches.rs.
fn parse_point_in_time(
&self,
tenantid: ZTenantId,
s: &str,
) -> Result<(ZTimelineId, Option<Lsn>)> {
let mut strings = s.split('@');
let name = strings.next().unwrap();
let lsn = strings
.next()
.map(Lsn::from_str)
.transpose()
.context("invalid LSN in point-in-time specification")?;
// Resolve the timeline ID, given the human-readable branch name
let timeline_id = self
.pageserver
.branch_get_by_name(&tenantid, name)?
.timeline_id;
Ok((timeline_id, lsn))
}
pub fn new_node(
&mut self,
tenantid: ZTenantId,
tenant_id: ZTenantId,
name: &str,
timeline_spec: &str,
timeline_id: ZTimelineId,
lsn: Option<Lsn>,
port: Option<u16>,
) -> Result<Arc<PostgresNode>> {
// Resolve the human-readable timeline spec into timeline ID and LSN
let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?;
let port = port.unwrap_or_else(|| self.get_port());
let node = Arc::new(PostgresNode {
name: name.to_owned(),
@@ -114,9 +88,9 @@ impl ComputeControlPlane {
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test: false,
timelineid,
timeline_id,
lsn,
tenantid,
tenant_id,
uses_wal_proposer: false,
});
@@ -124,7 +98,7 @@ impl ComputeControlPlane {
node.setup_pg_conf(self.env.pageserver.auth_type)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
.insert((tenant_id, node.name.clone()), Arc::clone(&node));
Ok(node)
}
@@ -139,9 +113,9 @@ pub struct PostgresNode {
pub env: LocalEnv,
pageserver: Arc<PageServerNode>,
is_test: bool,
pub timelineid: ZTimelineId,
pub timeline_id: ZTimelineId,
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
pub tenantid: ZTenantId,
pub tenant_id: ZTenantId,
uses_wal_proposer: bool,
}
@@ -173,8 +147,8 @@ impl PostgresNode {
// Read a few options from the config file
let context = format!("in config file {}", cfg_path_str);
let port: u16 = conf.parse_field("port", &context)?;
let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
let uses_wal_proposer = conf.get("wal_acceptors").is_some();
// parse recovery_target_lsn, if any
@@ -188,9 +162,9 @@ impl PostgresNode {
env: env.clone(),
pageserver: Arc::clone(pageserver),
is_test: false,
timelineid,
timeline_id,
lsn: recovery_target_lsn,
tenantid,
tenant_id,
uses_wal_proposer,
})
}
@@ -241,9 +215,9 @@ impl PostgresNode {
);
let sql = if let Some(lsn) = lsn {
format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn)
format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
} else {
format!("basebackup {} {}", self.tenantid, self.timelineid)
format!("basebackup {} {}", self.tenant_id, self.timeline_id)
};
let mut client = self
@@ -329,8 +303,8 @@ impl PostgresNode {
conf.append("shared_preload_libraries", "zenith");
conf.append_line("");
conf.append("zenith.page_server_connstring", &pageserver_connstr);
conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
conf.append("zenith.zenith_tenant", &self.tenant_id.to_string());
conf.append("zenith.zenith_timeline", &self.timeline_id.to_string());
if let Some(lsn) = self.lsn {
conf.append("recovery_target_lsn", &lsn.to_string());
}
@@ -408,7 +382,7 @@ impl PostgresNode {
}
pub fn pgdata(&self) -> PathBuf {
self.env.pg_data_dir(&self.tenantid, &self.name)
self.env.pg_data_dir(&self.tenant_id, &self.name)
}
pub fn status(&self) -> &str {

View File

@@ -3,16 +3,18 @@
//! Now it also provides init method which acts like a stub for proper installation
//! script which will use local paths.
use anyhow::{bail, Context};
use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::env;
use std::fmt::Write;
use std::fs;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId};
use zenith_utils::zid::{
HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId,
};
use crate::safekeeper::SafekeeperNode;
@@ -23,7 +25,7 @@ use crate::safekeeper::SafekeeperNode;
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
#[derive(Serialize, Deserialize, Clone, Debug)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
// compute nodes).
@@ -48,7 +50,7 @@ pub struct LocalEnv {
// Default tenant ID to use with the 'zenith' command line utility, when
// --tenantid is not explicitly specified.
#[serde(default)]
pub default_tenantid: Option<HexZTenantId>,
pub default_tenant_id: Option<HexZTenantId>,
// used to issue tokens during e.g pg start
#[serde(default)]
@@ -58,9 +60,16 @@ pub struct LocalEnv {
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
#[serde(default)]
// A `HashMap<String, HashMap<ZTenantId, ZTimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
branch_name_mappings: HashMap<String, Vec<(HexZTenantId, HexZTimelineId)>>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct PageServerConf {
// node id
@@ -88,7 +97,7 @@ impl Default for PageServerConf {
}
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct SafekeeperConf {
pub id: ZNodeId,
@@ -144,6 +153,72 @@ impl LocalEnv {
self.base_data_dir.join("safekeepers").join(data_dir_name)
}
/// Remembers that `branch_name` refers to `timeline_id` for the given tenant.
///
/// Registering the exact same mapping a second time is a no-op.
///
/// # Errors
///
/// Fails if `branch_name` is already mapped to a different timeline for this tenant.
pub fn register_branch_mapping(
    &mut self,
    branch_name: String,
    tenant_id: ZTenantId,
    timeline_id: ZTimelineId,
) -> anyhow::Result<()> {
    let tenant_id = HexZTenantId::from(tenant_id);
    let timeline_id = HexZTimelineId::from(timeline_id);

    // Fetch (or create) the list of (tenant, timeline) pairs stored under this branch name.
    let tenant_timelines = self
        .branch_name_mappings
        .entry(branch_name.clone())
        .or_default();

    // Copy the previously mapped timeline id out of the list, so that the list
    // is free to be extended below.
    let already_mapped = tenant_timelines
        .iter()
        .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
        .map(|&(_, mapped_timeline_id)| mapped_timeline_id);

    match already_mapped {
        None => {
            tenant_timelines.push((tenant_id, timeline_id));
            Ok(())
        }
        Some(mapped_timeline_id) if mapped_timeline_id == timeline_id => Ok(()),
        Some(mapped_timeline_id) => bail!(
            "branch '{}' is already mapped to timeline {}, cannot map to another timeline {}",
            branch_name,
            mapped_timeline_id,
            timeline_id
        ),
    }
}
/// Looks up the timeline that `branch_name` is mapped to for the given tenant.
///
/// Returns `None` when the branch name is unknown, or when it has no mapping
/// for this particular tenant.
pub fn get_branch_timeline_id(
    &self,
    branch_name: &str,
    tenant_id: ZTenantId,
) -> Option<ZTimelineId> {
    let wanted_tenant_id = HexZTenantId::from(tenant_id);
    let tenant_timelines = self.branch_name_mappings.get(branch_name)?;

    // The first pair recorded for the tenant wins.
    for &(mapped_tenant_id, mapped_timeline_id) in tenant_timelines {
        if mapped_tenant_id == wanted_tenant_id {
            return Some(ZTimelineId::from(mapped_timeline_id));
        }
    }
    None
}
/// Builds the reverse lookup table: (tenant, timeline) id pair -> branch name.
///
/// Every registered branch name contributes one entry per (tenant, timeline)
/// pair stored under it; branch names are cloned into the returned map.
pub fn timeline_name_mappings(&self) -> HashMap<ZTenantTimelineId, String> {
    let mut names_by_id = HashMap::new();
    for (name, tenant_timelines) in &self.branch_name_mappings {
        for &(tenant_id, timeline_id) in tenant_timelines {
            let tenant_timeline_id = ZTenantTimelineId::new(
                ZTenantId::from(tenant_id),
                ZTimelineId::from(timeline_id),
            );
            names_by_id.insert(tenant_timeline_id, name.clone());
        }
    }
    names_by_id
}
/// Create a LocalEnv from a config file.
///
/// Unlike 'load_config', this function fills in any defaults that are missing
@@ -183,8 +258,8 @@ impl LocalEnv {
}
// If no initial tenant ID was given, generate it.
if env.default_tenantid.is_none() {
env.default_tenantid = Some(HexZTenantId::from(ZTenantId::generate()));
if env.default_tenant_id.is_none() {
env.default_tenant_id = Some(HexZTenantId::from(ZTenantId::generate()));
}
env.base_data_dir = base_path();
@@ -214,6 +289,39 @@ impl LocalEnv {
Ok(env)
}
/// Serializes this environment as TOML and writes it to `<base_path>/config`,
/// prefixed with a short explanatory header comment.
///
/// # Errors
///
/// Fails if the environment cannot be converted to TOML, or if the file
/// cannot be written.
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
    // The user initially supplies a config file via 'zenith init --config=<path>'.
    // `create_config` reads it and fills in any missing defaults; the result is
    // then stored in .zenith/config. TODO: formatting and comments of the
    // user's file are lost along the way, which is a bit sad.
    const HEADER: &str = r#"# This file describes a locale deployment of the page server
# and safekeeeper node. It is read by the 'zenith' command-line
# utility.
"#;

    // Serializing the struct directly, i.e.
    //
    //     toml::to_string_pretty(env)?
    //
    // fails with "values must be emitted before tables", even though the
    // table ('safekeepers: Vec<SafekeeperConf>') appears to be the last field.
    // Going through an intermediate toml::Value works around that.
    // See https://github.com/alexcrichton/toml-rs/issues/142
    let env_as_toml = toml::Value::try_from(self)?;

    let mut conf_content = String::from(HEADER);
    conf_content.push_str(&toml::to_string_pretty(&env_as_toml)?);

    let target_config_path = base_path.join("config");
    fs::write(&target_config_path, conf_content).with_context(|| {
        format!(
            "Failed to write config file into path '{}'",
            target_config_path.display()
        )
    })
}
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
let private_key_path = if self.private_key_path.is_absolute() {
@@ -232,15 +340,15 @@ impl LocalEnv {
pub fn init(&mut self) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
if base_path == Path::new("") {
bail!("repository base path is missing");
}
if base_path.exists() {
bail!(
"directory '{}' already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
);
}
ensure!(
base_path != Path::new(""),
"repository base path is missing"
);
ensure!(
!base_path.exists(),
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
fs::create_dir(&base_path)?;
@@ -292,36 +400,7 @@ impl LocalEnv {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
}
let mut conf_content = String::new();
// Currently, the user first passes a config file with 'zenith init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
write!(
&mut conf_content,
r#"# This file describes a locale deployment of the page server
# and safekeeeper node. It is read by the 'zenith' command-line
# utility.
"#
)?;
// Convert the LocalEnv to a toml file.
//
// This could be as simple as this:
//
// conf_content += &toml::to_string_pretty(env)?;
//
// But it results in a "values must be emitted before tables". I'm not sure
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
// Maybe rust reorders the fields to squeeze avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
fs::write(base_path.join("config"), conf_content)?;
Ok(())
self.persist_config(base_path)
}
}

View File

@@ -1,3 +1,4 @@
use std::convert::TryFrom;
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
@@ -5,22 +6,23 @@ use std::process::Command;
use std::time::Duration;
use std::{io, result, thread};
use anyhow::bail;
use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse};
use pageserver::timelines::TimelineInfo;
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use zenith_utils::http::error::HttpErrorBody;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::{HexZTenantId, HexZTimelineId, ZTenantId, ZTimelineId};
use crate::local_env::LocalEnv;
use crate::{fill_rust_env_vars, read_pidfile};
use pageserver::branches::BranchInfo;
use pageserver::tenant_mgr::TenantInfo;
use zenith_utils::connstring::connection_address;
@@ -98,9 +100,10 @@ impl PageServerNode {
pub fn init(
&self,
create_tenant: Option<&str>,
create_tenant: Option<ZTenantId>,
initial_timeline_id: Option<ZTimelineId>,
config_overrides: &[&str],
) -> anyhow::Result<()> {
) -> anyhow::Result<ZTimelineId> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let id = format!("id={}", self.env.pageserver.id);
@@ -137,19 +140,24 @@ impl PageServerNode {
]);
}
if let Some(tenantid) = create_tenant {
args.extend(["--create-tenant", tenantid])
let create_tenant = create_tenant.map(|id| id.to_string());
if let Some(tenant_id) = create_tenant.as_deref() {
args.extend(["--create-tenant", tenant_id])
}
let status = fill_rust_env_vars(cmd.args(args))
.status()
.expect("pageserver init failed");
let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
let initial_timeline_id_string = initial_timeline_id.to_string();
args.extend(["--initial-timeline-id", &initial_timeline_id_string]);
if !status.success() {
let init_output = fill_rust_env_vars(cmd.args(args))
.output()
.context("pageserver init failed")?;
if !init_output.status.success() {
bail!("pageserver init failed");
}
Ok(())
Ok(initial_timeline_id)
}
pub fn repo_path(&self) -> PathBuf {
@@ -310,7 +318,7 @@ impl PageServerNode {
}
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
self.http_request(Method::GET, format!("{}/status", self.http_base_url))
.send()?
.error_from_body()?;
Ok(())
@@ -318,64 +326,76 @@ impl PageServerNode {
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
Ok(self
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))
.send()?
.error_from_body()?
.json()?)
}
pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<()> {
Ok(self
.http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant"))
pub fn tenant_create(
&self,
new_tenant_id: Option<ZTenantId>,
) -> anyhow::Result<Option<ZTenantId>> {
let tenant_id_string = self
.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
.json(&TenantCreateRequest {
tenant_id: tenantid,
new_tenant_id: new_tenant_id.map(HexZTenantId::from),
})
.send()?
.error_from_body()?
.json()?)
.json::<Option<String>>()?;
tenant_id_string
.map(|id| {
id.parse().with_context(|| {
format!(
"Failed to parse tennat creation response as tenant id: {}",
id
)
})
})
.transpose()
}
pub fn branch_list(&self, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
Ok(self
pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
let timeline_infos: Vec<TimelineInfoResponse> = self
.http_request(
Method::GET,
format!("{}/branch/{}", self.http_base_url, tenantid),
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)
.send()?
.error_from_body()?
.json()?)
.json()?;
timeline_infos
.into_iter()
.map(TimelineInfo::try_from)
.collect()
}
pub fn branch_create(
pub fn timeline_create(
&self,
branch_name: &str,
startpoint: &str,
tenantid: &ZTenantId,
) -> Result<BranchInfo> {
Ok(self
.http_request(Method::POST, format!("{}/branch", self.http_base_url))
.json(&BranchCreateRequest {
tenant_id: tenantid.to_owned(),
name: branch_name.to_owned(),
start_point: startpoint.to_owned(),
tenant_id: ZTenantId,
new_timeline_id: Option<ZTimelineId>,
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<ZTimelineId>,
) -> anyhow::Result<Option<TimelineInfo>> {
let timeline_info_response = self
.http_request(
Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)
.json(&TimelineCreateRequest {
new_timeline_id: new_timeline_id.map(HexZTimelineId::from),
ancestor_start_lsn,
ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from),
})
.send()?
.error_from_body()?
.json()?)
}
.json::<Option<TimelineInfoResponse>>()?;
pub fn branch_get_by_name(
&self,
tenantid: &ZTenantId,
branch_name: &str,
) -> Result<BranchInfo> {
Ok(self
.http_request(
Method::GET,
format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name),
)
.send()?
.error_for_status()?
.json()?)
timeline_info_response
.map(TimelineInfo::try_from)
.transpose()
}
}

186
docs/rfcs/002-storage.md Normal file
View File

@@ -0,0 +1,186 @@
# Zenith storage node — alternative
## **Design considerations**
Simplify storage operations for people => Gain adoption/installs on laptops and small private installation => Attract customers to DBaaS by seamless integration between our tooling and cloud.
Proposed architecture addresses:
- High availability -- tolerates n/2 - 1 failures
- Multi-tenancy -- one storage for all databases
- Elasticity -- increase storage size on the go by adding nodes
- Snapshots / backups / PITR with S3 offload
- Compression
Minuses are:
- Quite a lot of work
- Single page access may touch a few disk pages
- Some bloat in data — may slow down sequential scans
## **Summary**
Storage cluster is a sharded key-value store with ordered keys. Key (**page_key**) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. Value is either a page or a page diff/wal. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when it grows bigger than the soft 100GB limit~~ by having a fixed range of pageno's it is responsible for. Chunk placement on storage nodes is stored in a separate metadata service, so a chunk can be freely moved around the cluster if needed. A chunk itself is a filesystem directory with the following subdirectories:
```
|-chunk_42/
|-store/ -- contains lsm with pages/pagediffs ranging from
| page_key_lo to page_key_hi
|-wal/
| |- db_1234/ db-specific wal files with pages from page_key_lo
| to page_key_hi
|
|-chunk.meta -- small file with snapshot references
(page_key_prefix+lsn+name)
and PITR regions (page_key_start, page_key_end)
```
## **Chunk**
Chunk is responsible for storing pages potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (**page_key**) with the following fields:
- `pg_id` -- unique id of given postgres instance (or postgres cluster as it is called in postgres docs)
- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance
- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later
- `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so table indices were closer to table itself on our global key space.
- `(forkno, segno, pageno)` -- page coordinates in postgres data files
- `lsn_timeline` -- postgres feature, increments when PITR was done.
- `lsn` -- lsn of current page version.
Chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. Processing node looks at the page in a wal record and sends the record to the chunk responsible for this page range. When a wal record arrives at a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then a background process moves records from those wal files to the lsm tree in `chunk_id/store`. Or, more precisely, wal records would be materialized into the lsm memtable, and when that memtable is flushed to an SSTable on disk we may trim the wal. That way some not durably (in the distributed sense) committed pages may enter the tree -- here we rely on processing node behavior: a page request from the processing node should contain proper lsn horizons so that the storage node may respond with the proper page version.
LSM here is a usual LSM for variable-length values: at first data is stored in memory (we hold incoming wal records to be able to regenerate it after restart) in some balanced tree. When this tree grows big enough we dump it into a disk file (SSTable), sorting records by key. Then SSTables are merge-sorted in the background into different files. All file operations are sequential and do not require WAL for durability.
Content of SSTable can be following:
```jsx
(pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data)
(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff)
(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff)
(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff)
(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data)
(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff)
(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff)
(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data)
```
So a query for `pageno=42 up to lsn=260` would need to find the closest entry less than this key, iterate back to the latest full page and iterate forward to apply diffs. How often a page is materialized in the lsn-version sequence is up to us -- let's say each 5th version should be a full page.
### **Page deletion**
To delete old pages we insert a blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into the lsm tree. During merges such a marker would indicate that all pages with a smaller lsn should be discarded. The delete marker will travel down the tree level hierarchy until it reaches the last level. In a non-PITR scenario where old page versions are not needed at all, such a deletion marker would (on average) prevent the propagation of old page versions down the tree -- so all bloat would concentrate at higher tree layers without affecting the bigger bottom layers.
### **Recovery**
Upon storage node restart recent WAL files are applied to appropriate pages and resulting pages stored in lsm memtable. So this should be fast since we are not writing anything to disk.
### **Checkpointing**
No such mechanism is needed. Or we may look at the storage node as a kind of continuous checkpointer.
### **Full page writes (torn page protection)**
Storage node never updates individual pages, it only merges SSTables, so torn pages are not an issue.
### **Snapshot**
That is the part that I like about this design -- snapshot creation is instant and cheap operation that can have flexible granularity level: whole instance, database, table. Snapshot creation inserts a record in `chunk.meta` file with lsn of this snapshot and key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits pages deletion within this range. Storage node may not know anything about page internals, but by changing number of fields in our prefix we may change snapshot granularity.
It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that a snapshot of a relation would include its indices. Also a table snapshot would trickily interact with the catalog. Probably all table snapshots should also hold a catalog snapshot. And when a node is started with such a snapshot it should check that only tables from the snapshot are queried. I assume here that for snapshot reading one needs to start a new postgres instance.
Storage consumed by a snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on the cost of different storages) about when to offload an old snapshot to s3. For example, if the current database has more than 40% of changed pages with respect to the previous snapshot then we may offload that snapshot to s3, and release this space.
**Starting db from snapshot**
When we are starting database from snapshot it can be done in two ways. First, we may create new db_id, move all the data from snapshot to a new db and start a database. Second option is to create Copy-on-Write (CoW) instance out of snapshot and read old pages from old snapshot and store new pages separately. That is why there is `db_timeline` key field near `db_id` -- CoW (🐮) database should create new `db_timeline` and remember old `db_timeline`. Such a database can have a hashmap of pages that it has changed, to query pages from the proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by new instance would not bloat data of initial snapshot. It is not clear whether it is possible to effectively support "stacked" CoW snapshots, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- so we may scan neighboring pages and find the right one. But again that way we bloat snapshot with unrelated data and may slow down full scans that are happening in a different database).
**Snapshot export/import**
Once we may start CoW instances it is easy to run auxiliary postgres instance on this snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` and export data from the snapshot to some portable formats. Also we may start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots via network.
### **PITR area**
In described scheme PITR is just a prohibition to delete any versions within some key prefix, either it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc.
PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push same (or bigger) SSTables to s3 and maintain lsm structure there.
### **Compression**
Since we are storing page diffs of variable sizes there is no structural dependency on a page size and we may compress it. Again that could be enabled only on pages with some key prefixes, so we may have this with db/table granularity.
### **Chunk metadata**
Chunk metadata is a file that lies in the chunk directory and stores info about current snapshots and PITR regions. A chunk should always consult this data when merging SSTables and applying delete markers.
### **Chunk splitting**
*(NB: following paragraph is about how to avoid page splitting)*
When a chunk hits some soft storage limit (let's say 100Gb) it should be split in half and global metadata about chunk boundaries should be updated. Here I assume that chunk split is a local operation happening on a single node. The process of chunk splitting should look like the following:
1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries.
2. Prohibit WAL deletion and old SSTables deletion on original chunk.
3. On each lsm layer we would need to split only one SSTable, all other would fit within left or right range. Symlink/split that files to new chunks.
4. Start WAL replay on new chunks.
5. Update global metadata about new chunk boundaries.
6. Eventually (metadata update should be pushed to processing node by metadata service) storage node will start sending WAL and page requests to the new nodes.
7. New chunk may start serving read queries when following conditions are met:
a) it receives at least one WAL record from processing node
b) it replayed all WAL up to the new received one
c) checked by downlinks that there were no WAL gaps.
Chunk split as it is described here is quite fast operation when it is happening on the local disk -- vast majority of files will be just moved without copying anything. I suggest to keep split always local and not to mix it with chunk moving around cluster. So if we want to split some chunk but there is small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting.
### Fixed chunks
An alternative strategy is not to split at all and to have pageno-fixed chunk boundaries. When a table is created we first materialize this chunk by storing only the first new pages, so the chunk is small. Then the chunk grows while the table is filled, but it can't grow substantially bigger than the allowed pageno range, so at max it would be 1GB or whatever limit we want + some bloat due to snapshots and old page versions.
### **Chunk lsm internals**
So how to implement chunk's lsm?
- Write from scratch and use RocksDB to prototype/benchmark, then switch to own lsm implementation. RocksDB can provide some sanity check for performance of home-brewed implementation and it would be easier to prototype.
- Use postgres as lego constructor. We may model memtable with postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse postgres external merging algorithm, etc. One thing that would definitely not fit (or I didn't come up with an idea of how to fit that) -- is multi-tenancy. If we are storing pages from different databases we can't use postgres buffer pool, since there is no db_id in the page header. We can add a new field there but IMO it would be a no go for committing that to vanilla.
Another possibility is not to try to fit several databases in one storage node. But that way it is a no go for a multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, each with its own local page cache. So that would be much closer to ordinary managed RDS.
Multi-tenant storage makes sense even on a laptop, when you work with different databases, running tests with temp database, etc. And when the installation grows bigger it starts to make more and more sense, so it seems important.
# Storage fleet
# **Storage fleet**
- When a database is smaller than a chunk size we naturally can store several of them in one chunk (since their page_key would fit in some chunk's [lo, hi) range).
<img width="937" alt="Screenshot_2021-02-22_at_16 49 17" src="https://user-images.githubusercontent.com/284219/108729836-ffcbd200-753b-11eb-9412-db802ec30021.png">
Few databases are stored in one chunk, replicated three times
- When a database can't fit into one storage node it can occupy lots of chunks that were split while the database was growing. Chunk placement on nodes is controlled by us with some automation, but we may always manually move chunks around the cluster.
<img width="940" alt="Screenshot_2021-02-22_at_16 49 10" src="https://user-images.githubusercontent.com/284219/108729815-fb071e00-753b-11eb-86e0-be6703e47d82.png">
Here one big database occupies two sets of nodes. Also some chunks were moved around to restore replication factor after disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel.
## **Chunk placement strategies**
There are few scenarios where we may want to move chunks around the cluster:
- disk usage on some node is big
- some disk experienced a failure
- some node experienced a failure or need maintenance
## **Chunk replication**
Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating global metadata, waiting for WAL to come, replaying previous WAL and becoming online -- more or less like during chunk split.

267
docs/rfcs/003-laptop-cli.md Normal file
View File

@@ -0,0 +1,267 @@
# Command line interface (end-user)
Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start.
This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots.
The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots.
# Possible usage scenarios
## Install zenith, run a postgres
```
> brew install pg-zenith
> zenith pg create # creates pgdata with default pattern pgdata$i
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 0G zenith-local localhost:5432
```
## Import standalone postgres to zenith
```
> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg
[====================------------] 60% | 20MB/s
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
> zenith pg create --snapshot oldpg
Started postgres on localhost:5432
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 5G zenith-local localhost:5432
> zenith snapshot destroy oldpg
Ok
```
Also, we may start snapshot import implicitly by looking at snapshot schema
```
> zenith pg create --snapshot basebackup://replication@localhost:5432/
Downloading snapshot... Done.
Started postgres on localhost:5432
Destroying snapshot... Done.
```
## Pull snapshot with some publicly shared database
Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage).
```
> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies
```
## Create snapshot and push it to the cloud
```
> zenith snapshot create pgdata1@snap1
> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1
```
## Rollback database to the snapshot
One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`.
```
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 5G zenith-local localhost:5432
> zenith snapshot create pgdata1@snap1
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
pgdata1@snap1 6G -
pgdata1@CURRENT 6G -
> zenith pg checkout pgdata1@snap1
Stopping postgres on pgdata1.
Rolling back pgdata1@CURRENT to pgdata1@snap1.
Starting postgres on pgdata1.
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
pgdata1@snap1 6G -
pgdata1@HEAD{0} 6G -
pgdata1@CURRENT 6G -
```
Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state of the database in the data directory. When we are checking out some snapshot CURRENT will be set to this snapshot and the old CURRENT state will be named HEAD{0} (0 is the number of postgres timeline, it would be incremented after each such checkout).
## Configure PITR area (Point In Time Recovery).
PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite).
```
> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
```
Resetting the database to some state in the past would require creating a snapshot on some lsn / time in this PITR area.
# Manual
## storage
Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
**zenith storage attach** -t [native|s3] -c key=value -n name
Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'.
**zenith storage list**
Show currently attached storages. For example:
```
> zenith storage list
NAME USED TYPE OPTIONS PATH
local 5.1G zenith-local /opt/zenith/store/local
local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr
zcloud 60G zenith-remote zenith.tech/stas/mystore
s3tank 80G S3
```
**zenith storage detach**
**zenith storage show**
## pg
Manages postgres data directories and can start postgreses with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themselves.
Pg is a term for a single postgres running on some data. I'm trying to avoid separating datadir management from postgres instance management here -- both of those concepts are bundled together.
**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr.
--no-start: just init datadir without starting postgres
--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1)
--cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of a currently running database)
**zenith pg destroy**
**zenith pg start** [--replica] pgdata
Start postgres with proper extensions preloaded/installed.
**zenith pg checkout**
Rollback data directory to some previous snapshot.
**zenith pg stop** pg_id
**zenith pg list**
```
ROLE PGDATA USED STORAGE ENDPOINT
primary my_pg 5.1G local localhost:5432
replica-1 localhost:5433
replica-2 localhost:5434
primary my_pg2 3.2G local.compr localhost:5435
- my_pg3 9.2G local.compr -
```
**zenith pg show**
```
my_pg:
storage: local
space used on local: 5.1G
space used on all storages: 15.1G
snapshots:
on local:
snap1: 1G
snap2: 1G
on zcloud:
snap2: 1G
on s3tank:
snap5: 2G
pitr:
on s3tank:
pitr_one_month: 45G
```
**zenith pg start-rest/graphql** pgdata
Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea.
## snapshot
Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout.
**zenith snapshot create** pgdata_name@snap_name
Creates a new snapshot in the same storage where pgdata_name exists.
**zenith snapshot push** --to url pgdata_name@snap_name
Produces a binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. The receiving side should start `zenith snapshot recv` before the push happens. If the url has some special schema like zenith://, the receiving side may require auth and start `zenith snapshot recv` on the go.
**zenith snapshot recv**
Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket.
**zenith snapshot pull** --from url or path
Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format.
**zenith snapshot import** --from basebackup://<...> or path
Creates a new snapshot out of running postgres via basebackup protocol or basebackup files.
**zenith snapshot export**
Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay).
**zenith snapshot diff** snap1 snap2
Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses.
**zenith snapshot destroy**
## pitr
Pitr represents wal stream and ttl policy for that stream
XXX: any suggestions on a better name?
**zenith pitr create** name
--ttl = inf | period
--size-limit = inf | limit
--storage = storage_name
**zenith pitr extract-snapshot** pitr_name --lsn xxx
Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export)
**zenith pitr gc** pitr_name
Force garbage collection on some PITR area.
**zenith pitr list**
**zenith pitr destroy**
## console
**zenith console**
Opens browser targeted at web console with the more or less same functionality as described here.

218
docs/rfcs/004-durability.md Normal file
View File

@@ -0,0 +1,218 @@
Durability & Consensus
======================
When a transaction commits, a commit record is generated in the WAL.
When do we consider the WAL record as durable, so that we can
acknowledge the commit to the client and be reasonably certain that we
will not lose the transaction?
Zenith uses a group of WAL safekeeper nodes to hold the generated WAL.
A WAL record is considered durable, when it has been written to a
majority of WAL safekeeper nodes. In this document, I use 5
safekeepers, because I have five fingers. A WAL record is durable,
when at least 3 safekeepers have written it to disk.
First, assume that only one primary node can be running at a
time. This can be achieved by Kubernetes or etcd or some
cloud-provider specific facility, or we can implement it
ourselves. These options are discussed in later chapters. For now,
assume that there is a Magic STONITH Fairy that ensures that.
In addition to the WAL safekeeper nodes, the WAL is archived in
S3. WAL that has been archived to S3 can be removed from the
safekeepers, so the safekeepers don't need a lot of disk space.
+----------------+
+-----> | WAL safekeeper |
| +----------------+
| +----------------+
+-----> | WAL safekeeper |
+------------+ | +----------------+
| Primary | | +----------------+
| Processing | ---------+-----> | WAL safekeeper |
| Node | | +----------------+
+------------+ | +----------------+
\ +-----> | WAL safekeeper |
\ | +----------------+
\ | +----------------+
\ +-----> | WAL safekeeper |
\ +----------------+
\
\
\
\
\ +--------+
\ | |
+--> | S3 |
| |
+--------+
Every WAL safekeeper holds a section of WAL, and a VCL value.
The WAL can be divided into three portions:
VCL LSN
| |
V V
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
Archived WAL Completed WAL In-flight WAL
Note that all this WAL kept in a safekeeper is a contiguous section.
This is different from Aurora: In Aurora, there can be holes in the
WAL, and there is a Gossip protocol to fill the holes. That could be
implemented in the future, but let's keep it simple for now. WAL needs
to be written to a safekeeper in order. However, during crash
recovery, In-flight WAL that has already been stored in a safekeeper
can be truncated or overwritten.
The Archived WAL has already been stored in S3, and can be removed from
the safekeeper.
The Completed WAL has been written to at least three safekeepers. The
algorithm ensures that it is not lost, when at most two nodes fail at
the same time.
The In-flight WAL has been persisted in the safekeeper, but if a crash
happens, it may still be overwritten or truncated.
The VCL point is determined in the Primary. It is not strictly
necessary to store it in the safekeepers, but it allows some
optimizations and sanity checks and is probably generally useful for
the system as a whole. The VCL values stored in the safekeepers can lag
behind the VCL computed by the primary.
Primary node Normal operation
-----------------------------
1. Generate some WAL.
2. Send the WAL to all the safekeepers that you can reach.
3. As soon as a quorum of safekeepers have acknowledged that they have
received and durably stored the WAL up to that LSN, update local VCL
value in memory, and acknowledge commits to the clients.
4. Send the new VCL to all the safekeepers that were part of the quorum.
(Optional)
Primary Crash recovery
----------------------
When a new Primary node starts up, before it can generate any new WAL
it needs to contact a majority of the WAL safekeepers to compute the
VCL. Remember that there is a Magic STONITH fairy that ensures that
only one node can be doing this at a time.
1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you
can reach. This is the Winner safekeeper, and its LSN becomes the new VCL.
2. Update the other safekeepers you can reach, by copying all the WAL
from the Winner, starting from each safekeeper's old VCL point. Any old
In-Flight WAL from previous Epoch is truncated away.
3. Increment Epoch, and send the new Epoch to the quorum of
safekeepers. (This ensures that if any of the safekeepers that we
could not reach later come back online, they will be considered as
older than this in any future recovery)
You can now start generating new WAL, starting from the newly-computed
VCL.
Optimizations
-------------
As described, the Primary node sends all the WAL to all the WAL safekeepers. That
can be a lot of network traffic. Instead of sending the WAL directly from Primary,
some safekeepers can be daisy-chained off other safekeepers, or there can be a
broadcast mechanism among them. There should still be a direct connection from
each safekeeper to the Primary for the acknowledgments though.
Similarly, the responsibility for archiving WAL to S3 can be delegated to one of
the safekeepers, to reduce the load on the primary.
Magic STONITH fairy
-------------------
Now that we have a system that works as long as only one primary node is running at a time, how
do we ensure that?
1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary
when it's holding a valid lease. If the primary node dies, the lease expires after a timeout
period, and a new node is allowed to become the primary.
2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you
cannot do this safely. In practice, it would probably be OK if you make the lease times and
timeouts long enough. This has the advantage that we don't need to introduce a new
component to the architecture.
3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The
next chapter describes this option.
Built-in Paxos
--------------
The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes
as both Proposers and Learners.
Each WAL safekeeper holds an Epoch value in addition to the VCL and
the WAL. Each request by the primary to safekeep WAL is accompanied by
an Epoch value. If a safekeeper receives a request with Epoch that
doesn't match its current Accepted Epoch, it must ignore (NACK) it.
(In different Paxos papers, Epochs are called "terms" or "round
numbers")
When a node wants to become the primary, it generates a new Epoch
value that is higher than any previously observed Epoch value, and
globally unique.
Accepted Epoch: 555 VCL LSN
| |
V V
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
Archived WAL Completed WAL In-flight WAL
Primary node startup:
1. Contact all WAL safekeepers that you can reach (if you cannot
connect to a quorum of them, you can give up immediately). Find the
latest Epoch among them.
2. Generate a new globally unique Epoch, greater than the latest Epoch
found in previous step.
3. Send the new Epoch in a Prepare message to a quorum of
safekeepers. (PAXOS Prepare message)
4. Each safekeeper responds with a Promise. If a safekeeper has
already made a promise with a higher Epoch, it doesn't respond (or
responds with a NACK). After making a promise, the safekeeper stops
responding to any write requests with earlier Epoch.
5. Once you have received a majority of promises, you know that the
VCL cannot advance on the old Epoch anymore. This effectively kills
any old primary server.
6. Find the highest written LSN among the quorum of safekeepers (these
can be included in the Promise messages already). This is the new
VCL. If a new node starts the election process after this point,
it will compute the same or higher VCL.
7. Copy the WAL from the safekeeper with the highest LSN to the other
safekeepers in the quorum, using the new Epoch. (PAXOS Accept
phase)
8. You can now start generating new WAL starting from the VCL. If
another process starts the election process after this point and
gains control of a majority of the safekeepers, we will no longer
be able to advance the VCL.

View File

@@ -0,0 +1,103 @@
# Zenith local
Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome.
#### Why do we need it?
- For distribution - this easy to use binary will help us to build adoption among developers.
- For internal use - to test all components together.
In my understanding, we consider it to be just a mock-up version of zenith-cloud.
> Question: How much should we care about durability and security issues for a local setup?
#### Why is it better than a simple local postgres?
- Easy one-line setup. As simple as `cargo install zenith && zenith start`
- Quick and cheap creation of compute nodes over the same storage.
> Question: How can we describe a use-case for this feature?
- Zenith-local can work with S3 directly.
- Push and pull images (snapshots) to remote S3 to exchange data with other users.
- Quick and cheap snapshot checkouts to switch back and forth in the database history.
> Question: Do we want it in the very first release? This feature seems quite complicated.
#### Distribution:
Ideally, just one binary that incorporates all elements we need.
> Question: Let's discuss pros and cons of having a separate package with modified PostgreSQL.
#### Components:
- **zenith-CLI** - interface for end-users. Turns commands into REST requests and handles responses to show them in a user-friendly way.
CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli
- **zenith-console** - WEB UI with same functionality as CLI.
>Note: not for the first release.
- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
> Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local.
- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
> Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server?
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src
- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith.
> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)?
> Question: Do we use it together with local page store or they are interchangeable?
WIP code is ???
- **zenith-safekeeper** - receives WAL from postgres, stores it durably, and answers to Postgres that "sync" has succeeded.
> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system.
WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper
- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node
#### REST API:
Service endpoint: `http://localhost:3000`
Resources:
- /storages - Where data lives: zenith-pageserver or zenith-s3
- /pgs - Postgres - zenith-computenode
- /snapshots - snapshots **TODO**
>Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
Methods and their mapping to CLI:
- /storages - zenith-pageserver or zenith-s3
CLI | REST API
------------- | -------------
storage attach -n name --type [native\s3] --path=[datadir\URL] | PUT -d { "name": "name", "type": "native", "path": "/tmp" } /storages
storage detach -n name | DELETE /storages/:storage_name
storage list | GET /storages
storage show -n name | GET /storages/:storage_name
- /pgs - zenith-computenode
CLI | REST API
------------- | -------------
pg create -n name --s storage_name | PUT -d { "name": "name", "storage_name": "storage_name" } /pgs
pg destroy -n name | DELETE /pgs/:pg_name
pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"} /pgs/:pg_name /actions
pg stop -n name | POST -d {"action": "stop"} /pgs/:pg_name /actions
pg promote -n name | POST -d {"action": "promote"} /pgs/:pg_name /actions
pg list | GET /pgs
pg show -n name | GET /pgs/:pg_name
- /snapshots **TODO**
CLI | REST API
------------- | -------------

View File

@@ -0,0 +1,64 @@
Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
# CLI v2 (after chatting with Carl)
Zenith introduces the notion of a repository.
```bash
zenith init
zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
```
Once you have a cluster catalog you can explore it
```bash
zenith log -- returns a list of commits
zenith status -- returns if there are changes in the catalog that can be committed
zenith commit -- commits the changes and generates a new commit hash
zenith branch experimental <hash> -- creates a branch called experimental based on a given commit hash
```
To make changes in the catalog you need to run compute nodes
```bash
-- here is how you start a compute node
zenith start /home/pipedpiper/northwind:main -- starts a compute instance
zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
-- you can start a compute node against any hash or branch
zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
-- you can start a compute node against any hash or branch
zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
-- After running some DML you can run
-- zenith status and see how there are two WAL streams one on top of
-- the main branch
zenith status
-- and another on top of the experimental branch
zenith status -b experimental
-- you can commit each branch separately
zenith commit main
-- or
zenith commit -c /home/pipedpiper/northwind:experimental
```
Starting compute instances against cloud environments
```bash
-- you can start a compute instance against the cloud environment
-- in this case all of the changes will be streamed into the cloud
zenith start https://zenith.tech/pipedpiper/northwind:main
zenith start https://zenith.tech/pipedpiper/northwind:main
zenith status -c https://zenith.tech/pipedpiper/northwind:main
zenith commit -c https://zenith.tech/pipedpiper/northwind:main
zenith branch -c https://zenith.tech/pipedpiper/northwind:<hash> experimental
```
Pushing data into the cloud
```bash
-- pull all the commits from the cloud
zenith pull
-- push all the commits to the cloud
zenith push
```

View File

@@ -0,0 +1,140 @@
# Repository format
A Zenith repository is similar to a traditional PostgreSQL backup
archive, like a WAL-G bucket or pgbarman backup catalogue. It holds
multiple versions of a PostgreSQL database cluster.
The distinguishing feature is that you can launch a Zenith Postgres
server directly against a branch in the repository, without having to
"restore" it first. Also, Zenith manages the storage automatically,
there is no separation between full and incremental backups nor WAL
archive. Zenith relies heavily on the WAL, and uses concepts similar
to incremental backups and WAL archiving internally, but it is hidden
from the user.
## Directory structure, version 1
This first version is pretty straightforward but not very
efficient. Just something to get us started.
The repository directory looks like this:
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
.zenith/refs/branches/mybranch
.zenith/refs/tags/foo
.zenith/refs/tags/bar
.zenith/datadirs/<timeline uuid>
### Timelines
A timeline is similar to PostgreSQL's timeline, but is identified by a
UUID instead of a 32-bit timeline Id. For user convenience, it can be
given a name that refers to the UUID (called a branch).
All WAL is generated on a timeline. You can launch a read-only node
against a tag or arbitrary LSN on a timeline, but in order to write,
you need to create a timeline.
Each timeline is stored in a directory under .zenith/timelines. It
consists of a WAL archive, containing all the WAL in the standard
PostgreSQL format, under the wal/ subdirectory.
The 'snapshots/' subdirectory contains "base backups" of the data
directory at different LSNs. Each snapshot is simply a copy of the
Postgres data directory.
When a new timeline is forked from a previous timeline, the ancestor
timeline's UUID is stored in the 'history' file.
### Refs
There are two kinds of named objects in the repository: branches and
tags. A branch is a human-friendly name for a timeline UUID, and a
tag is a human-friendly name for a specific LSN on a timeline
(timeline UUID + LSN). Like in git, these are just for user
convenience; you can also use timeline UUIDs and LSNs directly.
Refs do have one additional purpose though: naming a timeline or LSN
prevents it from being automatically garbage collected.
The refs directory contains a small text file for each tag/branch. It
contains the UUID of the timeline (and LSN, for tags).
### Datadirs
.zenith/datadirs contains PostgreSQL data directories. You can launch
a Postgres instance on one of them with:
```
postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
```
All the actual data is kept in the timeline directories, under
.zenith/timelines. The data directories are only needed for active
PostgreSQL instances. After an instance is stopped, the data directory
can be safely removed. "zenith start" will recreate it quickly from
the data in .zenith/timelines, if it's missing.
## Version 2
The format described above isn't very different from a traditional
daily base backup + WAL archive configuration. The main difference is
the nicer naming of branches and tags.
That's not very efficient. For performance, we need something like
incremental backups that don't require making a full copy of all
data. So only store modified files or pages. And instead of having to
replay all WAL from the last snapshot, "slice" the WAL into
per-relation WAL files and only recover what's needed when a table is
accessed.
In version 2, the file format in the "snapshots" subdirectory gets
more advanced. The exact format is TODO. But it should support:
- storing WAL records of individual relations/pages
- storing a delta from an older snapshot
- compression
## Operations
### Garbage collection
When you run "zenith gc", old timelines that are no longer needed are
removed. That involves collecting the list of "unreachable" objects,
starting from the named branches and tags.
Also, if enough WAL has been generated on a timeline since last
snapshot, a new snapshot or delta is created.
### zenith push/pull
Compare the tags and branches on both servers, and copy missing ones.
For each branch, compare the timeline it points to in both servers. If
one is behind the other, copy the missing parts.
FIXME: how do you prevent confusion if you have two clones of the same
repository, launch an instance on the same branch in both clones, and
later try to push/pull between them? Perhaps create a new timeline
every time you start up an instance? Then you would detect that the
timelines have diverged. That would match with the "epoch" concept
that we have in the WAL safekeeper.
### zenith checkout/commit
In this format, there is no concept of a "working tree", and hence no
concept of checking out or committing. All modifications are done on
a branch or a timeline. As soon as you launch a server, the changes are
appended to the timeline.
You can easily fork off a temporary timeline to emulate a "working tree".
You can later remove it and have it garbage collected, or to "commit",
re-point the branch to the new timeline.
If we want to have a worktree and "zenith checkout/commit" concept, we can
emulate that with a temporary timeline. Create the temporary timeline at
"zenith checkout", and have "zenith commit" modify the branch to point to
the new timeline.

View File

@@ -0,0 +1,93 @@
How it works now
----------------
1. Create repository, start page server on it
```
$ zenith init
...
created main branch
new zenith repository was created in .zenith
$ zenith pageserver start
Starting pageserver at '127.0.0.1:64000' in .zenith
Page server started
```
2. Create a branch, and start a Postgres instance on it
```
$ zenith branch heikki main
branching at end of WAL: 0/15ECF68
$ zenith pg create heikki
Initializing Postgres on timeline 76cf9279915be7797095241638e64644...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432
$ zenith pg start pg1
Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki'
waiting for server to start.... done
server started
```
3. Connect to it and run queries
```
$ psql "dbname=postgres port=55432"
psql (14devel)
Type "help" for help.
postgres=#
```
Proposal: Serverless on your Laptop
-----------------------------------
We've been talking about doing the "pg create" step automatically at
"pg start", to eliminate that step. What if we go further, go
serverless on your laptop, so that the workflow becomes just:
1. Create repository, start page server on it (same as before)
```
$ zenith init
...
created main branch
new zenith repository was created in .zenith
$ zenith pageserver start
Starting pageserver at '127.0.0.1:64000' in .zenith
Page server started
```
2. Create branch
```
$ zenith branch heikki main
branching at end of WAL: 0/15ECF68
```
3. Connect to it:
```
$ psql "dbname=postgres port=5432 branch=heikki"
psql (14devel)
Type "help" for help.
postgres=#
```
The trick behind the scenes is that when you launch the page server,
it starts to listen on port 5432. When you connect to it with psql, it
looks at the 'branch' parameter that you passed in the connection
string. It automatically performs the "pg create" and "pg start" steps
for that branch, and then forwards the connection to the Postgres
instance that it launched. After you disconnect, if there are no more
active connections to the server running on the branch, it can
automatically shut it down again.
This is how serverless would work in the cloud. We can do it on your
laptop, too.

View File

@@ -0,0 +1,66 @@
# Push and pull between pageservers
Here is a proposal about implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3 but that would depend on the exact storage format so we don't touch that in this proposal.
## Origin management
The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that).
```
zenith origin add <name> <connection_uri>
zenith origin list
zenith origin remove <name>
```
Connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport.
Behind the scenes, these commands may update a toml file inside the .zenith directory.
## Push
### Pushing branch
```
zenith push mybranch cloudserver # push to eponymous branch in cloudserver
zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
```
Exact mechanics would be slightly different in the following situations:
1) Destination branch does not exist.
That is the simplest scenario. We can just create an empty branch (or timeline in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines so I suggest skipping any checks that there is a common ancestor and just fill it with data. Later when CoW timelines will land to the pageserver we may add that check and decide whether this timeline belongs to this pageserver repository or not [*].
The exact mechanics may be the following:
* CLI asks local pageserver to perform push and hands over connection uri: `perform_push <branch_name> <uri>`.
* local pageserver connects to the remote pageserver and runs `branch_push <branch_name> <timeline_id>`
Handler for branch_push would create destination timeline and switch connection to copyboth mode.
* Sending pageserver may start iterator on that timeline and send all the records as copy messages.
2) Destination branch exists and latest_valid_lsn is less than ours.
In this case, we need to send missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send ones that are newer than remote LSN. Later we probably should add a sparse bitmap that would track changed pages to avoid full scan.
3) Destination branch exists and latest_valid_lsn is bigger than ours.
In this case, we can't push to that branch. We can only pull.
### Pulling branch
Here we need to handle the same three cases, but also keep in mind that local pageserver can be behind NAT and we can't trivially re-use pushing by asking remote to 'perform_push' to our address. So we would need a new set of commands:
* CLI calls `perform_pull <branch_name> <uri>` on local pageserver.
* local pageserver calls `branch_pull <branch_name> <timeline_id>` on remote pageserver.
* remote pageserver sends records in our direction
But despite the different set of commands, the code that performs iteration over records and the receiving code that inserts those records can be the same for both pull and push.
[*] It looks to me that there are two different possible approaches to handling unrelated timelines:
1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not.
2) Transparently create and manage several repositories in one pageserver.
But that is the topic for a separate RFC/discussion.

View File

@@ -0,0 +1,56 @@
While working on export/import commands, I understood that they fit really well into "snapshot-first design".
We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files.
Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith.
So here is an attempt to design a consistent CLI for different usage scenarios:
#### 1. Start empty pageserver.
That is what we have now.
Init empty pageserver using `initdb` in temporary directory.
`--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming : https://wal-g.readthedocs.io/STORAGES/.
Save `storage_dest` and other parameters in config.
Push snapshots to `storage_dest` in background.
```
zenith init --storage_dest=S3_PREFIX
zenith start
```
#### 2. Restart pageserver (manually or crash-recovery).
Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`.
Push snapshots to `storage_dest` in background.
```
zenith start
```
#### 3. Import.
Start pageserver from existing snapshot.
Path to snapshot provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time operation.
Save `storage_dest` parameters in config.
Push snapshots to `storage_dest` in background.
```
//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
zenith start
```
How to pass credentials needed for `snapshot_path`?
#### 4. Export.
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
```
zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
```
#### Notes and questions
- walkeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
- We can think of better names for all options.
- Export to plain postgres format will be useless, if we are not 100% compatible on page level.
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.

View File

@@ -0,0 +1,227 @@
# Preface
GetPage@LSN can be called with older LSNs, and the page server needs
to be able to reconstruct older page versions. That's needed for
having read-only replicas that lag behind the primary, or that are
"anchored" at an older LSN, and internally in the page server when you
branch at an older point in time. How do you do that?
For now, I'm not considering incremental snapshots at all. I don't
think that changes things. So whenever you create a snapshot or a
snapshot file, it contains an image of all the pages, there is no need
to look at an older snapshot file.
Also, I'm imagining that this works on a per-relation basis, so that
each snapshot file contains data for one relation. A "relation" is a
fuzzy concept - it could actually be one 1 GB relation segment. Or it
could include all the different "forks" of a relation, or you could
treat each fork as a separate relation for storage purpose. And once
the "non-relational" work is finished, a "relation" could
actually mean some other versioned object kept in the PostgreSQL data
directory. Let's ignore that for now.
# Eric's RFC:
Every now and then, you create a "snapshot". It means that you create
a new snapshot file for each relation that was modified after the last
snapshot, and write out the contents of the relation as it is/was at the
snapshot LSN. Write-ahead log is stored separately in S3 by the WAL
safekeeping service, in the original PostgreSQL WAL file format.
SNAPSHOT @100 WAL
. |
. |
. |
. |
SNAPSHOT @200 |
. |
. |
. |
. |
SNAPSHOT @300 |
. |
. V
IN-MEMORY @400
If a GetPage@LSN request comes from the primary, you return the latest
page from the in-memory layer. If there is no trace of the page in
memory, it means that it hasn't been modified since the last snapshot,
so you return the page from the latest snapshot, at LSN 300 in the
above example.
PITR is implemented using the original WAL files:
If a GetPage@LSN request comes from a read replica with LSN 250, you
read the image of the page from the snapshot at LSN 200, and you also
scan the WAL between 200 and 250, and apply all WAL records for the
requested page, to reconstruct it at LSN 250.
Scanning the WAL naively for every GetPage@LSN request would be
expensive, so in practice you'd construct an in-memory data structure
of all the WAL between 200 and 250 once that allows quickly looking up
records for a given page.
## Problems/questions
I think you'll need to store the list of snapshot LSNs on each
timeline somewhere.
If the latest snapshot of a relation is at LSN 100, and you request a
page at LSN 1000000, how do you know if there are some modifications
to it between 100 and 1000000 that you need to replay? You can scan
all the WAL between 100 and 1000000, but that would be expensive.
You can skip that, if you know that a snapshot was taken e.g. at LSN
999900. Then you know that the fact that there is no snapshot file at
999900 means that the relation hasn't been modified between
100-999900. Then you only need to scan the WAL between 999900 and
1000000. However, there is no trace of a snapshot happening at LSN
999900 in the snapshot file for this relation, so you need to get
that information from somewhere else.
Where do you get that information from? Perhaps you can scan all the
other relations, and if you see a snapshot file for *any* relation at
LSN 999900, you know that if there were modifications to this
relation, there would be a newer snapshot file for it, too. In other
words, the list of snapshots that have been taken can be constructed
by scanning all relations and computing the union of all snapshot LSNs
that you see for any relation. But that's expensive so at least you
should keep that in memory, after computing it once. Also, if you rely
on that, it's not possible to have snapshots at different intervals
for different files. That seems limiting.
Another option is to explicitly store a list of snapshot LSNs in a
separate metadata file.
# Current implementation in the 'layered_repo' branch:
We store snapshot files like in the RFC, but each snapshot file also
contains all the WAL in the range of LSNs, so that you don't need to
fetch the WAL separately from S3. So you have "layers" like this:
SNAPSHOT+WAL 100-200
|
|
|
|
SNAPSHOT+WAL 200-300
|
|
|
|
IN-MEMORY 300-
Each "snapshot+WAL" is a file that contains a snapshot - i.e. full
copy of each page in the relation, at the *start* LSN. In addition to
that, it contains all the WAL applicable to the relation from the
start LSN to the end LSN. With that, you can reconstruct any page
version in the range that the file covers.
## Problems/questions
I can see one potential performance issue here, compared to the RFC.
Let's focus on a single relation for now. Imagine that you start from
an empty relation, and you receive WAL from 100 to 200, containing
a bunch of inserts and updates to the relation. You now have all that
WAL in memory:
memory: WAL from 100-200
We decide that it's time to materialize that to a snapshot file on
disk. We materialize full image of the relation as it was at LSN 100
to the snapshot file, and include all of the WAL. Since the relation
was initially empty, the "image" at the beginning of the range is empty
too.
So now you have one file on disk:
SNAPSHOT+WAL 100-200
It contains a full image of the relation at LSN 100 and all WAL
between 100-200. (It's actually stored as a serialized BTreeMap of
page versions, with the page images and WAL records all stored
together in the same BTreeMap. But for this story, that's not
important.)
We now receive more WAL updating the relation, up to LSN 300. We
decide it's time to materialize a new snapshot file, and we now have
two files:
SNAPSHOT+WAL 100-200
SNAPSHOT+WAL 200-300
Note that the latest "full snapshot" that we store on disk always lags
behind by one snapshot cycle. The first file contains a full image of
the relation at LSN 100, the second at LSN 200. When we have received
WAL up to LSN 300, we write a materialized image at LSN 200. That
seems a bit silly. In the design per your RFC, you would write
snapshots at LSNs 200 and 300, instead. That seems better.
# Third option (not implemented yet)
Store snapshot files like in the RFC, but also store per-relation
WAL files that contain WAL in a range of LSNs for that relation.
SNAPSHOT @100 WAL 100-200
. |
. |
. |
. |
SNAPSHOT @200 WAL 200-300
. |
. |
. |
. |
SNAPSHOT @300
.
.
IN-MEMORY 300-
This could be the best of both worlds. The snapshot files would be
independent of the PostgreSQL WAL format. When it's time to write
snapshot file @300, you write a full image of the relation at LSN 300,
and you write the WAL that you had accumulated between 200 and 300 to
a separate file. That way, you don't "lag behind" for one snapshot
cycle like in the current implementation. But you still have the WAL
for a particular relation readily available alongside the snapshot
files, and you don't need to track what snapshot LSNs exist
separately.
(If we wanted to minimize the number of files, you could include the
snapshot @300 and the WAL between 200 and 300 in the same file, but I
feel it's probably better to keep them separate)
# Further thoughts
There's no fundamental reason why the LSNs of the snapshot files and the
ranges of the WAL files would need to line up. So this would be possible
too:
SNAPSHOT @100 WAL 100-150
. |
. |
. WAL 150-250
. |
SNAPSHOT @200 |
. |
. WAL 250-400
. |
. |
SNAPSHOT @300 |
. |
. |
IN-MEMORY 300-
I'm not sure what the benefit of this would be. You could materialize
additional snapshot files in the middle of a range covered by a WAL
file, maybe? Might be useful to speed up access when you create a new
branch in the middle of an LSN range or if there's some other reason
to believe that a particular LSN is "interesting" and there will be
a lot of requests using it.

View File

@@ -0,0 +1,148 @@
# Snapshot-first storage architecture
Goals:
- Long-term storage of database pages.
- Easy snapshots; simple snapshot and branch management.
- Allow cloud-based snapshot/branch management.
- Allow cloud-centric branching; decouple branch state from running pageserver.
- Allow customer ownership of data via s3 permissions.
- Provide same or better performance for typical workloads, vs plain postgres.
Non-goals:
- Service database reads from s3 (reads should be serviced from the pageserver cache).
- Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot).
## Principle of operation
The database “lives in s3”. This means that all of the long term page storage is in s3, and the “live database”-- the version that lives in the pageserver-- is a set of “dirty pages” that haven't yet been written back to s3.
In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere.
The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not.
It's expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn't seem very useful right now.
Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling.
Objects in s3 are immutable snapshots, never to be modified once written (only deleted).
Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It's expected that most snapshots are incremental to keep storage costs low.
It's expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between 30 seconds and 15 minutes, depending on cost/performance balance.
No-longer needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots.
A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica).
WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.)
## Pageserver operation
To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed.
To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down.
It's assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch.
The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot.
The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 page is written (regardless of whether the LSN 200 snapshot has completed.)
If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches.
The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions.
The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow.
The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that's still not good enough, we could look at other options (e.g. redundant pageserver or an EBS page journal).
A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot.
## Cloud snapshot manager operation
Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent):
Create/delete/clone/rename a database
Create a new branch (possibly from a historical snapshot)
Start/stop the pageserver/safekeeper on a branch
List databases/branches/snapshots that are visible to this user account
Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries.
This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries.
## Snapshot names, deletion and concurrency
There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It's assumed that concurrent delete won't disrupt a read in flight, but it may be possible for some process to read B's header, and then discover on the next operation that B is gone.
For this reason, any attempted read should attempt a fallback procedure (list objects; search list for an equivalent object) if an attempted read fails. This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values. `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded.
## Branching
A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen:
- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch.
- Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object.
- The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages.
- The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data.
- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice.
Some of these steps could be combined into the pageserver, but that process would not be possible under all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same.
## Long-term file format
Snapshot files (and any other object stored in s3) must be readable by future software versions.
It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management.
Files should contain the following metadata, in addition to the set of pages:
- The version of the file format.
- A unique identifier for this branch (should be worldwide-unique and unchanging).
- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging).
- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges).
- The location of the predecessor branch snapshot, if different from this branch's location.
- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0.
- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle).
- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity.
A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database.
Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only.
## S3 semantics, and other kinds of storage
For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don't edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket.
Some users may want to use a local or network filesystem in place of s3. This isn't prohibited but it's not a priority, either.
Alternate implementations of s3 should be supported, including Google Cloud Storage.
Azure Blob Storage should be supported. We assume (without evidence) that it's semantically equivalent to s3 for this purpose.
The properties of s3 that we depend on are:
list objects
streaming read of entire object
read byte range from object
streaming write new object (may use multipart upload for better reliability)
delete object (that should not disrupt an already-started read).
Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they're supposed to. Incorrect, corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully.
## Notes
Possible simplifications, for a first draft implementation:
- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later.
- Don't worry about the details of the squashing process yet.
- Don't implement cloud metadata service; try to make everything work using basic s3 list-objects and reads.
- Don't implement rename, delete at first.
- Don't implement public/private, just use s3 permissions.
- Don't worry about sharing history yet-- each user has their own bucket and a full copy of all data.
- Don't worry about history that spans multiple buckets.
- Don't worry about s3 regions.
- Don't support user-writeable s3 buckets; users get only read-only access at most.
Open questions:
- How important is point-in-time recovery? When should we add this? How should it work?
- Should snapshot files use compression?
- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they're created.
- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy?
- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver?
- How can pageserver software upgrade be done with minimal downtime?

View File

@@ -0,0 +1,144 @@
# Storage details
Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details.
## Overview
![storage](images/storage.jpeg)
### MemStore
MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of PageIndex that holds references to WAL records or pages, PageStore that stores recently materialized pages, and WalStore that stores recently received WAL.
### PageIndex
PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset):
* PageStoreRef -- page offset in the PageStore
* LocalStoreRef -- snapshot_id and page offset inside of that snapshot
* WalStoreRef -- offset (and size optionally) of WalRecord in WalStore
PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots we actually can avoid storing references to the full snapshot and calculate page offsets based on relation sizes metadata in the full snapshot (assuming that full snapshot stores pages sorted by page number). However, I would suggest embracing page compression from the beginning and treat all pages as variable-sized.
We assume that PageIndex is a few orders of magnitude smaller than the addressed data, hence it should fit in memory. We also don't care about crash tolerance as we can rebuild it from snapshots metadata and WAL records from WalStore and/or Safekeeper.
### WalStore
WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add some fixed-size cache that would keep some amount of segments in memory.
For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would be also responsible for the recent WAL pushdown to S3 (and Safekeeper may just delete WAL that was confirmed as S3-durable by the page server).
### PageStore
PageStore is storage for recently materialized pages (or in other words cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it.
There are a few possible options for PageStore:
a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation.
b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During page overwrite, we would also need to change PageStoreRef back to WalStoreRef in PageIndex.
I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete.
With option b) we can also treat PageStore as an uncompleted incremental snapshot.
### LocalStore
LocalStore keeps the latest full snapshot and set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold.
## Granularity
By granularity, I mean a set of pages that goes into a certain full snapshot. Following things should be taken into account:
* can we shard big databases between page servers?
* how much time will we spend applying WAL to access certain pages with older LSN's?
* how many files do we create for a single database?
I can think of the following options here:
1. whole database goes to one full snapshot.
* +: we never create a lot of files for one database
* +: the approach is quite straightforward, moving data around is simple
* -: can not be sharded
* -: long recovery -- we always need to recover the whole database
2. table segment is the unit of snapshotting
* +: straightforward for sharding
* +: individual segment can be quickly recovered with sliced WAL
* -: full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend eternity in directory scans and the amount of metadata for sharding can be also quite big.
3. range-partitioned snapshots -- snapshot includes all pages between [BuffTagLo, BuffTagHi] mixing different relations, databases, and potentially clusters (albeit from one tenant only). When full snapshot outgrows a certain limit (could be also a few gigabytes) we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots.
* +: addresses all mentioned issues
* -: harder to implement
I think it is okay to start with table segments granularity and just check how we will perform in cases of lots of small tables and check whether there is any way besides option 3 to deal with it.
Both PageStore and WalStore should be "sharded" by this granularity level.
## Security
We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their S3 buckets credentials.
Also, S3 backups are usually encrypted by per-tenant private keys. I'm not sure in what threat model such encryption would improve something (taking into account per-tenant IAM keys), but it seems that everybody is doing that (both AMZN and YNDX). Most likely that comes as a requirement about "cold backups" by some certification procedure.
## Dynamics
### WAL stream handling
When a new WAL record is received we need to parse BufferTags in that record and insert them in PageIndex with WalStoreRef as a value.
### getPage queries
Look up the page in PageIndex. If the value is a page reference then just respond with that page. If the referenced value is WAL record then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page.
### Starting page server without local data
* build set of latest full snapshots and incremental snapshots on top of them
* load all their metadata into PageIndex
* Safekeeper should connect soon and we can ask for a WAL stream starting from the latest incremental snapshot
* for databases that are connected to us through the Safekeeper we can start loading the set of the latest snapshots or we can do that lazily based on getPage request (I'd better avoid doing that lazily for now without some access stats from the previous run and just transfer all data for active database from S3 to LocalStore).
### Starting page server with local data (aka restart or reboot)
* check that local snapshot files are consistent with S3
### Snapshot creation
Track size of future snapshots based on info in MemStore and when it exceeds some threshold (taking into account our granularity level) create a new incremental snapshot. Always emit incremental snapshots from MemStore.
To create a new snapshot we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of that pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream to avoid parsing WAL during snapshot creation.
Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots.
### S3 pushdown
When we have several full snapshots GC can push the old one with its increments to S3.
### Branch creation
Create a new timeline and replay sliced WAL up to a requested point. When the page is not in PageIndex ask the parent timeline about a page. Relation sizes are tricky.
## File formats
As far as I understand Bookfile/Aversion addresses versioning and serialization parts.
As for exact data that should go to snapshots I think it is the following for each snapshot:
* format version number
* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end of the file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward compatible to the file we can keep the version number.
* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile
* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records
* pages, one by one
* WAL records, one by one
It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if we store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))).
1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when relation_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small).
2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1Gb files with the size of metadata ~ 1Mb then the time to transfer this metadata would be comparable with access latency itself (which is about a half of a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor.
I think both of those optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines.
Also, there were some discussions about how to embed WAL in incremental snapshots. So far following ideas were mentioned:
1. snapshot lsn=200, includes WAL in range 200-300
2. snapshot lsn=200, includes WAL in range 100-200
3. data snapshots are separated from WAL snapshots
Both options 2 and 3 look good. I'm inclined towards option 3 as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep data snapshot until the next full snapshot, but we may push WAL snapshot to S3 just when they appeared if there are no replicas).

View File

@@ -0,0 +1,91 @@
# User-visible timeline history
The user can specify a retention policy. The retention policy is
presented to the user as a PITR period and snapshots. The PITR period
is the amount of recent history that needs to be retained, as minutes,
hours, or days. Within that period, you can create a branch or
snapshot at any point in time, open a compute node, and start running
queries. Internally, a PITR period is represented as a range of LSNs.
The user can also create snapshots. A snapshot is a point in time,
internally represented by an LSN. The user gives the snapshot a name.
The user can also specify an interval, at which the system creates
snapshots automatically. For example, create a snapshot every night at
2 AM. After some user-specified time, old automatically created
snapshots are removed.
Snapshot Snapshot
PITR "Monday" "Tuesday" PITR
----######----------+-------------+-------------######>
If there are multiple branches, you can specify different policies for
different branches.
The PITR period and user-visible snapshots together define the
retention policy.
NOTE: As presented here, this is probably overly flexible. In reality,
we want to keep the user interface simple. Only allow a PITR period at
the tip of a branch, for example. But that doesn't make much
difference to the internals.
# Retention policy behind the scenes
The retention policy consists of points (for snapshots) and ranges
(for PITR periods).
The system must be able to reconstruct any page within the retention
policy. Other page versions can be garbage collected away. We have a
lot of flexibility on when to perform the garbage collection and how
aggressive it is.
# Base images and WAL slices
The page versions are stored in two kinds of files: base images and
WAL slices. A base image contains a dump of all the pages of one
relation at a specific LSN. A WAL slice contains all the WAL in an LSN
range.
|
|
|
| --Base img @100 +
| |
| | WAL slice
| | 100-200
| |
| --Base img @200 +
| |
| | WAL slice
| | 200-300
| |
| +
|
V
To recover a page e.g. at LSN 150, you need the base image at LSN 100,
and the WAL slice 100-200.
All of this works at a per-relation or per-relation-segment basis. If
a relation is updated very frequently, we create base images and WAL
slices for it more quickly. For a relation that's updated
infrequently, we hold the recent WAL for that relation longer, and
only write it out when we need to release the disk space occupied by
the original WAL. (We need a backstop like that, because until all the
WAL/base images have been durably copied to S3, we must keep the
original WAL for that period somewhere, in the WAL service or in S3.)
# Branching
Internally, branch points are also "retention points", in addition to
the user-visible snapshots. If a branch has been forked off at LSN
100, we need to be able to reconstruct any page on the parent branch
at that LSN, because it is needed by the child branch. If a page is
modified in the child, we don't need to keep that in the parent
anymore, though.

View File

@@ -0,0 +1,38 @@
# Eviction
Write out in-memory layer to disk, into a delta layer.
- To release memory
- To make it possible to advance disk_consistent_lsn and allow the WAL
service to release some WAL.
- Triggered if we are short on memory
- Or if the oldest in-memory layer is so old that it's holding back
the WAL service from removing old WAL
# Materialization
Create a new image layer of a segment, by performing WAL redo
- To reduce the amount of WAL that needs to be replayed on a GetPage request.
- To allow garbage collection of old layers
- Triggered by distance to last full image of a page
# Coalescing
Replace N consecutive layers of a segment with one larger layer.
- To reduce the number of small files that needs to be uploaded to S3
# Bundling
Zip together multiple small files belonging to different segments.
- To reduce the number of small files that needs to be uploaded to S3
# Garbage collection
Remove a layer that's older than the GC horizon, and isn't needed anymore.

View File

@@ -0,0 +1,147 @@
# What
Currently, apart from WAL safekeeper persistently stores only two logical clock
counter (aka term) values, sourced from the same sequence. The first is bumped
whenever safekeeper gives vote to proposer (or acknowledges already elected one)
and e.g. prevents electing two proposers with the same term -- it is actually
called `term` in the code. The second, called `epoch`, reflects progress of log
receival and this might lag behind `term`; safekeeper switches to epoch `n` when
it has received all committed log records from all `< n` terms. This roughly
corresponds to what is proposed in
https://github.com/zenithdb/rfcs/pull/3/files
This makes our biggest difference from Raft. In Raft, every log record is
stamped with term in which it was generated; while we essentially store in
`epoch` only the term of the highest record on this safekeeper -- when we know
it -- because during recovery generally we don't, and `epoch` is bumped directly
to the term of the proposer who performs the recovery when it is finished. It is
not immediately obvious that this simplification is safe. I thought and I still
think it is; model checking confirmed that. However, some details now make me
believe it is better to keep full term switching history (which is equivalent to
knowing term of each record).
# Why
Without knowing full history (list of <term, LSN> pairs) of terms it is hard to
determine the exact divergence point, and if we don't perform truncation at that
point safety becomes questionable. Consider the following history, with
safekeepers A, B, C, D, E. n_m means record created by proposer in term n with
LSN m; (t=x, e=y) means safekeeper currently has term x and epoch y.
1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only
on A.
<pre>
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=1, e=1) 1.1
D(t=1, e=1) 1.1
E(t=1, e=1) 1.1
</pre>
2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD:
<pre>
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=2, e=2) 1.1 2.2 2.3
D(t=2, e=2) 1.1 2.2 2.3
E(t=2, e=1) 1.1
</pre>
3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D:
<pre>
A(t=1, e=1) 1.1 1.2 1.3 1.4
B(t=1, e=1) 1.1
C(t=3, e=2) 1.1 2.2 2.3
D(t=3, e=3) 1.1 2.2 2.3 3.4
E(t=3, e=1) 1.1
</pre>
Now, A gets back and P3 starts recovering it. How should it proceed? There are
two options.
## Don't try to find divergence point at all
...start sending WAL conservatively since the horizon (1.1), and truncate
obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is
reached, i.e. 2.3 transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes.
Then the following is possible:
4) P3 moves one record 2.2 to A.
<pre>
A(t=1, e=1) 1.1 <b>2.2</b> 1.3 1.4
B(t=1, e=1) 1.1 1.2
C(t=3, e=2) 1.1 2.2 2.3
D(t=3, e=3) 1.1 2.2 2.3 3.4
E(t=3, e=1) 1.1
</pre>
Now log of A is basically corrupted. Moreover, since ABE are all in epoch 1 and
A's log is the longest one, they can elect P4 who will commit such log.
Note that this particular history couldn't happen if we forbid to *create* new
records in term n until majority of safekeepers switch to it. It would force CDE
to switch to 2 before 2.2 is created, and A could never become donor while his
log is corrupted. Generally with this additional barrier I believe the algorithm
becomes safe, but
- I don't like this kind of artificial barrier;
- I also feel somewhat uncomfortable about even temporarily having intentionally
corrupted WAL;
- I'd still model check the idea.
## Find divergence point and truncate at it
Then step 4 would delete 1.3 1.4 on A, and we are ok. The question is, how do we
do that? Without term switching history we have to resort to sending again since
the horizon and memcmp'ing records, which is inefficient and ugly. Or we can
maintain full history and determine truncation point by comparing 'wrong' and
'right' histories -- much like pg_rewind does -- and perform truncation + start
streaming right there.
# Proposal
- Add term history as array of <term, LSN> pairs to safekeeper controlfile.
- Return it to proposer with VoteResponse so 1) proposer can tell it to other
nodes and 2) determine personal streaming starting point. However, since we
don't append WAL and update controlfile atomically, let's first always update
controlfile but send only the history of what we really have (up to highest
term in history where begin_lsn >= end of wal; this highest term replaces
current `epoch`). We also send end of wal as we do now to determine the donor.
- Create ProposerAnnouncement message which proposer sends before starting
streaming. It announces proposer as elected and
1) Truncates wrong part of WAL on safekeeper
(divergence point is already calculated at proposer, but can be
cross-verified here).
2) Communicates the 'right' history of its term (taken from donor). Seems
better to immediately put the history in the controlfile,
though safekeeper might not have full WAL for previous terms in it --
this way is simpler, and we can't update WAL and controlfile atomically anyway.
This also constitutes analogue of current epoch bump for those safekeepers
which don't need recovery, which is important for sync-safekeepers (bump
epoch without waiting records from new term).
- After ProposerAnnouncement proposer streams WAL since calculated starting
point -- only what is missing.
pros/cons:
+ (more) clear safety of WAL truncation -- we get very close to Raft
+ no unnecessary data sending (faster recovery for not-oldest-safekeepers, matters
only for 5+ nodes)
+ adds some observability at safekeepers
- complexity, but not that much
# Misc
- During model checking I did truncation on first locally non existent or
different record -- analogue of 'memcmp' variant described above.

95
docs/rfcs/README.md Normal file
View File

@@ -0,0 +1,95 @@
This directory contains Request for Comments documents, or RFCs, for
features or concepts that have been proposed. Alternative names:
technical design doc, ERD, one-pager
To make a new proposal, create a new text file in this directory and
open a Pull Request with it. That gives others a chance and a forum
to comment and discuss the design.
When a feature is implemented and the code changes are committed, also
include the corresponding RFC in this directory.
Some of the RFCs in this directory have been implemented in some form
or another, while others are on the roadmap, while still others are
just obsolete and forgotten about. So read them with a grain of salt,
but hopefully even the ones that don't reflect reality give useful
context information.
## What
We use Tech Design RFCs to summarize what we are planning to
implement in our system. These RFCs should be created for large or not
obvious technical tasks, e.g. changes of the architecture or bigger
tasks that could take over a week, changes that touch multiple
components or their interaction. RFCs should fit into a couple of
pages, but could be longer on occasion.
## Why
We're using RFCs to enable early review and collaboration, reduce
uncertainties, risk and save time during the implementation phase that
follows the Tech Design RFC.
Tech Design RFCs also aim to avoid bus factor and are an additional
measure to keep more peers up to date & familiar with our design and
architecture.
This is a crucial part for ensuring collaboration across timezones and
setting up for success a distributed team that works on complex
topics.
## Prior art
- Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md)
- React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md)
- Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE)
- Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process)
## How
RFC lifecycle:
- Should be submitted in a pull request with the full RFC text in a committed markdown file and a copy of the Summary and Motivation sections also included in the PR body.
- RFC should be published for review before most of the actual code is written. This isn't a strict rule, don't hesitate to experiment and build a POC in parallel with writing an RFC.
- Add labels to the PR in the same manner as you do Issues. Example TBD
- Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code.
- The Tech Design RFC should evolve based on the feedback received and further during the development phase if problems are discovered with the taken approach
- RFCs stop evolving once the consensus is found or the proposal is implemented and merged.
- RFCs are not intended as documentation that's kept up to date **after** the implementation is finished. Do not update the Tech Design RFC when merged functionality evolves later on. In such a situation a new RFC may be appropriate.
### RFC template
Note, a lot of the sections are marked as "if relevant". They are included in the template as a reminder and to help with inspiration.
```
# Name
Created on ..
Implemented on ..
## Summary
## Motivation
## Non Goals (if relevant)
## Impacted components (e.g. pageserver, safekeeper, console, etc)
## Proposed implementation
### Reliability, failure modes and corner cases (if relevant)
### Interaction/Sequence diagram (if relevant)
### Scalability (if relevant)
### Security implications (if relevant)
### Unresolved questions (if relevant)
## Alternative implementation (if relevant)
## Pros/cons of proposed approaches (if relevant)
## Definition of Done (if relevant)
```

Binary file not shown.

After

Width:  |  Height:  |  Size: 421 KiB

View File

@@ -2,7 +2,14 @@
use std::{env, path::Path, str::FromStr};
use tracing::*;
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};
use zenith_utils::{
auth::JwtAuth,
logging,
postgres_backend::AuthType,
tcp_listener,
zid::{ZTenantId, ZTimelineId},
GIT_VERSION,
};
use anyhow::{bail, Context, Result};
@@ -10,11 +17,10 @@ use clap::{App, Arg};
use daemonize::Daemonize;
use pageserver::{
branches,
config::{defaults::*, PageServerConf},
http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr,
thread_mgr::ThreadKind,
virtual_file, LOG_FILE_NAME,
timelines, virtual_file, LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend;
@@ -37,7 +43,7 @@ fn main() -> Result<()> {
Arg::new("init")
.long("init")
.takes_value(false)
.help("Initialize pageserver repo"),
.help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"),
)
.arg(
Arg::new("workdir")
@@ -53,6 +59,13 @@ fn main() -> Result<()> {
.help("Create tenant during init")
.requires("init"),
)
.arg(
Arg::new("initial-timeline-id")
.long("initial-timeline-id")
.takes_value(true)
.help("Use a specific timeline id during init and tenant creation")
.requires("create-tenant"),
)
// See `settings.md` for more details on the extra configuration parameters pageserver can process
.arg(
Arg::new("config-override")
@@ -72,7 +85,16 @@ fn main() -> Result<()> {
let cfg_file_path = workdir.join("pageserver.toml");
let init = arg_matches.is_present("init");
let create_tenant = arg_matches.value_of("create-tenant");
let create_tenant = arg_matches
.value_of("create-tenant")
.map(ZTenantId::from_str)
.transpose()
.context("Failed to parse tenant id from the arguments")?;
let initial_timeline_id = arg_matches
.value_of("initial-timeline-id")
.map(ZTimelineId::from_str)
.transpose()
.context("Failed to parse timeline id from the arguments")?;
// Set CWD to workdir for non-daemon modes
env::set_current_dir(&workdir).with_context(|| {
@@ -143,7 +165,8 @@ fn main() -> Result<()> {
// Create repo and exit if init was requested
if init {
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
timelines::init_pageserver(conf, create_tenant, initial_timeline_id)
.context("Failed to init pageserver")?;
// write the config file
std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
format!(

View File

@@ -1,433 +0,0 @@
//!
//! Branch management code
//!
// TODO: move all paths construction to conf impl
//
use anyhow::{bail, Context, Result};
use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize};
use std::{
fs,
path::Path,
process::{Command, Stdio},
str::FromStr,
sync::Arc,
};
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{crashsafe_dir, logging};
use crate::config::PageServerConf;
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::repository::{Repository, Timeline};
use crate::tenant_mgr;
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::RepositoryImpl;
use crate::{import_datadir, LOG_FILE_NAME};
/// Serializable description of a single branch, as assembled by `BranchInfo::from_path`.
#[derive(Serialize, Deserialize, Clone)]
pub struct BranchInfo {
/// Branch name; `from_path` takes it from the final component of the branch file path.
pub name: String,
/// Timeline id parsed from the contents of the branch file.
/// (De)serialized through the `hex` serde adapter rather than the type's default representation.
#[serde(with = "hex")]
pub timeline_id: ZTimelineId,
/// Last record LSN reported by the timeline at the moment the info was built.
pub latest_valid_lsn: Lsn,
/// String form of the ancestor timeline id; `None` when the timeline has no ancestor.
pub ancestor_id: Option<String>,
/// String form of the ancestor LSN; `None` when the timeline has no ancestor.
pub ancestor_lsn: Option<String>,
/// Logical size as returned by the timeline's `get_current_logical_size()`.
pub current_logical_size: usize,
/// Logical size computed non-incrementally; only filled in when the caller of
/// `from_path` explicitly asks for it, `None` otherwise.
pub current_logical_size_non_incremental: Option<usize>,
}
impl BranchInfo {
pub fn from_path<T: AsRef<Path>>(
path: T,
tenantid: ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Self> {
let path = path.as_ref();
let name = path.file_name().unwrap().to_string_lossy().to_string();
let timeline_id = std::fs::read_to_string(path)
.with_context(|| {
format!(
"Failed to read branch file contents at path '{}'",
path.display()
)
})?
.parse::<ZTimelineId>()?;
let timeline = match tenant_mgr::get_timeline_for_tenant(tenantid, timeline_id) {
Ok(timeline) => timeline,
Err(err) => {
// FIXME: this was:
// bail!("Timeline {} is remote, no branches to display", timeline_id)
//
// but we cannot distinguish that from other errors now. Have
// get_timeline_for_tenant() return a more specific error
return Err(err);
}
};
// we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
let (ancestor_id, ancestor_lsn) = match timeline.tline.get_ancestor_timeline_id() {
Some(ancestor_id) => (
Some(ancestor_id.to_string()),
Some(timeline.tline.get_ancestor_lsn().to_string()),
),
None => (None, None),
};
// non incremental size calculation can be heavy, so let it be optional
// needed for tests to check size calculation
let current_logical_size_non_incremental = include_non_incremental_logical_size
.then(|| {
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
})
.transpose()?;
let current_logical_size = timeline.get_current_logical_size();
Ok(BranchInfo {
name,
timeline_id,
latest_valid_lsn: timeline.get_last_record_lsn(),
ancestor_id,
ancestor_lsn,
current_logical_size,
current_logical_size_non_incremental,
})
}
}
/// A timeline id paired with an LSN.
// NOTE(review): presumably identifies a position within a timeline's history -- confirm against callers.
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
/// Id of the timeline.
pub timelineid: ZTimelineId,
/// LSN associated with `timelineid`.
pub lsn: Lsn,
}
/// Initializes the pageserver working directory.
///
/// Sets up logging, creates a repository for the tenant named by `create_tenant` (when one is
/// given; the string is parsed as a `ZTenantId`), and makes sure the tenants directory exists.
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
    // Initialize the logger with `true` as the daemonize parameter: otherwise the zenith cli
    // output gets polluted with a few pages of info messages.
    let _log_file = logging::init(LOG_FILE_NAME, true)?;

    // A dummy WAL redo manager is used on purpose: we do not want to spawn the WAL redo
    // process during repository initialization.
    //
    // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
    // initdb in the background, and it kept running even after the "zenith init" had exited.
    // In tests, we started the page server immediately after that, so that initdb was still
    // running in the background, and we failed to run initdb again in the same directory. This
    // has been solved for the rapid init+start case now, but the general race condition remains
    // if you restart the server quickly. The WAL redo manager doesn't use a separate thread
    // anymore, but I think that could still happen.
    let redo_mgr_stub = Arc::new(crate::walredo::DummyRedoManager {});

    // Parse the tenant id (if any) up front; a malformed id aborts the init.
    let requested_tenant = create_tenant.map(ZTenantId::from_str).transpose()?;
    if let Some(tenantid) = requested_tenant {
        println!("initializing tenantid {}", tenantid);
        create_repo(conf, tenantid, redo_mgr_stub).context("failed to create repo")?;
    }

    let tenants_dir = conf.tenants_path();
    crashsafe_dir::create_dir_all(tenants_dir)?;

    println!("pageserver init succeeded");
    Ok(())
}
/// Creates the on-disk directory layout and the repository object for a new tenant.
///
/// Fails if a directory for `tenantid` already exists. Otherwise the tenant directory and its
/// timelines, branches and tags subdirectories are created, a fresh timeline id is generated
/// together with its directory, and that timeline is passed to `bootstrap_timeline`.
pub fn create_repo(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Arc<RepositoryImpl>> {
    let tenant_dir = conf.tenant_path(&tenantid);
    if tenant_dir.exists() {
        bail!("repo for {} already exists", tenantid)
    }

    // The top-level dir may already exist if we are creating the repo through the CLI,
    // hence `create_dir_all` here.
    crashsafe_dir::create_dir_all(&tenant_dir)
        .with_context(|| format!("could not create directory {}", tenant_dir.display()))?;

    let timelines_dir = conf.timelines_path(&tenantid);
    crashsafe_dir::create_dir(timelines_dir)?;
    let branches_dir = conf.branches_path(&tenantid);
    crashsafe_dir::create_dir_all(branches_dir)?;
    let tags_dir = conf.tags_path(&tenantid);
    crashsafe_dir::create_dir_all(tags_dir)?;
    info!("created directory structure in {}", tenant_dir.display());

    // Every new repo starts out with one freshly generated timeline and its directory.
    let new_timeline_id = ZTimelineId::generate();
    let new_timeline_dir = conf.timeline_path(&new_timeline_id, &tenantid);
    crashsafe_dir::create_dir(&new_timeline_dir)?;

    let upload_enabled = conf.remote_storage_config.is_some();
    let repository = crate::layered_repository::LayeredRepository::new(
        conf,
        wal_redo_manager,
        tenantid,
        upload_enabled,
    );

    // Load data into pageserver
    // TODO To implement zenith import we need to
    // move data loading out of create_repo()
    bootstrap_timeline(conf, tenantid, new_timeline_id, &repository)?;

    Ok(Arc::new(repository))
}
/// Returns the checkpoint LSN recorded in the control file of the data directory at `path`.
///
/// Reads `<path>/global/pg_control`, decodes it and wraps its `checkPoint` field in an `Lsn`.
fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
    let raw_controlfile = fs::read(path.join("global").join("pg_control"))?;
    let checkpoint_lsn = ControlFileData::decode(&raw_controlfile)?.checkPoint;
    Ok(Lsn(checkpoint_lsn))
}
/// Creates a cluster in the `initdbpath` directory by running `initdb`, to obtain the
/// bootstrap data used for timeline initialization.
///
/// `initdb` is taken from the configured Postgres bin directory and is run with a cleared
/// environment, except for the library search paths pointing at the configured lib
/// directory. Its stdout is discarded; stderr is captured and included in the error when
/// the process exits unsuccessfully.
///
/// # Errors
/// Fails if `initdb` cannot be executed or exits with a non-success status.
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
    info!("running initdb in {}... ", initdbpath.display());

    let initdb_path = conf.pg_bin_dir().join("initdb");
    let pg_lib_dir = conf.pg_lib_dir();
    let initdb_output = Command::new(initdb_path)
        // Paths are handed over as OS strings, so a non-UTF-8 path does not cause a panic.
        .arg("-D")
        .arg(initdbpath)
        .args(&["-U", &conf.superuser])
        .args(&["-E", "utf8"])
        .arg("--no-instructions")
        // This is only used for a temporary installation that is deleted shortly after,
        // so no need to fsync it
        .arg("--no-sync")
        .env_clear()
        .env("LD_LIBRARY_PATH", &pg_lib_dir)
        .env("DYLD_LIBRARY_PATH", &pg_lib_dir)
        .stdout(Stdio::null())
        .output()
        .context("failed to execute initdb")?;

    if !initdb_output.status.success() {
        anyhow::bail!(
            "initdb failed: '{}'",
            String::from_utf8_lossy(&initdb_output.stderr)
        );
    }

    Ok(())
}
/// Bootstraps timeline `tli` of tenant `tenantid` from a freshly initialized cluster.
///
/// - runs initdb in the tenant's `tmp` directory to get bootstrap data
/// - imports that data directory into a new timeline and forces a checkpoint
/// - records the timeline as the `main` branch
/// - removes the temporary directory once everything has succeeded
fn bootstrap_timeline<R: Repository>(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    tli: ZTimelineId,
    repo: &R,
) -> Result<()> {
    let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();

    // Init a temporary cluster to get bootstrap data
    let pgdata_path = conf.tenant_path(&tenantid).join("tmp");
    run_initdb(conf, &pgdata_path)?;
    let initdb_lsn = get_lsn_from_controlfile(&pgdata_path)?.align();

    // Import the contents of the data directory at the initial checkpoint
    // LSN, and any WAL after that.
    // Initdb lsn will be equal to last_record_lsn which will be set after import.
    // Because we know it upfront, avoid having an option or dummy zero value by passing it
    // to create_empty_timeline.
    let empty_timeline = repo.create_empty_timeline(tli, initdb_lsn)?;
    let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(empty_timeline);
    import_datadir::import_timeline_from_postgres_datadir(
        &pgdata_path,
        &mut page_tline,
        initdb_lsn,
    )?;
    page_tline.tline.checkpoint(CheckpointConfig::Forced)?;

    let last_record_lsn = page_tline.tline.get_last_record_lsn();
    println!(
        "created initial timeline {} timeline.lsn {}",
        tli, last_record_lsn
    );

    fs::write(conf.branch_path("main", &tenantid), tli.to_string())?;
    println!("created main branch");

    // Remove temp dir. We don't need it anymore
    fs::remove_dir_all(pgdata_path)?;

    Ok(())
}
pub(crate) fn get_branches(
conf: &PageServerConf,
tenantid: &ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<BranchInfo>> {
// Each branch has a corresponding record (text file) in the refs/branches
// with timeline_id.
let branches_dir = conf.branches_path(tenantid);
std::fs::read_dir(&branches_dir)
.with_context(|| {
format!(
"Found no branches directory '{}' for tenant {}",
branches_dir.display(),
tenantid
)
})?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res.with_context(|| {
format!(
"Failed to list branches directory '{}' content for tenant {}",
branches_dir.display(),
tenantid
)
})?;
BranchInfo::from_path(
dir_entry.path(),
*tenantid,
include_non_incremental_logical_size,
)
})
.collect()
}
/// Creates a new branch named `branchname` for tenant `tenantid`, starting at the point in
/// time described by `startpoint_str` (see `parse_point_in_time` for the accepted forms).
///
/// A start point without an explicit LSN branches at the end of WAL of the parent timeline.
/// The new timeline gets a freshly generated id, and the branch name is recorded in a file
/// holding that id.
///
/// # Errors
/// Fails if the branch name is already taken, the start point cannot be resolved, the parent
/// timeline is not present locally, or the aligned start LSN lies before the parent's own
/// ancestor LSN.
pub(crate) fn create_branch(
    conf: &PageServerConf,
    branchname: &str,
    startpoint_str: &str,
    tenantid: &ZTenantId,
) -> Result<BranchInfo> {
    let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;

    let branch_file = conf.branch_path(branchname, tenantid);
    if branch_file.exists() {
        anyhow::bail!("branch {} already exists", branchname);
    }

    let requested = parse_point_in_time(conf, startpoint_str, tenantid)?;
    let parent = repo
        .get_timeline(requested.timelineid)?
        .local_timeline()
        .context("Cannot branch off the timeline that's not present locally")?;

    let unaligned_lsn = if requested.lsn == Lsn(0) {
        // Find end of WAL on the old timeline
        let end_of_wal = parent.get_last_record_lsn();
        info!("branching at end of WAL: {}", end_of_wal);
        end_of_wal
    } else {
        // Wait for the WAL to arrive and be processed on the parent branch up
        // to the requested branch point. The repository code itself doesn't
        // require it, but if we start to receive WAL on the new timeline,
        // decoding the new WAL might need to look up previous pages, relation
        // sizes etc. and that would get confused if the previous page versions
        // are not in the repository yet.
        parent.wait_lsn(requested.lsn)?;
        requested.lsn
    };
    let branch_lsn = unaligned_lsn.align();

    let parent_ancestor_lsn = parent.get_ancestor_lsn();
    if parent_ancestor_lsn > branch_lsn {
        // can we safely just branch from the ancestor instead?
        anyhow::bail!(
            "invalid startpoint {} for the branch {}: less than timeline ancestor lsn {:?}",
            branch_lsn,
            branchname,
            parent_ancestor_lsn
        );
    }

    // Forward entire timeline creation routine to repository
    // backend, so it can do all needed initialization
    let new_timeline_id = ZTimelineId::generate();
    repo.branch_timeline(requested.timelineid, new_timeline_id, branch_lsn)?;

    // Remember the human-readable branch name for the new timeline.
    // FIXME: there's a race condition, if you create a branch with the same
    // name concurrently.
    fs::write(branch_file, new_timeline_id.to_string())?;

    Ok(BranchInfo {
        name: branchname.to_string(),
        timeline_id: new_timeline_id,
        latest_valid_lsn: branch_lsn,
        ancestor_id: Some(requested.timelineid.to_string()),
        ancestor_lsn: Some(branch_lsn.to_string()),
        current_logical_size: 0,
        current_logical_size_non_incremental: Some(0),
    })
}
/// Parses a user-given string that represents a point-in-time.
///
/// We support multiple variants:
///
/// Raw timeline id in hex, meaning the end of that timeline:
///    bc62e7d612d0e6fe8f99a6dd2f281f9d
///
/// A specific LSN on a timeline:
///    bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
///
/// Same, with a human-friendly branch name:
///    main
///    main@2/15D3DD8
///
/// Human-friendly tag name:
///    mytag
///
/// The name is resolved in this order: tag (only when no LSN is given), branch, timeline id.
/// When no LSN is given, the returned `lsn` is `Lsn(0)`.
///
/// # Errors
/// Fails if the part after the first `@` is not a valid LSN (anything after a second `@` is
/// no longer silently ignored), if a tag or branch file cannot be read, or if the name
/// matches no tag, branch or existing timeline.
fn parse_point_in_time(
    conf: &PageServerConf,
    s: &str,
    tenantid: &ZTenantId,
) -> Result<PointInTime> {
    // Split at the first '@' only, so that the whole remainder has to be a valid LSN.
    let (name, lsn_str) = match s.split_once('@') {
        Some((name, lsn_str)) => (name, Some(lsn_str)),
        None => (s, None),
    };
    let lsn = lsn_str
        .map(Lsn::from_str)
        .transpose()
        .context("invalid LSN in point-in-time specification")?;

    // Check if it's a tag
    if lsn.is_none() {
        let tagpath = conf.tag_path(name, tenantid);
        if tagpath.exists() {
            // A tag file holds another point-in-time specification; resolve it recursively.
            let pointstr = fs::read_to_string(tagpath)?;
            return parse_point_in_time(conf, &pointstr, tenantid);
        }
    }

    // Lsn(0) stands for "no explicit LSN given".
    let lsn = lsn.unwrap_or(Lsn(0));

    // Check if it's a branch
    // Check if it's branch @ LSN
    let branchpath = conf.branch_path(name, tenantid);
    if branchpath.exists() {
        let pointstr = fs::read_to_string(branchpath)?;
        let mut result = parse_point_in_time(conf, &pointstr, tenantid)?;
        // The LSN from this specification overrides whatever the branch file resolved to.
        result.lsn = lsn;
        return Ok(result);
    }

    // Check if it's a timelineid
    // Check if it's timelineid @ LSN
    if let Ok(timelineid) = ZTimelineId::from_str(name) {
        if conf.timeline_path(&timelineid, tenantid).exists() {
            return Ok(PointInTime { timelineid, lsn });
        }
    }

    bail!("could not parse point-in-time {}", s);
}

View File

@@ -392,22 +392,6 @@ impl PageServerConf {
self.tenants_path().join(tenantid.to_string())
}
/// Returns the directory holding the tenant's tags: `<tenant_path>/refs/tags`.
pub fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join("refs").join("tags")
}
/// Returns the path of the entry named `tag_name` inside the tenant's tags directory.
pub fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf {
self.tags_path(tenantid).join(tag_name)
}
/// Returns the directory holding the tenant's branches: `<tenant_path>/refs/branches`.
pub fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join("refs").join("branches")
}
/// Returns the path of the entry named `branch_name` inside the tenant's branches directory.
pub fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf {
self.branches_path(tenantid).join(branch_name)
}
/// Returns the tenant's timelines directory: `<tenant_path>/<TIMELINES_SEGMENT_NAME>`.
pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
}
@@ -416,10 +400,6 @@ impl PageServerConf {
self.timelines_path(tenantid).join(timelineid.to_string())
}
/// Returns the path of the `ancestor` entry inside the given timeline's directory.
pub fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
self.timeline_path(timelineid, tenantid).join("ancestor")
}
//
// Postgres distribution paths
//

View File

@@ -1,20 +1,121 @@
use crate::timelines::TimelineInfo;
use anyhow::{anyhow, bail, Context};
use serde::{Deserialize, Serialize};
use crate::ZTenantId;
use zenith_utils::zid::ZNodeId;
use zenith_utils::{
lsn::Lsn,
zid::{HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTimelineId},
};
#[derive(Serialize, Deserialize)]
pub struct BranchCreateRequest {
#[serde(with = "hex")]
pub tenant_id: ZTenantId,
pub name: String,
pub start_point: String,
pub struct TimelineCreateRequest {
pub new_timeline_id: Option<HexZTimelineId>,
pub ancestor_timeline_id: Option<HexZTimelineId>,
pub ancestor_start_lsn: Option<Lsn>,
}
/// Request body of the tenant creation endpoint.
#[derive(Serialize, Deserialize)]
pub struct TenantCreateRequest {
/// Id to give the new tenant; optional, `None` when the caller does not choose one.
pub new_tenant_id: Option<HexZTenantId>,
}
#[derive(Serialize, Deserialize)]
pub struct TimelineInfoResponse {
pub kind: String,
#[serde(with = "hex")]
pub tenant_id: ZTenantId,
timeline_id: ZTimelineId,
#[serde(with = "hex")]
tenant_id: ZTenantId,
disk_consistent_lsn: String,
last_record_lsn: Option<String>,
prev_record_lsn: Option<String>,
ancestor_timeline_id: Option<HexZTimelineId>,
ancestor_lsn: Option<String>,
current_logical_size: Option<usize>,
current_logical_size_non_incremental: Option<usize>,
}
/// Flattens a `TimelineInfo` into its wire representation.
///
/// The variant is recorded in `kind` ("Local" or "Remote"), LSNs are rendered as strings, and
/// every field that only exists for local timelines is `None` for a remote one.
impl From<TimelineInfo> for TimelineInfoResponse {
    fn from(other: TimelineInfo) -> Self {
        match other {
            TimelineInfo::Local {
                timeline_id,
                tenant_id,
                last_record_lsn,
                prev_record_lsn,
                ancestor_timeline_id,
                ancestor_lsn,
                disk_consistent_lsn,
                current_logical_size,
                current_logical_size_non_incremental,
            } => {
                let ancestor_timeline_id = ancestor_timeline_id.map(HexZTimelineId::from);
                let ancestor_lsn = ancestor_lsn.as_ref().map(ToString::to_string);
                Self {
                    kind: String::from("Local"),
                    timeline_id,
                    tenant_id,
                    disk_consistent_lsn: disk_consistent_lsn.to_string(),
                    last_record_lsn: Some(last_record_lsn.to_string()),
                    prev_record_lsn: Some(prev_record_lsn.to_string()),
                    ancestor_timeline_id,
                    ancestor_lsn,
                    current_logical_size: Some(current_logical_size),
                    current_logical_size_non_incremental,
                }
            }
            TimelineInfo::Remote {
                timeline_id,
                tenant_id,
                disk_consistent_lsn,
            } => Self {
                kind: String::from("Remote"),
                timeline_id,
                tenant_id,
                disk_consistent_lsn: disk_consistent_lsn.to_string(),
                last_record_lsn: None,
                prev_record_lsn: None,
                ancestor_timeline_id: None,
                ancestor_lsn: None,
                current_logical_size: None,
                current_logical_size_non_incremental: None,
            },
        }
    }
}
/// Rebuilds a `TimelineInfo` from its wire representation.
///
/// `kind` selects the variant ("Local" or "Remote"). LSN strings are parsed back into `Lsn`
/// values; a "Local" response must carry `last_record_lsn`, `prev_record_lsn` and
/// `current_logical_size`.
///
/// # Errors
/// Fails if any LSN string does not parse, if a field required for a local timeline is
/// missing, or if `kind` is neither "Local" nor "Remote".
impl TryFrom<TimelineInfoResponse> for TimelineInfo {
    type Error = anyhow::Error;

    fn try_from(other: TimelineInfoResponse) -> anyhow::Result<Self> {
        let parse_lsn_hex_string = |lsn_string: String| {
            lsn_string
                .parse::<Lsn>()
                .with_context(|| format!("Failed to parse Lsn as hex string from '{}'", lsn_string))
        };

        // Present for both kinds, so it is parsed before looking at `kind`.
        let disk_consistent_lsn = parse_lsn_hex_string(other.disk_consistent_lsn)?;
        Ok(match other.kind.as_str() {
            "Local" => TimelineInfo::Local {
                timeline_id: other.timeline_id,
                tenant_id: other.tenant_id,
                // Errors are built lazily: nothing is allocated when the field is present.
                last_record_lsn: other
                    .last_record_lsn
                    .ok_or_else(|| anyhow!("Local timeline should have last_record_lsn"))
                    .and_then(parse_lsn_hex_string)?,
                prev_record_lsn: other
                    .prev_record_lsn
                    .ok_or_else(|| anyhow!("Local timeline should have prev_record_lsn"))
                    .and_then(parse_lsn_hex_string)?,
                ancestor_timeline_id: other.ancestor_timeline_id.map(ZTimelineId::from),
                ancestor_lsn: other.ancestor_lsn.map(parse_lsn_hex_string).transpose()?,
                disk_consistent_lsn,
                current_logical_size: other
                    .current_logical_size
                    .ok_or_else(|| anyhow!("Local timeline should have current_logical_size"))?,
                current_logical_size_non_incremental: other.current_logical_size_non_incremental,
            },
            "Remote" => TimelineInfo::Remote {
                timeline_id: other.timeline_id,
                tenant_id: other.tenant_id,
                disk_consistent_lsn,
            },
            unknown => bail!("Unknown timeline kind: {}", unknown),
        })
    }
}
#[derive(Serialize)]

View File

@@ -22,7 +22,7 @@ paths:
properties:
id:
type: integer
/v1/timeline/{tenant_id}:
/v1/tenant/{tenant_id}/timeline:
parameters:
- name: tenant_id
in: path
@@ -30,19 +30,22 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: List tenant timelines
description: Get timelines for tenant
responses:
"200":
description: array of brief timeline descriptions
description: TimelineInfo
content:
application/json:
schema:
type: array
items:
# currently, just a timeline id string, but when remote index gets to be accessed
# remote/local timeline field would be added at least
type: string
$ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path
content:
@@ -67,7 +70,7 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/timeline/{tenant_id}/{timeline_id}:
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
parameters:
- name: tenant_id
in: path
@@ -81,8 +84,13 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get timeline info for tenant's remote timeline
description: Get info about the timeline
responses:
"200":
description: TimelineInfo
@@ -91,7 +99,7 @@ paths:
schema:
$ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path or no branch name
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
@@ -114,7 +122,7 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/branch/{tenant_id}:
/v1/tenant/{tenant_id}/timeline/:
parameters:
- name: tenant_id
in: path
@@ -122,126 +130,33 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
"200":
description: BranchInfo
content:
application/json:
schema:
type: array
items:
$ref: "#/components/schemas/BranchInfo"
"400":
description: Error when no tenant id found in path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/branch/{tenant_id}/{branch_name}:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: branch_name
in: path
required: true
schema:
type: string
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
"200":
description: BranchInfo
content:
application/json:
schema:
$ref: "#/components/schemas/BranchInfo"
"400":
description: Error when no tenant id found in path or no branch name
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/branch/:
post:
description: Create branch
description: |
Create a timeline. Returns new timeline id on success.\
If no new timeline id is specified in the request body, one is generated. It's an error to recreate the same timeline.
requestBody:
content:
application/json:
schema:
type: object
required:
- "tenant_id"
- "name"
- "start_point"
properties:
tenant_id:
new_timeline_id:
type: string
format: hex
name:
ancestor_timeline_id:
type: string
start_point:
format: hex
ancestor_start_lsn:
type: string
responses:
"201":
description: BranchInfo
description: TimelineInfo
content:
application/json:
schema:
$ref: "#/components/schemas/BranchInfo"
$ref: "#/components/schemas/TimelineInfo"
"400":
description: Malformed branch create request
description: Malformed timeline create request
content:
application/json:
schema:
@@ -258,6 +173,12 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409":
description: Timeline already exists, creation skipped
content:
application/json:
schema:
$ref: "#/components/schemas/AlreadyExistsError"
"500":
description: Generic operation error
content:
@@ -295,27 +216,26 @@ paths:
schema:
$ref: "#/components/schemas/Error"
post:
description: Create tenant
description: |
Create a tenant. Returns new tenant id on success.\
If no new tenant id is specified in the request body, one is generated. It's an error to recreate the same tenant.
requestBody:
content:
application/json:
schema:
type: object
required:
- "tenant_id"
properties:
tenant_id:
new_tenant_id:
type: string
format: hex
responses:
"201":
description: CREATED
description: New tenant created successfully
content:
application/json:
schema:
type: array
items:
type: string
type: string
format: hex
"400":
description: Malformed tenant create request
content:
@@ -334,6 +254,12 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409":
description: Tenant already exists, creation skipped
content:
application/json:
schema:
$ref: "#/components/schemas/AlreadyExistsError"
"500":
description: Generic operation error
content:
@@ -358,38 +284,11 @@ components:
type: string
state:
type: string
BranchInfo:
type: object
required:
- name
- timeline_id
- latest_valid_lsn
- current_logical_size
properties:
name:
type: string
timeline_id:
type: string
format: hex
ancestor_id:
type: string
format: hex
ancestor_lsn:
type: string
current_logical_size:
type: integer
current_logical_size_non_incremental:
type: integer
latest_valid_lsn:
type: integer
TimelineInfo:
type: object
required:
- timeline_id
- tenant_id
- last_record_lsn
- prev_record_lsn
- start_lsn
- disk_consistent_lsn
properties:
timeline_id:
@@ -398,19 +297,21 @@ components:
tenant_id:
type: string
format: hex
ancestor_timeline_id:
type: string
format: hex
last_record_lsn:
type: string
prev_record_lsn:
type: string
start_lsn:
ancestor_timeline_id:
type: string
format: hex
ancestor_lsn:
type: string
disk_consistent_lsn:
type: string
timeline_state:
type: string
current_logical_size:
type: integer
current_logical_size_non_incremental:
type: integer
Error:
type: object
@@ -426,6 +327,13 @@ components:
properties:
msg:
type: string
AlreadyExistsError:
type: object
required:
- msg
properties:
msg:
type: string
ForbiddenError:
type: object
required:

View File

@@ -1,9 +1,8 @@
use std::sync::Arc;
use anyhow::{Context, Result};
use anyhow::Result;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use serde::Serialize;
use tracing::*;
use zenith_utils::auth::JwtAuth;
use zenith_utils::http::endpoint::attach_openapi_ui;
@@ -14,22 +13,18 @@ use zenith_utils::http::{
endpoint,
error::HttpErrorBody,
json::{json_request, json_response},
request::get_request_param,
request::parse_request_param,
};
use zenith_utils::http::{RequestExt, RouterBuilder};
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::HexZTimelineId;
use zenith_utils::zid::ZTimelineId;
use zenith_utils::zid::{HexZTenantId, ZTimelineId};
use super::models::BranchCreateRequest;
use super::models::StatusResponse;
use super::models::TenantCreateRequest;
use crate::branches::BranchInfo;
use super::models::{
StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse,
};
use crate::repository::Repository;
use crate::repository::RepositoryTimeline;
use crate::repository::TimelineSyncState;
use crate::repository::{Repository, Timeline};
use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId};
use crate::timelines::TimelineInfo;
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
#[derive(Debug)]
struct State {
@@ -74,23 +69,45 @@ async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiErr
)?)
}
async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let request_data: BranchCreateRequest = json_request(&mut request).await?;
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
check_permission(&request, Some(request_data.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
branches::create_branch(
let new_timeline_info = tokio::task::spawn_blocking(move || {
let _enter = info_span!("/timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered();
timelines::create_timeline(
get_config(&request),
&request_data.name,
&request_data.start_point,
&request_data.tenant_id,
tenant_id,
request_data.new_timeline_id.map(ZTimelineId::from),
request_data.ancestor_timeline_id.map(ZTimelineId::from),
request_data.ancestor_start_lsn,
)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::CREATED, response_data)?)
Ok(match new_timeline_info {
Some(info) => json_response(StatusCode::CREATED, TimelineInfoResponse::from(info))?,
None => json_response(StatusCode::CONFLICT, ())?,
})
}
/// Handles the timeline list request for one tenant.
///
/// Reads `tenant_id` from the request path, checks the caller's permission for that tenant,
/// fetches the tenant's timelines on a blocking thread and responds with them as a JSON
/// array of `TimelineInfoResponse`.
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);

    let timelines = tokio::task::spawn_blocking(move || {
        let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
        crate::timelines::get_timelines(tenant_id, include_non_incremental_logical_size)
    })
    .await
    .map_err(ApiError::from_err)??;

    let response_data: Vec<TimelineInfoResponse> = timelines
        .into_iter()
        .map(TimelineInfoResponse::from)
        .collect();
    Ok(json_response(StatusCode::OK, response_data)?)
}
// Gate non incremental logical size calculation behind a flag
@@ -108,111 +125,6 @@ fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
.unwrap_or(false)
}
/// Handles the branch list request for one tenant.
///
/// Reads `tenant_id` from the request path, checks the caller's permission for that tenant,
/// collects the tenant's branches on a blocking thread and responds with them as JSON.
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
    check_permission(&request, Some(tenantid))?;

    // The request is moved into the blocking task, which reads the config from it.
    let list_task = tokio::task::spawn_blocking(move || {
        let _enter = info_span!("branch_list", tenant = %tenantid).entered();
        let conf = get_config(&request);
        crate::branches::get_branches(conf, &tenantid, include_non_incremental_logical_size)
    });
    let branches = list_task.await.map_err(ApiError::from_err)??;

    Ok(json_response(StatusCode::OK, branches)?)
}
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
let conf = get_state(&request).conf;
let path = conf.branch_path(&branch_name, &tenantid);
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
BranchInfo::from_path(path, tenantid, include_non_incremental_logical_size)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
/// Handles the timeline list request by scanning the tenant's timelines directory.
///
/// Every subdirectory whose name parses as a `ZTimelineId` is reported (as its string form)
/// in the JSON array of the response. Subdirectories with unparsable names are logged as
/// errors and skipped; entries that are not directories are ignored.
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

    let timelines_dir = get_state(&request).conf.timelines_path(&tenant_id);
    // Shared error context for both opening the directory and advancing through it.
    let listing_failed = || {
        format!(
            "Failed to list timelines dir '{}' contents",
            timelines_dir.display()
        )
    };

    let mut dir_entries = tokio::fs::read_dir(&timelines_dir)
        .await
        .with_context(listing_failed)?;

    let mut local_timelines = Vec::new();
    while let Some(entry) = dir_entries.next_entry().await.with_context(listing_failed)? {
        let entry_path = entry.path();
        let entry_type = entry.file_type().await.with_context(|| {
            format!(
                "Failed to get file type of timeline dirs' entry '{}'",
                entry_path.display()
            )
        })?;
        if !entry_type.is_dir() {
            continue;
        }

        let parsed_id = entry.file_name().to_string_lossy().parse::<ZTimelineId>();
        match parsed_id {
            Ok(timeline_id) => local_timelines.push(timeline_id.to_string()),
            Err(e) => error!(
                "Failed to get parse timeline id from timeline dirs' entry '{}': {}",
                entry_path.display(),
                e
            ),
        }
    }

    Ok(json_response(StatusCode::OK, local_timelines)?)
}
/// Timeline description returned by the timeline detail endpoint.
///
/// Serialized with an extra `type` field naming the variant (`#[serde(tag = "type")]`).
#[derive(Debug, Serialize)]
#[serde(tag = "type")]
enum TimelineInfo {
/// A timeline that is present locally.
Local {
// Serialized via the `hex` helper module.
#[serde(with = "hex")]
timeline_id: ZTimelineId,
// Serialized via the `hex` helper module.
#[serde(with = "hex")]
tenant_id: ZTenantId,
// Id of the timeline this one has as its ancestor, if any.
ancestor_timeline_id: Option<HexZTimelineId>,
last_record_lsn: Lsn,
prev_record_lsn: Lsn,
disk_consistent_lsn: Lsn,
timeline_state: Option<TimelineSyncState>,
},
/// A timeline that is not present locally; only its ids are reported.
Remote {
// Serialized via the `hex` helper module.
#[serde(with = "hex")]
timeline_id: ZTimelineId,
// Serialized via the `hex` helper module.
#[serde(with = "hex")]
tenant_id: ZTenantId,
},
}
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -223,27 +135,13 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
let _enter =
info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
Ok::<_, anyhow::Error>(match repo.get_timeline(timeline_id)?.local_timeline() {
None => TimelineInfo::Remote {
timeline_id,
tenant_id,
},
Some(timeline) => TimelineInfo::Local {
timeline_id,
tenant_id,
ancestor_timeline_id: timeline
.get_ancestor_timeline_id()
.map(HexZTimelineId::from),
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(),
timeline_state: repo.get_timeline_state(timeline_id),
},
})
let include_non_incremental_logical_size =
get_include_non_incremental_logical_size(&request);
TimelineInfo::from_ids(tenant_id, timeline_id, include_non_incremental_logical_size)
})
.await
.map_err(ApiError::from_err)??;
.map_err(ApiError::from_err)?
.map(TimelineInfoResponse::from)?;
Ok(json_response(StatusCode::OK, response_data)?)
}
@@ -260,7 +158,7 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
match repo.get_timeline(timeline_id)? {
RepositoryTimeline::Local(_) => {
RepositoryTimeline::Local { .. } => {
anyhow::bail!("Timeline with id {} is already local", timeline_id)
}
RepositoryTimeline::Remote {
@@ -320,13 +218,20 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
let request_data: TenantCreateRequest = json_request(&mut request).await?;
tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
let new_tenant_id = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = ?request_data.new_tenant_id).entered();
tenant_mgr::create_tenant_repository(
get_config(&request),
request_data.new_tenant_id.map(ZTenantId::from),
)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::CREATED, ())?)
Ok(match new_tenant_id {
Some(id) => json_response(StatusCode::CREATED, HexZTenantId::from(id))?,
None => json_response(StatusCode::CONFLICT, ())?,
})
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -356,23 +261,21 @@ pub fn make_router(
router
.data(Arc::new(State::new(conf, auth)))
.get("/v1/status", status_handler)
.get("/v1/timeline/:tenant_id", timeline_list_handler)
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
.get(
"/v1/timeline/:tenant_id/:timeline_id",
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_detail_handler,
)
.post(
"/v1/timeline/:tenant_id/:timeline_id/attach",
"/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
timeline_attach_handler,
)
.post(
"/v1/timeline/:tenant_id/:timeline_id/detach",
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
timeline_detach_handler,
)
.get("/v1/branch/:tenant_id", branch_list_handler)
.get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
.post("/v1/branch", branch_create_handler)
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)
.any(handler_404)
}

View File

@@ -47,7 +47,7 @@ use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{ZTenantId, ZTimelineId};
use zenith_metrics::{register_histogram_vec, HistogramVec};
use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec};
use zenith_utils::crashsafe_dir;
use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
use zenith_utils::seqwait::SeqWait;
@@ -82,7 +82,17 @@ lazy_static! {
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
"pageserver_storage_time",
"Time spent on storage operations",
&["operation"]
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric");
}
// Histogram of the time spent on reconstruction while serving getpage requests,
// labeled per tenant and timeline.
lazy_static! {
    static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
        "pageserver_getpage_reconstruct_time",
        // Distinct help text: the generic "storage operations" wording belongs to STORAGE_TIME.
        "Time spent on reconstruction in getpage requests",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric");
}
@@ -113,20 +123,21 @@ pub struct LayeredRepository {
impl Repository for LayeredRepository {
type Timeline = LayeredTimeline;
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline<LayeredTimeline>> {
let mut timelines = self.timelines.lock().unwrap();
Ok(
match self.get_or_init_timeline(timelineid, &mut timelines)? {
LayeredTimelineEntry::Local(local) => RepositoryTimeline::Local(local),
LayeredTimelineEntry::Remote {
id,
disk_consistent_lsn,
} => RepositoryTimeline::Remote {
id,
disk_consistent_lsn,
},
},
)
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline<Self::Timeline>> {
Ok(RepositoryTimeline::from(self.get_or_init_timeline(
timelineid,
&mut self.timelines.lock().unwrap(),
)?))
}
/// Returns every timeline entry currently registered in this repository,
/// converted to its public [`RepositoryTimeline`] form.
///
/// The timelines map stays locked while the entries are cloned and collected.
fn list_timelines(&self) -> Result<Vec<RepositoryTimeline<Self::Timeline>>> {
    let registered = self.timelines.lock().unwrap();
    let listed = registered
        .values()
        .cloned()
        .map(RepositoryTimeline::from)
        .collect();
    Ok(listed)
}
fn create_empty_timeline(
@@ -224,8 +235,12 @@ impl Repository for LayeredRepository {
horizon: u64,
checkpoint_before_gc: bool,
) -> Result<GcResult> {
let timeline_str = target_timelineid
.map(|x| x.to_string())
.unwrap_or_else(|| "-".to_string());
STORAGE_TIME
.with_label_values(&["gc"])
.with_label_values(&["gc", &self.tenantid.to_string(), &timeline_str])
.observe_closure_duration(|| {
self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc)
})
@@ -405,6 +420,24 @@ impl LayeredTimelineEntry {
}
}
impl From<LayeredTimelineEntry> for RepositoryTimeline<LayeredTimeline> {
fn from(layered_timeline: LayeredTimelineEntry) -> Self {
match layered_timeline {
LayeredTimelineEntry::Local(timeline) => RepositoryTimeline::Local {
id: timeline.timelineid,
timeline,
},
LayeredTimelineEntry::Remote {
id,
disk_consistent_lsn,
} => RepositoryTimeline::Remote {
id,
disk_consistent_lsn,
},
}
}
}
/// Private functions
impl LayeredRepository {
// Implementation of the public `get_timeline` function. This differs from the public
@@ -719,6 +752,12 @@ pub struct LayeredTimeline {
ancestor_timeline: Option<LayeredTimelineEntry>,
ancestor_lsn: Lsn,
// Metrics histograms
reconstruct_time_histo: Histogram,
checkpoint_time_histo: Histogram,
flush_checkpoint_time_histo: Histogram,
forced_checkpoint_time_histo: Histogram,
/// If `true`, will backup its files that appear after each checkpointing to the remote storage.
upload_relishes: AtomicBool,
@@ -806,7 +845,8 @@ impl Timeline for LayeredTimeline {
self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?;
self.reconstruct_value(key, lsn, reconstruct_state)
self.reconstruct_time_histo
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
}
/// Public entry point for checkpoint(). All the logic is in the private
@@ -814,14 +854,14 @@ impl Timeline for LayeredTimeline {
/// metrics collection.
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> {
match cconf {
CheckpointConfig::Flush => STORAGE_TIME
.with_label_values(&["flush checkpoint"])
CheckpointConfig::Flush => self
.flush_checkpoint_time_histo
.observe_closure_duration(|| self.checkpoint_internal(0, false)),
CheckpointConfig::Forced => STORAGE_TIME
.with_label_values(&["forced checkpoint"])
CheckpointConfig::Forced => self
.forced_checkpoint_time_histo
.observe_closure_duration(|| self.checkpoint_internal(0, true)),
CheckpointConfig::Distance(distance) => STORAGE_TIME
.with_label_values(&["checkpoint"])
CheckpointConfig::Distance(distance) => self
.checkpoint_time_histo
.observe_closure_duration(|| self.checkpoint_internal(distance, true)),
}
}
@@ -889,6 +929,31 @@ impl LayeredTimeline {
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
upload_relishes: bool,
) -> LayeredTimeline {
let reconstruct_time_histo = RECONSTRUCT_TIME
.get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
.unwrap();
let checkpoint_time_histo = STORAGE_TIME
.get_metric_with_label_values(&[
"checkpoint",
&tenantid.to_string(),
&timelineid.to_string(),
])
.unwrap();
let flush_checkpoint_time_histo = STORAGE_TIME
.get_metric_with_label_values(&[
"flush checkpoint",
&tenantid.to_string(),
&timelineid.to_string(),
])
.unwrap();
let forced_checkpoint_time_histo = STORAGE_TIME
.get_metric_with_label_values(&[
"forced checkpoint",
&tenantid.to_string(),
&timelineid.to_string(),
])
.unwrap();
LayeredTimeline {
conf,
timelineid,
@@ -906,6 +971,12 @@ impl LayeredTimeline {
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
reconstruct_time_histo,
checkpoint_time_histo,
flush_checkpoint_time_histo,
forced_checkpoint_time_histo,
upload_relishes: AtomicBool::new(upload_relishes),
write_lock: Mutex::new(()),
@@ -2162,8 +2233,10 @@ mod tests {
for _ in 0..50 {
let new_tline_id = ZTimelineId::generate();
repo.branch_timeline(tline_id, new_tline_id, lsn)?;
tline = if let RepositoryTimeline::Local(local) = repo.get_timeline(new_tline_id)? {
local
tline = if let RepositoryTimeline::Local { id: _, timeline } =
repo.get_timeline(new_tline_id)?
{
timeline
} else {
panic!("unexpected timeline state");
};

View File

@@ -1,5 +1,4 @@
pub mod basebackup;
pub mod branches;
pub mod config;
pub mod http;
pub mod import_datadir;
@@ -14,6 +13,7 @@ pub mod repository;
pub mod tenant_mgr;
pub mod tenant_threads;
pub mod thread_mgr;
pub mod timelines;
pub mod virtual_file;
pub mod walingest;
pub mod walreceiver;

View File

@@ -300,7 +300,7 @@ lazy_static! {
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
"pageserver_smgr_query_time",
"Time spent on smgr query handling",
&["smgr_query_type"],
&["smgr_query_type", "tenant_id", "timeline_id"],
TIME_BUCKETS.into()
)
.expect("failed to define a metric");
@@ -342,20 +342,22 @@ impl PageServerHandler {
};
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
let tenant_id = tenantid.to_string();
let timeline_id = timelineid.to_string();
let response = match zenith_fe_msg {
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_exists"])
.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_rel_exists_request(timeline.as_ref(), &req)
}),
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_size"])
.with_label_values(&["get_rel_size", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_nblocks_request(timeline.as_ref(), &req)
}),
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
.with_label_values(&["get_page_at_lsn"])
.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
}),

View File

@@ -62,11 +62,3 @@ Based on previous evaluation, even `rusoto-s3` could be a better choice over thi
So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage.
Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.
* bracnhes implementaion could be improved
Currently, there's a code to sync the branches along with the timeline files: on upload, every local branch files that are missing remotely are uploaded,
on the timeline download, missing remote branch files are downlaoded.
A branch is a per-tenant entity, yet a current implementaion requires synchronizing a timeline first to get the branch files locally.
Currently, there's no other way to know about the remote branch files, neither the file contents is verified and updated.

View File

@@ -14,13 +14,6 @@
//! Only GC removes local timeline files, the GC support is not added to sync currently,
//! yet downloading extra files is not critically bad at this stage, GC can remove those again.
//!
//! Along the timeline files, branch files are uploaded and downloaded every time a corresponding sync task is processed.
//! For simplicity, branch files are also treated as immutable: only missing files are uploaded or downloaded, no removals, amendments or file contents checks are done.
//! Also, the branches are copied as separate files, with no extra compressions done.
//! Despite branches information currently belonging to tenants, a tenants' timeline sync is required to upload or download the branch files, also, there's no way to know
//! the branch sync state outside of the sync loop.
//! This implementation is currently considered as temporary and is a subjec to change later.
//!
//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via listing the remote storage contents.
//! It's enough to poll the remote state once on startup only, due to agreement that the pageserver has
//! an exclusive write access to the remote storage: new files appear in the storage only after the same
@@ -66,7 +59,6 @@
//! NOTE: No real contents or checksum check happens right now and is a subject to improve later.
//!
//! After the whole timeline is downloaded, [`crate::tenant_mgr::set_timeline_states`] function is used to update pageserver memory stage for the timeline processed.
//! No extra branch registration is done.
//!
//! When pageserver signals shutdown, current sync task gets finished and the loop exits.
@@ -77,7 +69,7 @@ pub mod index;
mod upload;
use std::{
collections::{BTreeSet, HashMap, HashSet, VecDeque},
collections::{BTreeSet, HashMap, VecDeque},
num::{NonZeroU32, NonZeroUsize},
path::{Path, PathBuf},
sync::Arc,
@@ -87,7 +79,6 @@ use anyhow::{bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use lazy_static::lazy_static;
use tokio::{
fs,
runtime::Runtime,
sync::{
mpsc::{self, UnboundedReceiver},
@@ -101,8 +92,7 @@ use self::{
compression::ArchiveHeader,
download::{download_timeline, DownloadedTimeline},
index::{
ArchiveDescription, ArchiveId, RelativePath, RemoteTimeline, RemoteTimelineIndex,
TimelineIndexEntry,
ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry,
},
upload::upload_timeline_checkpoint,
};
@@ -843,28 +833,6 @@ async fn download_archive_header<
Ok(header)
}
async fn tenant_branch_files(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
) -> anyhow::Result<HashSet<RelativePath>> {
let branches_dir = conf.branches_path(&tenant_id);
if !branches_dir.exists() {
return Ok(HashSet::new());
}
let mut branch_entries = fs::read_dir(&branches_dir)
.await
.context("Failed to list tenant branches dir contents")?;
let mut branch_files = HashSet::new();
while let Some(branch_entry) = branch_entries.next_entry().await? {
if branch_entry.file_type().await?.is_file() {
branch_files.insert(RelativePath::new(&branches_dir, branch_entry.path())?);
}
}
Ok(branch_files)
}
#[cfg(test)]
mod test_utils {
use std::{
@@ -971,30 +939,9 @@ mod test_utils {
"Index contains unexpected sync ids"
);
let mut actual_branches = BTreeMap::new();
let mut expected_branches = BTreeMap::new();
let mut actual_timeline_entries = BTreeMap::new();
let mut expected_timeline_entries = BTreeMap::new();
for sync_id in actual_sync_ids {
actual_branches.insert(
sync_id.tenant_id,
index_read
.branch_files(sync_id.tenant_id)
.into_iter()
.flat_map(|branch_paths| branch_paths.iter())
.cloned()
.collect::<BTreeSet<_>>(),
);
expected_branches.insert(
sync_id.tenant_id,
expected_index_with_descriptions
.branch_files(sync_id.tenant_id)
.into_iter()
.flat_map(|branch_paths| branch_paths.iter())
.cloned()
.collect::<BTreeSet<_>>(),
);
actual_timeline_entries.insert(
sync_id,
index_read.timeline_entry(&sync_id).unwrap().clone(),
@@ -1009,11 +956,6 @@ mod test_utils {
}
drop(index_read);
assert_eq!(
actual_branches, expected_branches,
"Index contains unexpected branches"
);
for (sync_id, actual_timeline_entry) in actual_timeline_entries {
let expected_timeline_description = expected_timeline_entries
.remove(&sync_id)

View File

@@ -1,10 +1,8 @@
//! Timeline synchronization logic to put files from archives on remote storage into pageserver's local directory.
//! Currently, tenant branch files are also downloaded, but this does not appear final.
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
use anyhow::{ensure, Context};
use futures::{stream::FuturesUnordered, StreamExt};
use tokio::{fs, sync::RwLock};
use tracing::{debug, error, trace, warn};
use zenith_utils::{lsn::Lsn, zid::ZTenantId};
@@ -14,8 +12,8 @@ use crate::{
layered_repository::metadata::{metadata_path, TimelineMetadata},
remote_storage::{
storage_sync::{
compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files,
update_index_description, SyncKind, SyncTask,
compression, index::TimelineIndexEntry, sync_queue, update_index_description, SyncKind,
SyncTask,
},
RemoteStorage, ZTenantTimelineId,
},
@@ -42,8 +40,6 @@ pub(super) enum DownloadedTimeline {
/// Timeline files that already exist locally are skipped during the download, but the local metadata file is
/// updated in the end of every checkpoint archive extraction.
///
/// Before any archives are considered, the branch files are checked locally and remotely, all remote-only files are downloaded.
///
/// On an error, bumps the retries count and reschedules the download, with updated archive skip list
/// (for any new successful archive downloads and extractions).
pub(super) async fn download_timeline<
@@ -113,22 +109,6 @@ pub(super) async fn download_timeline<
}
};
if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.tenant_id).await
{
error!(
"Failed to download missing branches for sync id {}: {:?}",
sync_id, e
);
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Download(download),
));
return DownloadedTimeline::FailedAndRescheduled {
disk_consistent_lsn,
};
}
debug!("Downloading timeline archives");
let archives_to_download = remote_timeline
.checkpoints()
@@ -250,82 +230,6 @@ async fn read_local_metadata(
.context("Failed to read local metadata files bytes")?)
}
async fn download_missing_branches<
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
conf: &'static PageServerConf,
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
tenant_id: ZTenantId,
) -> anyhow::Result<()> {
let local_branches = tenant_branch_files(conf, tenant_id)
.await
.context("Failed to list local branch files for the tenant")?;
let local_branches_dir = conf.branches_path(&tenant_id);
if !local_branches_dir.exists() {
fs::create_dir_all(&local_branches_dir)
.await
.with_context(|| {
format!(
"Failed to create local branches directory at path '{}'",
local_branches_dir.display()
)
})?;
}
if let Some(remote_branches) = index.read().await.branch_files(tenant_id) {
let mut remote_only_branches_downloads = remote_branches
.difference(&local_branches)
.map(|remote_only_branch| async move {
let branches_dir = conf.branches_path(&tenant_id);
let remote_branch_path = remote_only_branch.as_path(&branches_dir);
let storage_path =
storage.storage_path(&remote_branch_path).with_context(|| {
format!(
"Failed to derive a storage path for branch with local path '{}'",
remote_branch_path.display()
)
})?;
let mut target_file = fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(&remote_branch_path)
.await
.with_context(|| {
format!(
"Failed to create local branch file at '{}'",
remote_branch_path.display()
)
})?;
storage
.download(&storage_path, &mut target_file)
.await
.with_context(|| {
format!(
"Failed to download branch file from the remote path {:?}",
storage_path
)
})?;
Ok::<_, anyhow::Error>(())
})
.collect::<FuturesUnordered<_>>();
let mut branch_downloads_failed = false;
while let Some(download_result) = remote_only_branches_downloads.next().await {
if let Err(e) = download_result {
branch_downloads_failed = true;
error!("Failed to download a branch file: {:?}", e);
}
}
ensure!(
!branch_downloads_failed,
"Failed to download all branch files"
);
}
Ok(())
}
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;

View File

@@ -5,7 +5,7 @@
//! This way in the future, the index could be restored fast from its serialized stored form.
use std::{
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
collections::{BTreeMap, BTreeSet, HashMap},
path::{Path, PathBuf},
};
@@ -49,10 +49,9 @@ impl RelativePath {
}
/// An index to track tenant files that exist on the remote storage.
/// Currently, timeline archives and branch files are tracked.
/// Currently, timeline archive files are tracked only.
#[derive(Debug, Clone)]
pub struct RemoteTimelineIndex {
branch_files: HashMap<ZTenantId, HashSet<RelativePath>>,
timeline_files: HashMap<ZTenantTimelineId, TimelineIndexEntry>,
}
@@ -65,7 +64,6 @@ impl RemoteTimelineIndex {
paths: impl Iterator<Item = P>,
) -> Self {
let mut index = Self {
branch_files: HashMap::new(),
timeline_files: HashMap::new(),
};
for path in paths {
@@ -98,17 +96,6 @@ impl RemoteTimelineIndex {
pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
self.timeline_files.keys().copied()
}
pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) {
self.branch_files
.entry(tenant_id)
.or_insert_with(HashSet::new)
.insert(path);
}
pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet<RelativePath>> {
self.branch_files.get(&tenant_id)
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -306,20 +293,9 @@ fn try_parse_index_entry(
.parse::<ZTenantId>()
.with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
let branches_path = conf.branches_path(&tenant_id);
let timelines_path = conf.timelines_path(&tenant_id);
match (
RelativePath::new(&branches_path, &path),
path.strip_prefix(&timelines_path),
) {
(Ok(_), Ok(_)) => bail!(
"Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes",
path.display(),
branches_path.display(),
timelines_path.display()
),
(Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry),
(Err(_), Ok(timelines_subpath)) => {
match path.strip_prefix(&timelines_path) {
Ok(timelines_subpath) => {
let mut segments = timelines_subpath.iter();
let timeline_id = segments
.next()
@@ -375,11 +351,10 @@ fn try_parse_index_entry(
}
}
}
(Err(branches_error), Err(timelines_strip_error)) => {
Err(timelines_strip_error) => {
bail!(
"Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'",
"Path '{}' is not an archive entry '{}'",
path.display(),
branches_error,
timelines_strip_error,
)
}

View File

@@ -1,13 +1,10 @@
//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
//! Currently, tenant branch files are also uploaded, but this does not appear final.
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
use anyhow::{ensure, Context};
use futures::{stream::FuturesUnordered, StreamExt};
use tokio::{fs, sync::RwLock};
use anyhow::ensure;
use tokio::sync::RwLock;
use tracing::{debug, error, warn};
use zenith_utils::zid::ZTenantId;
use crate::{
config::PageServerConf,
@@ -15,7 +12,7 @@ use crate::{
storage_sync::{
compression,
index::{RemoteTimeline, TimelineIndexEntry},
sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask,
sync_queue, update_index_description, SyncKind, SyncTask,
},
RemoteStorage, ZTenantTimelineId,
},
@@ -26,8 +23,6 @@ use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoin
/// Attempts to compress and upload given checkpoint files.
/// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
///
/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely.
///
/// On an error, bumps the retries count and reschedules the entire task.
/// On success, populates index data with new downloads.
pub(super) async fn upload_timeline_checkpoint<
@@ -41,19 +36,6 @@ pub(super) async fn upload_timeline_checkpoint<
retries: u32,
) -> Option<bool> {
debug!("Uploading checkpoint for sync id {}", sync_id);
if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.tenant_id).await
{
error!(
"Failed to upload missing branches for sync id {}: {:?}",
sync_id, e
);
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Upload(new_checkpoint),
));
return Some(false);
}
let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();
let index = &remote_assets.1;
@@ -201,76 +183,6 @@ async fn try_upload_checkpoint<
.map(|(header, header_size, _)| (header, header_size))
}
async fn upload_missing_branches<
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
config: &'static PageServerConf,
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
tenant_id: ZTenantId,
) -> anyhow::Result<()> {
let local_branches = tenant_branch_files(config, tenant_id)
.await
.context("Failed to list local branch files for the tenant")?;
let index_read = index.read().await;
let remote_branches = index_read
.branch_files(tenant_id)
.cloned()
.unwrap_or_default();
drop(index_read);
let mut branch_uploads = local_branches
.difference(&remote_branches)
.map(|local_only_branch| async move {
let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id));
let storage_path = storage.storage_path(&local_branch_path).with_context(|| {
format!(
"Failed to derive a storage path for branch with local path '{}'",
local_branch_path.display()
)
})?;
let local_branch_file = fs::OpenOptions::new()
.read(true)
.open(&local_branch_path)
.await
.with_context(|| {
format!(
"Failed to open local branch file {} for reading",
local_branch_path.display()
)
})?;
storage
.upload(local_branch_file, &storage_path)
.await
.with_context(|| {
format!(
"Failed to upload branch file to the remote path {:?}",
storage_path
)
})?;
Ok::<_, anyhow::Error>(local_only_branch)
})
.collect::<FuturesUnordered<_>>();
let mut branch_uploads_failed = false;
while let Some(upload_result) = branch_uploads.next().await {
match upload_result {
Ok(local_only_branch) => index
.write()
.await
.add_branch_file(tenant_id, local_only_branch.clone()),
Err(e) => {
error!("Failed to upload branch file: {:?}", e);
branch_uploads_failed = true;
}
}
}
ensure!(!branch_uploads_failed, "Failed to upload all branch files");
Ok(())
}
#[cfg(test)]
mod tests {
use tempfile::tempdir;

View File

@@ -179,6 +179,10 @@ pub trait Repository: Send + Sync {
/// Get Timeline handle for given zenith timeline ID.
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline<Self::Timeline>>;
/// Lists timelines the repository contains.
/// It is up to the repository's implementation to omit certain timelines that are not considered ready for use.
fn list_timelines(&self) -> Result<Vec<RepositoryTimeline<Self::Timeline>>>;
/// Create a new, empty timeline. The caller is responsible for loading data into it
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
fn create_empty_timeline(
@@ -215,7 +219,7 @@ pub trait Repository: Send + Sync {
pub enum RepositoryTimeline<T> {
/// Timeline, with its files present locally in pageserver's working directory.
/// Loaded into pageserver's memory and ready to be used.
Local(Arc<T>),
Local { id: ZTimelineId, timeline: Arc<T> },
/// Timeline, found on the pageserver's remote storage, but not yet downloaded locally.
Remote {
id: ZTimelineId,
@@ -226,17 +230,24 @@ pub enum RepositoryTimeline<T> {
impl<T> RepositoryTimeline<T> {
pub fn local_timeline(&self) -> Option<Arc<T>> {
if let Self::Local(local_timeline) = self {
Some(Arc::clone(local_timeline))
if let Self::Local { timeline, .. } = self {
Some(Arc::clone(timeline))
} else {
None
}
}
pub fn id(&self) -> ZTimelineId {
match self {
Self::Local { id, .. } => *id,
Self::Remote { id, .. } => *id,
}
}
}
/// A state of the timeline synchronization with the remote storage.
/// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TimelineSyncState {
/// No further downloads from the remote storage are needed.
/// The timeline state is up-to-date or ahead of the remote storage one,
@@ -470,7 +481,6 @@ pub mod repo_harness {
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
fs::create_dir_all(conf.branches_path(&tenant_id))?;
Ok(Self {
conf,

View File

@@ -1,21 +1,21 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use crate::branches;
use crate::config::PageServerConf;
use crate::layered_repository::LayeredRepository;
use crate::repository::Repository;
use crate::repository::TimelineSyncState;
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
use crate::timelines;
use crate::walredo::PostgresRedoManager;
use crate::CheckpointConfig;
use crate::{DatadirTimelineImpl, RepositoryImpl};
use anyhow::{bail, Context, Result};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use log::*;
use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap};
use std::collections::HashMap;
use std::fmt;
use std::sync::{Arc, Mutex, MutexGuard};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
@@ -183,25 +183,28 @@ pub fn shutdown_all_tenants() {
}
}
pub fn create_repository_for_tenant(
pub fn create_tenant_repository(
conf: &'static PageServerConf,
tenantid: ZTenantId,
) -> Result<()> {
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
match access_tenants().entry(tenantid) {
hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid),
hash_map::Entry::Vacant(v) => {
v.insert(Tenant {
state: TenantState::Idle,
repo,
timelines: HashMap::new(),
});
new_tenant_id: Option<ZTenantId>,
) -> Result<Option<ZTenantId>> {
let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate);
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id));
match timelines::create_repo(conf, new_tenant_id, wal_redo_manager)? {
Some(repo) => {
access_tenants()
.entry(new_tenant_id)
.or_insert_with(|| Tenant {
state: TenantState::Idle,
repo,
timelines: HashMap::new(),
});
Ok(Some(new_tenant_id))
}
None => {
debug!("repository already exists for tenant {}", new_tenant_id);
Ok(None)
}
}
Ok(())
}
pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {

381
pageserver/src/timelines.rs Normal file
View File

@@ -0,0 +1,381 @@
//!
//! Timeline management code
//
use anyhow::{anyhow, bail, Context, Result};
use postgres_ffi::ControlFileData;
use std::{
fs,
path::Path,
process::{Command, Stdio},
sync::Arc,
};
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{crashsafe_dir, logging};
use crate::DatadirTimeline;
use crate::RepositoryImpl;
use crate::{config::PageServerConf, repository::Repository};
use crate::{import_datadir, LOG_FILE_NAME};
use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager};
use crate::{repository::RepositoryTimeline, tenant_mgr};
use crate::{repository::Timeline, CheckpointConfig};
/// Summary of a single timeline, as assembled by `TimelineInfo::from_ids`.
///
/// The variant mirrors the `RepositoryTimeline` variant the info was built from.
#[derive(Clone)]
pub enum TimelineInfo {
/// Built from a `RepositoryTimeline::Local` entry.
Local {
timeline_id: ZTimelineId,
tenant_id: ZTenantId,
/// Value of the timeline's `get_last_record_lsn()` when the info was built.
last_record_lsn: Lsn,
/// Value of the timeline's `get_prev_record_lsn()` when the info was built.
prev_record_lsn: Lsn,
/// Ancestor timeline id, as reported by `get_ancestor_timeline_id()`.
ancestor_timeline_id: Option<ZTimelineId>,
/// `from_ids` fills this only when `ancestor_timeline_id` is `Some`.
ancestor_lsn: Option<Lsn>,
/// Value of the timeline's `get_disk_consistent_lsn()` when the info was built.
disk_consistent_lsn: Lsn,
/// Value of `get_current_logical_size()` when the info was built.
current_logical_size: usize,
/// `from_ids` fills this only when the caller asked for it and the
/// computation succeeded; otherwise `None`.
current_logical_size_non_incremental: Option<usize>,
},
/// Built from a `RepositoryTimeline::Remote` entry; only the ids and the
/// remote `disk_consistent_lsn` are available.
Remote {
timeline_id: ZTimelineId,
tenant_id: ZTenantId,
disk_consistent_lsn: Lsn,
},
}
impl TimelineInfo {
    /// Looks up the timeline `timeline_id` of tenant `tenant_id` and builds its info.
    ///
    /// A remote timeline yields [`TimelineInfo::Remote`] straight away. For a local
    /// one, the LSNs and logical size are read from the loaded timeline. The
    /// non-incremental logical size is computed only when
    /// `include_non_incremental_logical_size` is set.
    ///
    /// # Errors
    ///
    /// Fails if the tenant's repository, the timeline, or the tenant manager's
    /// timeline handle cannot be obtained.
    pub fn from_ids(
        tenant_id: ZTenantId,
        timeline_id: ZTimelineId,
        include_non_incremental_logical_size: bool,
    ) -> Result<Self> {
        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;

        // Remote timelines need nothing beyond what the repository entry holds.
        let (id, timeline) = match repo.get_timeline(timeline_id)? {
            RepositoryTimeline::Remote {
                id,
                disk_consistent_lsn,
            } => {
                return Ok(Self::Remote {
                    timeline_id: id,
                    tenant_id,
                    disk_consistent_lsn,
                });
            }
            RepositoryTimeline::Local { id, timeline } => (id, timeline),
        };

        // The ancestor LSN is only queried when there is an ancestor.
        let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
        let ancestor_lsn = ancestor_timeline_id.map(|_| timeline.get_ancestor_lsn());

        // Logical sizes come from the tenant manager's timeline handle.
        let datadir_timeline = tenant_mgr::get_timeline_for_tenant(tenant_id, timeline_id)?;
        let current_logical_size = datadir_timeline.get_current_logical_size();
        let current_logical_size_non_incremental = get_current_logical_size_non_incremental(
            include_non_incremental_logical_size,
            datadir_timeline.as_ref(),
        );

        Ok(Self::Local {
            timeline_id: id,
            tenant_id,
            last_record_lsn: timeline.get_last_record_lsn(),
            prev_record_lsn: timeline.get_prev_record_lsn(),
            ancestor_timeline_id,
            ancestor_lsn,
            disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
            current_logical_size,
            current_logical_size_non_incremental,
        })
    }

    /// Id of the timeline this info describes, for either variant.
    pub fn timeline_id(&self) -> ZTimelineId {
        match self {
            Self::Local { timeline_id, .. } | Self::Remote { timeline_id, .. } => *timeline_id,
        }
    }

    /// Id of the tenant owning the timeline, for either variant.
    pub fn tenant_id(&self) -> ZTenantId {
        match self {
            Self::Local { tenant_id, .. } | Self::Remote { tenant_id, .. } => *tenant_id,
        }
    }
}
fn get_current_logical_size_non_incremental<R: Repository>(
include_non_incremental_logical_size: bool,
timeline: &DatadirTimeline<R>,
) -> Option<usize> {
if !include_non_incremental_logical_size {
return None;
}
match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) {
Ok(size) => Some(size),
Err(e) => {
error!("Failed to get non-incremental logical size: {:?}", e);
None
}
}
}
/// A timeline id paired with an LSN on that timeline.
// NOTE(review): presumably identifies a position to branch from — confirm against callers.
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
pub timeline_id: ZTimelineId,
pub lsn: Lsn,
}
/// Initializes the pageserver's on-disk state.
///
/// Sets up logging and creates the tenants directory. When `create_tenant` is
/// given, it also creates that tenant's repository and bootstraps an initial
/// timeline in it, using `initial_timeline_id` or a freshly generated id.
/// Without a tenant to create, `initial_timeline_id` is ignored and a notice is
/// printed.
///
/// # Errors
///
/// Fails if logging or directory creation fails, if a repository for the tenant
/// already exists, or if bootstrapping the initial timeline fails.
pub fn init_pageserver(
    conf: &'static PageServerConf,
    create_tenant: Option<ZTenantId>,
    initial_timeline_id: Option<ZTimelineId>,
) -> anyhow::Result<()> {
    // Initialize logger
    // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
    let _log_file = logging::init(LOG_FILE_NAME, true)?;

    // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
    // process during repository initialization.
    //
    // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
    // initdb in the background, and it kept running even after the "zenith init" had exited.
    // In tests, we started the page server immediately after that, so that initdb was still
    // running in the background, and we failed to run initdb again in the same directory. This
    // has been solved for the rapid init+start case now, but the general race condition remains
    // if you restart the server quickly. The WAL redo manager doesn't use a separate thread
    // anymore, but I think that could still happen.
    let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {});

    crashsafe_dir::create_dir_all(conf.tenants_path())?;

    match create_tenant {
        Some(tenant_id) => {
            println!("initializing tenantid {}", tenant_id);
            // `create_repo` yields `None` when the tenant directory already exists,
            // which must not happen for a pageserver that is being initialized.
            let maybe_repo = create_repo(conf, tenant_id, dummy_redo_mgr)
                .context("failed to create repo")?;
            let repo = maybe_repo.ok_or_else(|| anyhow!("For newely created pageserver, found already existing repository for tenant {}", tenant_id))?;

            let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
            bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())
                .context("failed to create initial timeline")?;
            println!("initial timeline {} created", new_timeline_id);
        }
        None if initial_timeline_id.is_some() => {
            println!("Ignoring initial timeline parameter, due to no tenant id to create given");
        }
        None => {}
    }

    println!("pageserver init succeeded");
    Ok(())
}
/// Creates the on-disk directory layout for a new tenant and returns its repository.
///
/// Returns `Ok(None)` if the tenant's directory already exists; nothing is
/// created or modified in that case.
///
/// # Errors
///
/// Fails if the tenant directory or its timelines directory cannot be created.
pub fn create_repo(
    conf: &'static PageServerConf,
    tenant_id: ZTenantId,
    wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Option<Arc<RepositoryImpl>>> {
    let tenant_dir = conf.tenant_path(&tenant_id);
    if tenant_dir.exists() {
        debug!("repo for {} already exists", tenant_id);
        return Ok(None);
    }

    // top-level dir may exist if we are creating it through CLI
    crashsafe_dir::create_dir_all(&tenant_dir)
        .with_context(|| format!("could not create directory {}", tenant_dir.display()))?;
    let timelines_dir = conf.timelines_path(&tenant_id);
    crashsafe_dir::create_dir(timelines_dir)?;
    info!("created directory structure in {}", tenant_dir.display());

    // The last constructor argument reflects whether remote storage is configured.
    let remote_storage_configured = conf.remote_storage_config.is_some();
    let repo = LayeredRepository::new(
        conf,
        wal_redo_manager,
        tenant_id,
        remote_storage_configured,
    );
    Ok(Some(Arc::new(repo)))
}
// Reads `<path>/global/pg_control`, decodes it and returns the checkpoint LSN
// recorded in the control file.
fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
    let raw_controlfile = fs::read(path.join("global").join("pg_control"))?;
    let decoded = ControlFileData::decode(&raw_controlfile)?;
    Ok(Lsn(decoded.checkPoint))
}
/// Runs `initdb` to create a fresh cluster in `initdbpath`.
///
/// The cluster is created temporarily inside the repository to get bootstrap
/// data for timeline initialization.
///
/// `initdb` is taken from `conf.pg_bin_dir()` and started with a cleared
/// environment, except for `LD_LIBRARY_PATH` / `DYLD_LIBRARY_PATH`, which
/// point at `conf.pg_lib_dir()`. Its stdout is discarded; stderr is captured
/// and included in the error if the process exits unsuccessfully.
///
/// # Errors
///
/// Fails if `initdb` cannot be executed or exits with a non-success status.
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
    info!("running initdb in {}... ", initdbpath.display());

    let initdb_path = conf.pg_bin_dir().join("initdb");
    let pg_lib_dir = conf.pg_lib_dir();
    let initdb_output = Command::new(initdb_path)
        // Paths are handed over as OS strings: no lossy or panicking UTF-8
        // conversion is needed to pass them to a child process.
        .arg("-D")
        .arg(initdbpath)
        .args(&["-U", &conf.superuser])
        .args(&["-E", "utf8"])
        .arg("--no-instructions")
        // This is only used for a temporary installation that is deleted shortly after,
        // so no need to fsync it
        .arg("--no-sync")
        .env_clear()
        .env("LD_LIBRARY_PATH", &pg_lib_dir)
        .env("DYLD_LIBRARY_PATH", &pg_lib_dir)
        .stdout(Stdio::null())
        .output()
        .context("failed to execute initdb")?;

    if !initdb_output.status.success() {
        bail!(
            "initdb failed: '{}'",
            String::from_utf8_lossy(&initdb_output.stderr)
        );
    }

    Ok(())
}
/// Bootstraps timeline `tli` of tenant `tenantid` from a freshly initialized
/// cluster.
///
/// - runs initdb in a `tmp` directory under the tenant's directory to get
///   bootstrap data,
/// - imports that data directory into a new empty timeline created at the
///   (aligned) checkpoint LSN read from the control file, and forces a
///   checkpoint,
/// - removes the temp dir, whether or not the import succeeded.
///
/// # Errors
///
/// Fails if initdb, the import, the checkpoint, or the removal of the
/// temporary directory fails. When both the import and the cleanup fail, the
/// import error is the one reported.
fn bootstrap_timeline<R: Repository>(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    tli: ZTimelineId,
    repo: &R,
) -> Result<()> {
    let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();

    let initdb_path = conf.tenant_path(&tenantid).join("tmp");

    // Init temporarily repo to get bootstrap data.
    // If initdb itself fails we return right away and do NOT remove the
    // directory: it may belong to another bootstrap that is still running.
    run_initdb(conf, &initdb_path)?;
    let pgdata_path = initdb_path;

    // From here on the directory is ours. Run the fallible import steps in a
    // closure so that the cleanup below happens on the error path too;
    // otherwise a leftover directory would make the next initdb run fail.
    let import_result = (|| -> Result<()> {
        let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();

        // Import the contents of the data directory at the initial checkpoint
        // LSN, and any WAL after that.
        // Initdb lsn will be equal to last_record_lsn which will be set after import.
        // Because we know it upfront avoid having an option or dummy zero value by
        // passing it to create_empty_timeline.
        let timeline = repo.create_empty_timeline(tli, lsn)?;
        let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline);
        import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
        page_tline.tline.checkpoint(CheckpointConfig::Forced)?;

        println!(
            "created initial timeline {} timeline.lsn {}",
            tli,
            page_tline.tline.get_last_record_lsn()
        );
        Ok(())
    })();

    // Remove temp dir. We don't need it anymore
    let cleanup_result = fs::remove_dir_all(&pgdata_path).with_context(|| {
        format!(
            "failed to remove temporary initdb directory {}",
            pgdata_path.display()
        )
    });

    // The import failure is the root cause, so it takes precedence over a
    // cleanup failure.
    import_result?;
    cleanup_result
}
/// Collects a [`TimelineInfo`] for every timeline of `tenant_id` that the
/// repository lists as local; remote timelines are skipped.
///
/// `include_non_incremental_logical_size` is forwarded unchanged to
/// `TimelineInfo::from_ids` for each timeline.
///
/// # Errors
///
/// Fails if the tenant's repository cannot be obtained, if its timelines
/// cannot be listed, or if building the info for any local timeline fails.
pub(crate) fn get_timelines(
    tenant_id: ZTenantId,
    include_non_incremental_logical_size: bool,
) -> Result<Vec<TimelineInfo>> {
    let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
        .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
    let all_timelines = repo
        .list_timelines()
        .with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))?;

    let mut local_infos = Vec::new();
    for entry in all_timelines {
        // Only timelines present locally are reported.
        if let RepositoryTimeline::Local { id, .. } = entry {
            let info =
                TimelineInfo::from_ids(tenant_id, id, include_non_incremental_logical_size)?;
            local_infos.push(info);
        }
    }

    Ok(local_infos)
}
/// Creates a new timeline for `tenant_id`.
///
/// With an ancestor, the new timeline is branched off it at
/// `ancestor_start_lsn` (aligned), or at the ancestor's last record LSN when
/// no start LSN (or `Lsn(0)`) is given. Without an ancestor, the timeline is
/// bootstrapped via `bootstrap_timeline`. A timeline id is generated when
/// `new_timeline_id` is `None`.
///
/// Returns `Ok(None)` if the timeline's directory exists and the repository
/// reports the timeline as local; otherwise `Ok(Some(info))` for the newly
/// created timeline.
///
/// # Errors
///
/// Fails if the timeline already exists in remote storage, if the ancestor
/// cannot be found or is not present locally, if the branch point lies
/// before the ancestor's own ancestor LSN, or if any underlying repository
/// operation fails.
pub(crate) fn create_timeline(
    conf: &'static PageServerConf,
    tenant_id: ZTenantId,
    new_timeline_id: Option<ZTimelineId>,
    ancestor_timeline_id: Option<ZTimelineId>,
    ancestor_start_lsn: Option<Lsn>,
) -> Result<Option<TimelineInfo>> {
    let timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
    let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;

    let timeline_dir_exists = conf.timeline_path(&timeline_id, &tenant_id).exists();
    if timeline_dir_exists {
        match repo.get_timeline(timeline_id)? {
            RepositoryTimeline::Remote { id, .. } => bail!(
                "timeline {} already exists in pageserver's remote storage",
                id
            ),
            RepositoryTimeline::Local { id, .. } => {
                debug!("timeline {} already exists", id);
                return Ok(None);
            }
        }
    }

    if let Some(ancestor_id) = ancestor_timeline_id {
        let ancestor = repo
            .get_timeline(ancestor_id)
            .with_context(|| format!("Cannot get ancestor timeline {}", ancestor_id))?
            .local_timeline()
            .with_context(|| {
                format!(
                    "Cannot branch off the timeline {} that's not present locally",
                    ancestor_id
                )
            })?;

        // A missing start LSN and an explicit Lsn(0) both mean "branch at the
        // end of the ancestor's WAL".
        let requested_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
        let unaligned_lsn = if requested_lsn != Lsn(0) {
            // Wait for the WAL to arrive and be processed on the parent branch up
            // to the requested branch point. The repository code itself doesn't
            // require it, but if we start to receive WAL on the new timeline,
            // decoding the new WAL might need to look up previous pages, relation
            // sizes etc. and that would get confused if the previous page versions
            // are not in the repository yet.
            ancestor.wait_lsn(requested_lsn)?;
            requested_lsn
        } else {
            // Find end of WAL on the old timeline
            let end_of_wal = ancestor.get_last_record_lsn();
            info!("branching at end of WAL: {}", end_of_wal);
            end_of_wal
        };
        let branch_lsn = unaligned_lsn.align();

        let ancestor_ancestor_lsn = ancestor.get_ancestor_lsn();
        if ancestor_ancestor_lsn > branch_lsn {
            // can we safely just branch from the ancestor instead?
            anyhow::bail!(
                "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
                branch_lsn,
                ancestor_id,
                ancestor_ancestor_lsn,
            );
        }

        repo.branch_timeline(ancestor_id, timeline_id, branch_lsn)?;
    } else {
        bootstrap_timeline(conf, tenant_id, timeline_id, repo.as_ref())?;
    }

    let created_info = TimelineInfo::from_ids(tenant_id, timeline_id, false)?;
    Ok(Some(created_info))
}

View File

@@ -1,8 +1,8 @@
from contextlib import closing
from typing import Iterator
from uuid import UUID, uuid4
import psycopg2
from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithPageserverApiException
from requests.exceptions import HTTPError
import pytest
@@ -25,25 +25,31 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
ps.safe_psql("set FOO", password=tenant_token)
ps.safe_psql("set FOO", password=management_token)
new_timeline_id = env.zenith_cli.create_branch('test_pageserver_auth',
tenant_id=env.initial_tenant)
# tenant can create branches
tenant_http_client.branch_create(env.initial_tenant, 'new1', 'main')
tenant_http_client.timeline_create(tenant_id=env.initial_tenant,
ancestor_timeline_id=new_timeline_id)
# console can create branches for tenant
management_http_client.branch_create(env.initial_tenant, 'new2', 'main')
management_http_client.timeline_create(tenant_id=env.initial_tenant,
ancestor_timeline_id=new_timeline_id)
# fail to create branch using token with different tenant_id
with pytest.raises(ZenithPageserverApiException,
match='Forbidden: Tenant id mismatch. Permission denied'):
invalid_tenant_http_client.branch_create(env.initial_tenant, "new3", "main")
invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant,
ancestor_timeline_id=new_timeline_id)
# create tenant using management token
management_http_client.tenant_create(uuid4())
management_http_client.tenant_create()
# fail to create tenant using tenant token
with pytest.raises(
ZenithPageserverApiException,
match='Forbidden: Attempt to access management api with tenant scope. Permission denied'
):
tenant_http_client.tenant_create(uuid4())
tenant_http_client.tenant_create()
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
@@ -53,9 +59,8 @@ def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_w
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
env.zenith_cli.create_branch(branch, "main")
branch = f'test_compute_auth_to_pageserver{with_wal_acceptors}'
env.zenith_cli.create_branch(branch)
pg = env.postgres.create_start(branch)
with closing(pg.connect()) as conn:

View File

@@ -95,7 +95,7 @@ def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
# Create a branch for us
env.zenith_cli.create_branch("test_backpressure", "main")
env.zenith_cli.create_branch('test_backpressure')
pg = env.postgres.create_start('test_backpressure',
config_lines=['max_replication_write_lag=30MB'])

View File

@@ -22,8 +22,7 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
env = zenith_env_builder.init_start()
# Branch at the point where only 100 rows were inserted
env.zenith_cli.create_branch("test_branch_behind", "main")
env.zenith_cli.create_branch('test_branch_behind')
pgmain = env.postgres.create_start('test_branch_behind')
log.info("postgres is running on 'test_branch_behind' branch")
@@ -60,7 +59,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
log.info(f'LSN after 200100 rows: {lsn_b}')
# Branch at the point where only 100 rows were inserted
env.zenith_cli.create_branch("test_branch_behind_hundred", "test_branch_behind@" + lsn_a)
env.zenith_cli.create_branch('test_branch_behind_hundred',
'test_branch_behind',
ancestor_start_lsn=lsn_a)
# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute('''
@@ -75,10 +76,12 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
log.info(f'LSN after 400100 rows: {lsn_c}')
# Branch at the point where only 200100 rows were inserted
env.zenith_cli.create_branch("test_branch_behind_more", "test_branch_behind@" + lsn_b)
env.zenith_cli.create_branch('test_branch_behind_more',
'test_branch_behind',
ancestor_start_lsn=lsn_b)
pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
pg_more = env.postgres.create_start("test_branch_behind_more")
pg_hundred = env.postgres.create_start('test_branch_behind_hundred')
pg_more = env.postgres.create_start('test_branch_behind_more')
# On the 'hundred' branch, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -99,19 +102,23 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
# Check bad lsn's for branching
# branch at segment boundary
env.zenith_cli.create_branch("test_branch_segment_boundary", "test_branch_behind@0/3000000")
pg = env.postgres.create_start("test_branch_segment_boundary")
env.zenith_cli.create_branch('test_branch_segment_boundary',
'test_branch_behind',
ancestor_start_lsn="0/3000000")
pg = env.postgres.create_start('test_branch_segment_boundary')
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# branch at pre-initdb lsn
with pytest.raises(Exception, match="invalid branch start lsn"):
env.zenith_cli.create_branch("test_branch_preinitdb", "main@0/42")
env.zenith_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42")
# branch at pre-ancestor lsn
with pytest.raises(Exception, match="less than timeline ancestor lsn"):
env.zenith_cli.create_branch("test_branch_preinitdb", "test_branch_behind@0/42")
env.zenith_cli.create_branch('test_branch_preinitdb',
'test_branch_behind',
ancestor_start_lsn="0/42")
# check that we cannot create branch based on garbage collected data
with closing(env.pageserver.connect()) as psconn:
@@ -123,7 +130,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
with pytest.raises(Exception, match="invalid branch start lsn"):
# this gced_lsn is pretty random, so if gc is disabled this wouldn't fail
env.zenith_cli.create_branch("test_branch_create_fail", f"test_branch_behind@{gced_lsn}")
env.zenith_cli.create_branch('test_branch_create_fail',
'test_branch_behind',
ancestor_start_lsn=gced_lsn)
# check that after gc everything is still there
hundred_cur.execute('SELECT count(*) FROM foo')

View File

@@ -12,7 +12,7 @@ from fixtures.log_helper import log
#
def test_clog_truncate(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_clog_truncate", "empty")
env.zenith_cli.create_branch('test_clog_truncate', 'empty')
# set aggressive autovacuum to make sure that truncation will happen
config = [
@@ -62,9 +62,9 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv):
# create new branch after clog truncation and start a compute node on it
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
env.zenith_cli.create_branch("test_clog_truncate_new",
"test_clog_truncate@" + lsn_after_truncation)
env.zenith_cli.create_branch('test_clog_truncate_new',
'test_clog_truncate',
ancestor_start_lsn=lsn_after_truncation)
pg2 = env.postgres.create_start('test_clog_truncate_new')
log.info('postgres is running on test_clog_truncate_new branch')

View File

@@ -11,7 +11,7 @@ from fixtures.log_helper import log
#
def test_createdb(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_createdb", "empty")
env.zenith_cli.create_branch('test_createdb', 'empty')
pg = env.postgres.create_start('test_createdb')
log.info("postgres is running on 'test_createdb' branch")
@@ -27,8 +27,7 @@ def test_createdb(zenith_simple_env: ZenithEnv):
lsn = cur.fetchone()[0]
# Create a branch
env.zenith_cli.create_branch("test_createdb2", "test_createdb@" + lsn)
env.zenith_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn)
pg2 = env.postgres.create_start('test_createdb2')
# Test that you can connect to the new database on both branches
@@ -41,8 +40,7 @@ def test_createdb(zenith_simple_env: ZenithEnv):
#
def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
env.zenith_cli.create_branch("test_dropdb", "empty")
env.zenith_cli.create_branch('test_dropdb', 'empty')
pg = env.postgres.create_start('test_dropdb')
log.info("postgres is running on 'test_dropdb' branch")
@@ -66,10 +64,14 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
lsn_after_drop = cur.fetchone()[0]
# Create two branches before and after database drop.
env.zenith_cli.create_branch("test_before_dropdb", "test_dropdb@" + lsn_before_drop)
env.zenith_cli.create_branch('test_before_dropdb',
'test_dropdb',
ancestor_start_lsn=lsn_before_drop)
pg_before = env.postgres.create_start('test_before_dropdb')
env.zenith_cli.create_branch("test_after_dropdb", "test_dropdb@" + lsn_after_drop)
env.zenith_cli.create_branch('test_after_dropdb',
'test_dropdb',
ancestor_start_lsn=lsn_after_drop)
pg_after = env.postgres.create_start('test_after_dropdb')
# Test that database exists on the branch before drop

View File

@@ -9,8 +9,7 @@ from fixtures.log_helper import log
#
def test_createuser(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_createuser", "empty")
env.zenith_cli.create_branch('test_createuser', 'empty')
pg = env.postgres.create_start('test_createuser')
log.info("postgres is running on 'test_createuser' branch")
@@ -25,8 +24,7 @@ def test_createuser(zenith_simple_env: ZenithEnv):
lsn = cur.fetchone()[0]
# Create a branch
env.zenith_cli.create_branch("test_createuser2", "test_createuser@" + lsn)
env.zenith_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn)
pg2 = env.postgres.create_start('test_createuser2')
# Test that you can connect to new branch as a new user

View File

@@ -10,7 +10,7 @@ from fixtures.log_helper import log
#
def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
env.zenith_cli.create_branch("test_multixact", "empty")
env.zenith_cli.create_branch('test_multixact', 'empty')
pg = env.postgres.create_start('test_multixact')
log.info("postgres is running on 'test_multixact' branch")
@@ -60,7 +60,7 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
assert int(next_multixact_id) > int(next_multixact_id_old)
# Branch at this point
env.zenith_cli.create_branch("test_multixact_new", "test_multixact@" + lsn)
env.zenith_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn)
pg_new = env.postgres.create_start('test_multixact_new')
log.info("postgres is running on 'test_multixact_new' branch")

View File

@@ -23,21 +23,25 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID):
client.tenant_create(tenant_id)
assert tenant_id.hex in {t['id'] for t in client.tenant_list()}
# check its timelines
timelines = client.timeline_list(tenant_id)
assert len(timelines) == 0, "initial tenant should not have any timelines"
# create timeline
timeline_id = uuid4()
client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id)
timelines = client.timeline_list(tenant_id)
assert len(timelines) > 0
for timeline_id_str in timelines:
timeline_details = client.timeline_detail(tenant_id, UUID(timeline_id_str))
assert timeline_details['type'] == 'Local'
assert timeline_details['tenant_id'] == tenant_id.hex
assert timeline_details['timeline_id'] == timeline_id_str
# create branch
branch_name = uuid4().hex
client.branch_create(tenant_id, branch_name, "main")
# check it is there
assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}
assert timeline_id.hex in {b['timeline_id'] for b in client.timeline_list(tenant_id)}
for timeline in timelines:
timeline_id_str = str(timeline['timeline_id'])
timeline_details = client.timeline_detail(tenant_id=tenant_id,
timeline_id=UUID(timeline_id_str))
assert timeline_details['kind'] == 'Local'
assert timeline_details['tenant_id'] == tenant_id.hex
assert timeline_details['timeline_id'] == timeline_id_str
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):

View File

@@ -16,7 +16,7 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_pageserver_catchup_while_compute_down", "main")
env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down')
pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down')
pg_conn = pg.connect()

View File

@@ -15,7 +15,7 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_pageserver_restart", "main")
env.zenith_cli.create_branch('test_pageserver_restart')
pg = env.postgres.create_start('test_pageserver_restart')
pg_conn = pg.connect()

View File

@@ -1,7 +1,5 @@
from io import BytesIO
import asyncio
import asyncpg
import subprocess
from fixtures.zenith_fixtures import ZenithEnv, Postgres
from fixtures.log_helper import log

View File

@@ -1,2 +1,15 @@
import pytest
def test_proxy_select_1(static_proxy):
static_proxy.safe_psql("select 1;")
@pytest.mark.xfail # Proxy eats the extra connection options
def test_proxy_options(static_proxy):
schema_name = "tmp_schema_1"
with static_proxy.connect(schema=schema_name) as conn:
with conn.cursor() as cur:
cur.execute("SHOW search_path;")
search_path = cur.fetchall()[0][0]
assert schema_name == search_path

View File

@@ -11,8 +11,7 @@ from fixtures.zenith_fixtures import ZenithEnv
#
def test_readonly_node(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_readonly_node", "empty")
env.zenith_cli.create_branch('test_readonly_node', 'empty')
pgmain = env.postgres.create_start('test_readonly_node')
log.info("postgres is running on 'test_readonly_node' branch")
@@ -53,12 +52,14 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
log.info('LSN after 400100 rows: ' + lsn_c)
# Create first read-only node at the point where only 100 rows were inserted
pg_hundred = env.postgres.create_start("test_readonly_node_hundred",
branch=f'test_readonly_node@{lsn_a}')
pg_hundred = env.postgres.create_start(branch_name='test_readonly_node',
node_name='test_readonly_node_hundred',
lsn=lsn_a)
# And another at the point where 200100 rows were inserted
pg_more = env.postgres.create_start("test_readonly_node_more",
branch=f'test_readonly_node@{lsn_b}')
pg_more = env.postgres.create_start(branch_name='test_readonly_node',
node_name='test_readonly_node_more',
lsn=lsn_b)
# On the 'hundred' node, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -77,8 +78,9 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
assert main_cur.fetchone() == (400100, )
# Check creating a node at segment boundary
pg = env.postgres.create_start("test_branch_segment_boundary",
branch="test_readonly_node@0/3000000")
pg = env.postgres.create_start(branch_name='test_readonly_node',
node_name='test_branch_segment_boundary',
lsn='0/3000000')
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
@@ -86,5 +88,6 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
# Create node at pre-initdb lsn
with pytest.raises(Exception, match="invalid basebackup lsn"):
# compute node startup with invalid LSN should fail
env.zenith_cli.pg_start("test_readonly_node_preinitdb",
timeline_spec="test_readonly_node@0/42")
env.postgres.create_start(branch_name='test_readonly_node',
node_name='test_readonly_node_preinitdb',
lsn='0/42')

View File

@@ -43,7 +43,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
##### First start, insert secret data and upload it to the remote storage
env = zenith_env_builder.init_start()
pg = env.postgres.create_start()
pg = env.postgres.create_start('main')
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
@@ -85,7 +85,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id))
assert timeline_details['timeline_id'] == timeline_id
assert timeline_details['tenant_id'] == tenant_id
if timeline_details['type'] == 'Local':
if timeline_details['kind'] == 'Local':
log.info("timeline downloaded, checking its data")
break
attempts += 1
@@ -94,7 +94,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
log.debug("still waiting")
time.sleep(1)
pg = env.postgres.create_start()
pg = env.postgres.create_start('main')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};')

View File

@@ -15,8 +15,7 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_restart_compute", "main")
env.zenith_cli.create_branch('test_restart_compute')
pg = env.postgres.create_start('test_restart_compute')
log.info("postgres is running on 'test_restart_compute' branch")

View File

@@ -127,16 +127,14 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
# create folder for remote storage mock
remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage'
tenant = env.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
log.info("tenant to relocate %s", tenant)
env.zenith_cli.create_branch("test_tenant_relocation", "main", tenant_id=tenant)
env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant)
tenant_pg = env.postgres.create_start(
"test_tenant_relocation",
"main", # branch name, None means same as node name
tenant_id=tenant,
)
tenant_pg = env.postgres.create_start(branch_name='main',
node_name='test_tenant_relocation',
tenant_id=tenant)
# insert some data
with closing(tenant_pg.connect()) as conn:

View File

@@ -12,25 +12,21 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce
env = zenith_env_builder.init_start()
"""Tests tenants with and without wal acceptors"""
tenant_1 = env.create_tenant()
tenant_2 = env.create_tenant()
tenant_1 = env.zenith_cli.create_tenant()
tenant_2 = env.zenith_cli.create_tenant()
env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
tenant_id=tenant_1)
env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
tenant_id=tenant_2)
env.zenith_cli.create_timeline(
f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_1)
env.zenith_cli.create_timeline(
f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_2)
pg_tenant1 = env.postgres.create_start(
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_1,
f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}',
tenant_id=tenant_1,
)
pg_tenant2 = env.postgres.create_start(
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_2,
f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}',
tenant_id=tenant_2,
)
for pg in [pg_tenant1, pg_tenant2]:

View File

@@ -10,10 +10,10 @@ import time
def test_timeline_size(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Branch at the point where only 100 rows were inserted
env.zenith_cli.create_branch("test_timeline_size", "empty")
new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty')
client = env.pageserver.http_client()
res = client.branch_detail(env.initial_tenant, "test_timeline_size")
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
pgmain = env.postgres.create_start("test_timeline_size")
@@ -31,11 +31,11 @@ def test_timeline_size(zenith_simple_env: ZenithEnv):
FROM generate_series(1, 10) g
""")
res = client.branch_detail(env.initial_tenant, "test_timeline_size")
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
cur.execute("TRUNCATE foo")
res = client.branch_detail(env.initial_tenant, "test_timeline_size")
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
@@ -68,17 +68,16 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60
def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_timeline_size_quota", "main")
new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota')
client = env.pageserver.http_client()
res = client.branch_detail(env.initial_tenant, "test_timeline_size_quota")
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
pgmain = env.postgres.create_start(
"test_timeline_size_quota",
# Set small limit for the test
config_lines=['zenith.max_cluster_size=30MB'],
)
config_lines=['zenith.max_cluster_size=30MB'])
log.info("postgres is running on 'test_timeline_size_quota' branch")
with closing(pgmain.connect()) as conn:

View File

@@ -10,7 +10,6 @@ from fixtures.log_helper import log
def test_twophase(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_twophase", "empty")
pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
log.info("postgres is running on 'test_twophase' branch")

View File

@@ -13,7 +13,7 @@ from dataclasses import dataclass, field
from multiprocessing import Process, Value
from pathlib import Path
from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol
from fixtures.utils import lsn_to_hex, mkdir_if_needed
from fixtures.utils import lsn_to_hex, mkdir_if_needed, lsn_from_hex
from fixtures.log_helper import log
from typing import List, Optional, Any
@@ -24,8 +24,7 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_wal_acceptors_normal_work", "main")
env.zenith_cli.create_branch('test_wal_acceptors_normal_work')
pg = env.postgres.create_start('test_wal_acceptors_normal_work')
with closing(pg.connect()) as conn:
@@ -39,9 +38,9 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
@dataclass
class BranchMetrics:
name: str
latest_valid_lsn: int
class TimelineMetrics:
timeline_id: str
last_record_lsn: int
# One entry per each Safekeeper, order is the same
flush_lsns: List[int] = field(default_factory=list)
commit_lsns: List[int] = field(default_factory=list)
@@ -55,23 +54,32 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
n_timelines = 3
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
branch_names = [
"test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)
]
# pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418')
# that's not really human readable, so the branch names are introduced in Zenith CLI.
# Zenith CLI stores its branch <-> timeline mapping in its internals,
# but we need this to collect metrics from other servers, related to the timeline.
branch_names_to_timeline_ids = {}
# start postgres on each timeline
pgs = []
for branch in branches:
env.zenith_cli.create_branch(branch, "main")
pgs.append(env.postgres.create_start(branch))
for branch_name in branch_names:
new_timeline_id = env.zenith_cli.create_branch(branch_name)
pgs.append(env.postgres.create_start(branch_name))
branch_names_to_timeline_ids[branch_name] = new_timeline_id
tenant_id = env.initial_tenant
def collect_metrics(message: str) -> List[BranchMetrics]:
def collect_metrics(message: str) -> List[TimelineMetrics]:
with env.pageserver.http_client() as pageserver_http:
branch_details = [
pageserver_http.branch_detail(tenant_id=tenant_id, name=branch)
for branch in branches
timeline_details = [
pageserver_http.timeline_detail(
tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name])
for branch_name in branch_names
]
# All changes visible to pageserver (latest_valid_lsn) should be
# All changes visible to pageserver (last_record_lsn) should be
# confirmed by safekeepers first. As we cannot atomically get
# state of both pageserver and safekeepers, we should start with
# pageserver. Looking at outdated data from pageserver is ok.
@@ -80,14 +88,14 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
# safekeepers' state, it will look contradictory.
sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers]
branch_metrics = []
timeline_metrics = []
with env.pageserver.http_client() as pageserver_http:
for branch_detail in branch_details:
timeline_id: str = branch_detail["timeline_id"]
for timeline_detail in timeline_details:
timeline_id: str = timeline_detail["timeline_id"]
m = BranchMetrics(
name=branch_detail["name"],
latest_valid_lsn=branch_detail["latest_valid_lsn"],
m = TimelineMetrics(
timeline_id=timeline_id,
last_record_lsn=lsn_from_hex(timeline_detail["last_record_lsn"]),
)
for sk_m in sk_metrics:
m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)])
@@ -99,13 +107,13 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
# We only call collect_metrics() after a transaction is confirmed by
# the compute node, which only happens after a consensus of safekeepers
# has confirmed the transaction. We assume majority consensus here.
assert (2 * sum(m.latest_valid_lsn <= lsn
assert (2 * sum(m.last_record_lsn <= lsn
for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers)
assert (2 * sum(m.latest_valid_lsn <= lsn
assert (2 * sum(m.last_record_lsn <= lsn
for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers)
branch_metrics.append(m)
log.info(f"{message}: {branch_metrics}")
return branch_metrics
timeline_metrics.append(m)
log.info(f"{message}: {timeline_metrics}")
return timeline_metrics
# TODO: https://github.com/zenithdb/zenith/issues/809
# collect_metrics("before CREATE TABLE")
@@ -117,7 +125,7 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
init_m = collect_metrics("after CREATE TABLE")
# Populate data for 2/3 branches
# Populate data for 2/3 timelines
class MetricsChecker(threading.Thread):
def __init__(self) -> None:
super().__init__(daemon=True)
@@ -155,15 +163,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
collect_metrics("after INSERT INTO")
# Check data for 2/3 branches
# Check data for 2/3 timelines
for pg in pgs[:-1]:
res = pg.safe_psql("SELECT sum(key) FROM t")
assert res[0] == (5000050000, )
final_m = collect_metrics("after SELECT")
# Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly.
# Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly.
# Also assume that safekeepers will not be significantly out of sync in this test.
middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2
middle_lsn = (init_m[0].last_record_lsn + final_m[0].last_record_lsn) // 2
assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns)
assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns)
assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns)
@@ -183,7 +191,7 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = n_acceptors
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_wal_acceptors_restarts", "main")
env.zenith_cli.create_branch('test_wal_acceptors_restarts')
pg = env.postgres.create_start('test_wal_acceptors_restarts')
# we rely upon autocommit after each statement
@@ -220,7 +228,7 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 2
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_wal_acceptors_unavailability", "main")
env.zenith_cli.create_branch('test_wal_acceptors_unavailability')
pg = env.postgres.create_start('test_wal_acceptors_unavailability')
# we rely upon autocommit after each statement
@@ -291,7 +299,7 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_wal_acceptors_race_conditions", "main")
env.zenith_cli.create_branch('test_wal_acceptors_race_conditions')
pg = env.postgres.create_start('test_wal_acceptors_race_conditions')
# we rely upon autocommit after each statement
@@ -456,7 +464,7 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_timeline_status", "main")
env.zenith_cli.create_branch('test_timeline_status')
pg = env.postgres.create_start('test_timeline_status')
wa = env.safekeepers[0]
@@ -630,7 +638,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 4
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_replace_safekeeper", "main")
env.zenith_cli.create_branch('test_replace_safekeeper')
log.info("Use only first 3 safekeepers")
env.safekeepers[3].stop()

View File

@@ -202,7 +202,7 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_wal_acceptors_restarts_under_load", "main")
env.zenith_cli.create_branch('test_wal_acceptors_restarts_under_load')
pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load')
asyncio.run(run_restarts_under_load(pg, env.safekeepers))

View File

@@ -7,52 +7,46 @@ from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserv
from typing import cast
def helper_compare_branch_list(pageserver_http_client: ZenithPageserverHttpClient,
env: ZenithEnv,
initial_tenant: uuid.UUID):
def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpClient,
env: ZenithEnv,
initial_tenant: uuid.UUID):
"""
Compare branches list returned by CLI and directly via API.
Filters out branches created by other tests.
Compare timelines list returned by CLI and directly via API.
Filters out timelines created by other tests.
"""
branches = pageserver_http_client.branch_list(initial_tenant)
branches_api = sorted(map(lambda b: cast(str, b['name']), branches))
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]
res = env.zenith_cli.list_branches()
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
timelines_api = sorted(
map(lambda t: cast(str, t['timeline_id']),
pageserver_http_client.timeline_list(initial_tenant)))
res = env.zenith_cli.list_branches(tenant_id=initial_tenant)
branches_cli_with_tenant_arg = sorted(
map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli_with_tenant_arg = [
b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')
]
timelines_cli = env.zenith_cli.list_timelines()
assert timelines_cli == env.zenith_cli.list_timelines(initial_tenant)
assert branches_api == branches_cli == branches_cli_with_tenant_arg
cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli])
assert timelines_api == cli_timeline_ids
def test_cli_branch_list(zenith_simple_env: ZenithEnv):
def test_cli_timeline_list(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
pageserver_http_client = env.pageserver.http_client()
# Initial sanity check
helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant)
env.zenith_cli.create_branch("test_cli_branch_list_main", "empty")
helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant)
helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant)
# Create a branch for us
main_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_main')
helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant)
# Create a nested branch
res = env.zenith_cli.create_branch("test_cli_branch_list_nested", "test_cli_branch_list_main")
assert res.stderr == ''
helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant)
nested_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_nested',
'test_cli_branch_list_main')
helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant)
# Check that all new branches are visible via CLI
res = env.zenith_cli.list_branches()
assert res.stderr == ''
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
timelines_cli = [timeline_id for (_, timeline_id) in env.zenith_cli.list_timelines()]
assert 'test_cli_branch_list_main' in branches_cli
assert 'test_cli_branch_list_nested' in branches_cli
assert main_timeline_id.hex in timelines_cli
assert nested_timeline_id.hex in timelines_cli
def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClient, env: ZenithEnv):
@@ -60,7 +54,6 @@ def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClien
tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants))
res = env.zenith_cli.list_tenants()
assert res.stderr == ''
tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert tenants_api == tenants_cli
@@ -73,15 +66,13 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
helper_compare_tenant_list(pageserver_http_client, env)
# Create new tenant
tenant1 = uuid.uuid4()
env.zenith_cli.create_tenant(tenant1)
tenant1 = env.zenith_cli.create_tenant()
# check tenant1 appeared
helper_compare_tenant_list(pageserver_http_client, env)
# Create new tenant
tenant2 = uuid.uuid4()
env.zenith_cli.create_tenant(tenant2)
tenant2 = env.zenith_cli.create_tenant()
# check tenant2 appeared
helper_compare_tenant_list(pageserver_http_client, env)

View File

@@ -64,9 +64,8 @@ class ZenithCompare(PgCompare):
self._pg_bin = pg_bin
# We only use one branch and one timeline
self.branch = branch_name
self.env.zenith_cli.create_branch(self.branch, "empty")
self._pg = self.env.postgres.create_start(self.branch)
self.env.zenith_cli.create_branch(branch_name, 'empty')
self._pg = self.env.postgres.create_start(branch_name)
self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0]
# Long-lived cursor, useful for flushing

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from dataclasses import dataclass, field
from dataclasses import field
import textwrap
from cached_property import cached_property
import asyncpg
@@ -29,7 +29,6 @@ from dataclasses import dataclass
from psycopg2.extensions import connection as PgConnection
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple
from typing_extensions import Literal
import pytest
import requests
import backoff # type: ignore
@@ -58,6 +57,7 @@ Fn = TypeVar('Fn', bound=Callable[..., Any])
DEFAULT_OUTPUT_DIR = 'test_output'
DEFAULT_POSTGRES_DIR = 'tmp_install'
DEFAULT_BRANCH_NAME = 'main'
BASE_PORT = 15000
WORKER_PORT_NUM = 100
@@ -219,7 +219,7 @@ def can_bind(host: str, port: int) -> bool:
class PortDistributor:
def __init__(self, base_port: int, port_number: int) -> None:
def __init__(self, base_port: int, port_number: int):
self.iterator = iter(range(base_port, base_port + port_number))
def get_port(self) -> int:
@@ -242,15 +242,20 @@ class PgProtocol:
host: str,
port: int,
username: Optional[str] = None,
password: Optional[str] = None):
password: Optional[str] = None,
dbname: Optional[str] = None,
schema: Optional[str] = None):
self.host = host
self.port = port
self.username = username
self.password = password
self.dbname = dbname
self.schema = schema
def connstr(self,
*,
dbname: str = 'postgres',
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None) -> str:
"""
@@ -259,6 +264,8 @@ class PgProtocol:
username = username or self.username
password = password or self.password
dbname = dbname or self.dbname or "postgres"
schema = schema or self.schema
res = f'host={self.host} port={self.port} dbname={dbname}'
if username:
@@ -267,13 +274,17 @@ class PgProtocol:
if password:
res = f'{res} password={password}'
if schema:
res = f"{res} options='-c search_path={schema}'"
return res
# autocommit=True here by default because that's what we need most of the time
def connect(self,
*,
autocommit=True,
dbname: str = 'postgres',
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None) -> PgConnection:
"""
@@ -282,11 +293,13 @@ class PgProtocol:
This method passes all extra params to connstr.
"""
conn = psycopg2.connect(self.connstr(
dbname=dbname,
username=username,
password=password,
))
conn = psycopg2.connect(
self.connstr(
dbname=dbname,
schema=schema,
username=username,
password=password,
))
# WARNING: this setting affects *all* tests!
conn.autocommit = autocommit
return conn
@@ -411,7 +424,8 @@ class ZenithEnvBuilder:
pageserver_config_override: Optional[str] = None,
num_safekeepers: int = 0,
pageserver_auth_enabled: bool = False,
rust_log_override: Optional[str] = None):
rust_log_override: Optional[str] = None,
default_branch_name=DEFAULT_BRANCH_NAME):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
self.port_distributor = port_distributor
@@ -419,6 +433,7 @@ class ZenithEnvBuilder:
self.pageserver_config_override = pageserver_config_override
self.num_safekeepers = num_safekeepers
self.pageserver_auth_enabled = pageserver_auth_enabled
self.default_branch_name = default_branch_name
self.env: Optional[ZenithEnv] = None
self.s3_mock_server: Optional[MockS3Server] = None
@@ -523,7 +538,7 @@ class ZenithEnv:
initial_tenant - tenant ID of the initial tenant created in the repository
zenith_cli() - zenith_cli() can be used to run the 'zenith' CLI tool
zenith_cli - can be used to run the 'zenith' CLI tool
create_tenant() - initializes a new tenant in the page server, returns
the tenant id
@@ -534,9 +549,7 @@ class ZenithEnv:
self.port_distributor = config.port_distributor
self.s3_mock_server = config.s3_mock_server
self.zenith_cli = ZenithCli(env=self)
self.postgres = PostgresFactory(self)
self.safekeepers: List[Safekeeper] = []
# generate initial tenant ID here instead of letting 'zenith init' generate it,
@@ -545,7 +558,7 @@ class ZenithEnv:
# Create a config file corresponding to the options
toml = textwrap.dedent(f"""
default_tenantid = '{self.initial_tenant.hex}'
default_tenant_id = '{self.initial_tenant.hex}'
""")
# Create config for pageserver
@@ -587,7 +600,6 @@ class ZenithEnv:
self.safekeepers.append(safekeeper)
log.info(f"Config: {toml}")
self.zenith_cli.init(toml)
def start(self):
@@ -601,12 +613,6 @@ class ZenithEnv:
""" Get list of safekeeper endpoints suitable for wal_acceptors GUC """
return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers])
def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
if tenant_id is None:
tenant_id = uuid.uuid4()
self.zenith_cli.create_tenant(tenant_id)
return tenant_id
@cached_property
def auth_keys(self) -> AuthKeys:
pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes()
@@ -630,13 +636,11 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]:
shutil.rmtree(repo_dir, ignore_errors=True)
with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder:
env = builder.init_start()
# For convenience in tests, create a branch from the freshly-initialized cluster.
env.zenith_cli.create_branch("empty", "main")
env.zenith_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME)
# Hand the initialized environment to the caller
yield env
@@ -685,7 +689,7 @@ class ZenithPageserverApiException(Exception):
class ZenithPageserverHttpClient(requests.Session):
def __init__(self, port: int, auth_token: Optional[str] = None) -> None:
def __init__(self, port: int, auth_token: Optional[str] = None):
super().__init__()
self.port = port
self.auth_token = auth_token
@@ -708,38 +712,36 @@ class ZenithPageserverHttpClient(requests.Session):
def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
res = self.post(
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/attach", )
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/attach",
)
self.verbose_error(res)
def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
res = self.post(
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/detach", )
self.verbose_error(res)
def branch_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def branch_create(self, tenant_id: uuid.UUID, name: str, start_point: str) -> Dict[Any, Any]:
res = self.post(f"http://localhost:{self.port}/v1/branch",
json={
'tenant_id': tenant_id.hex,
'name': name,
'start_point': start_point,
})
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict[Any, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}?include-non-incremental-logical-size=1",
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/detach",
)
self.verbose_error(res)
def timeline_create(
    self,
    tenant_id: uuid.UUID,
    new_timeline_id: Optional[uuid.UUID] = None,
    ancestor_timeline_id: Optional[uuid.UUID] = None,
    ancestor_start_lsn: Optional[str] = None,
) -> Dict[Any, Any]:
    """
    POSTs a timeline creation request for `tenant_id` to the pageserver HTTP API
    and returns the decoded JSON object from the response.

    Every optional argument that is omitted is sent as a JSON null; ids are
    sent in their hex form.
    """
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline"
    request_body = {
        'new_timeline_id': new_timeline_id.hex if new_timeline_id else None,
        'ancestor_start_lsn': ancestor_start_lsn,
        'ancestor_timeline_id': ancestor_timeline_id.hex if ancestor_timeline_id else None,
    }

    response = self.post(url, json=request_body)
    self.verbose_error(response)

    # NOTE(review): this branch is only reachable if verbose_error() lets a 409
    # response through without raising -- confirm against its implementation.
    if response.status_code == 409:
        raise Exception(f'could not create timeline: already exists for id {new_timeline_id}')

    timeline_info = response.json()
    assert isinstance(timeline_info, dict)
    return timeline_info
@@ -751,18 +753,22 @@ class ZenithPageserverHttpClient(requests.Session):
assert isinstance(res_json, list)
return res_json
def tenant_create(self, tenant_id: uuid.UUID):
def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
res = self.post(
f"http://localhost:{self.port}/v1/tenant",
json={
'tenant_id': tenant_id.hex,
'new_tenant_id': new_tenant_id.hex if new_tenant_id else None,
},
)
self.verbose_error(res)
return res.json()
if res.status_code == 409:
raise Exception(f'could not create tenant: already exists for id {new_tenant_id}')
new_tenant_id = res.json()
assert isinstance(new_tenant_id, str)
return uuid.UUID(new_tenant_id)
def timeline_list(self, tenant_id: uuid.UUID) -> List[str]:
res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}")
def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
@@ -770,7 +776,8 @@ class ZenithPageserverHttpClient(requests.Session):
def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}")
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1"
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
@@ -804,54 +811,124 @@ class S3Storage:
RemoteStorage = Union[LocalFsStorage, S3Storage]
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
re.MULTILINE)
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
re.MULTILINE)
TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]",
re.MULTILINE)
class ZenithCli:
"""
A typed wrapper around the `zenith` CLI tool.
Supports main commands via typed methods and a way to run arbitrary command directly via CLI.
"""
def __init__(self, env: ZenithEnv) -> None:
def __init__(self, env: ZenithEnv):
self.env = env
pass
def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
"""
Creates a new tenant and returns its id.
"""
if tenant_id is None:
tenant_id = uuid.uuid4()
self.raw_cli(['tenant', 'create', tenant_id.hex])
res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex])
res.check_returncode()
return tenant_id
def list_tenants(self) -> 'subprocess.CompletedProcess[str]':
return self.raw_cli(['tenant', 'list'])
res = self.raw_cli(['tenant', 'list'])
res.check_returncode()
return res
def create_timeline(self,
                    new_branch_name: str,
                    tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
    """
    Runs `zenith timeline create` for `new_branch_name` and returns the id of
    the created timeline, parsed out of the command's stdout.

    When `tenant_id` is omitted, the environment's initial tenant is used.
    A non-zero exit code is surfaced through `check_returncode()`.
    Raises an Exception if the output contains no "Created timeline '<id>'" line.
    """
    cmd = [
        'timeline',
        'create',
        '--branch-name',
        new_branch_name,
        '--tenant-id',
        (tenant_id or self.env.initial_tenant).hex,
    ]

    res = self.raw_cli(cmd)
    res.check_returncode()

    matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
    if matches is None:
        # Without this guard uuid.UUID(None) would fail with an opaque TypeError;
        # report the actual problem, the same way create_branch does.
        raise Exception('could not find timeline id after `zenith timeline create` invocation')

    return uuid.UUID(matches.group('timeline_id'))
def create_branch(self,
branch_name: str,
starting_point: str,
tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]':
args = ['branch']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
args.extend([branch_name, starting_point])
new_branch_name: str = DEFAULT_BRANCH_NAME,
ancestor_branch_name: Optional[str] = None,
tenant_id: Optional[uuid.UUID] = None,
ancestor_start_lsn: Optional[str] = None) -> uuid.UUID:
cmd = [
'timeline',
'branch',
'--branch-name',
new_branch_name,
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
]
if ancestor_branch_name is not None:
cmd.extend(['--ancestor-branch-name', ancestor_branch_name])
if ancestor_start_lsn is not None:
cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn])
return self.raw_cli(args)
res = self.raw_cli(cmd)
res.check_returncode()
def list_branches(self,
tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]':
args = ['branch']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
return self.raw_cli(args)
matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
def init(self, config_toml: str) -> 'subprocess.CompletedProcess[str]':
created_timeline_id = None
if matches is not None:
created_timeline_id = matches.group('timeline_id')
if created_timeline_id is None:
raise Exception('could not find timeline id after `zenith timeline create` invocation')
else:
return uuid.UUID(created_timeline_id)
def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[Tuple[str, str]]:
    """
    Returns a sorted list of (branch_name, timeline_id) tuples parsed out of
    `zenith timeline list` CLI output.

    When `tenant_id` is omitted, the environment's initial tenant is used.
    A non-zero exit code is surfaced through `check_returncode()` instead of
    being parsed as an empty listing.
    """
    # Example of the output being parsed:
    # (L) main [b49f7954224a0ad25cc0013ea107b54b]
    # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540]
    res = self.raw_cli(
        ['timeline', 'list', '--tenant-id', (tenant_id or self.env.initial_tenant).hex])
    res.check_returncode()

    # findall() already yields one (branch_name, timeline_id) tuple per match,
    # in group order, so the matches can be sorted directly.
    return sorted(TIMELINE_DATA_EXTRACTOR.findall(res.stdout))
def init(self,
config_toml: str,
initial_timeline_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]':
with tempfile.NamedTemporaryFile(mode='w+') as tmp:
tmp.write(config_toml)
tmp.flush()
cmd = ['init', f'--config={tmp.name}']
if initial_timeline_id:
cmd.extend(['--timeline-id', initial_timeline_id.hex])
append_pageserver_param_overrides(cmd,
self.env.pageserver.remote_storage,
self.env.pageserver.config_override)
return self.raw_cli(cmd)
res = self.raw_cli(cmd)
res.check_returncode()
return res
def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]':
start_args = ['pageserver', 'start', *overrides]
@@ -883,38 +960,54 @@ class ZenithCli:
def pg_create(
self,
node_name: str,
branch_name: str,
node_name: Optional[str] = None,
tenant_id: Optional[uuid.UUID] = None,
timeline_spec: Optional[str] = None,
lsn: Optional[str] = None,
port: Optional[int] = None,
) -> 'subprocess.CompletedProcess[str]':
args = ['pg', 'create']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
args = [
'pg',
'create',
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
'--branch-name',
branch_name,
]
if lsn is not None:
args.extend(['--lsn', lsn])
if port is not None:
args.append(f'--port={port}')
args.append(node_name)
if timeline_spec is not None:
args.append(timeline_spec)
return self.raw_cli(args)
args.extend(['--port', str(port)])
if node_name is not None:
args.append(node_name)
res = self.raw_cli(args)
res.check_returncode()
return res
def pg_start(
self,
node_name: str,
tenant_id: Optional[uuid.UUID] = None,
timeline_spec: Optional[str] = None,
lsn: Optional[str] = None,
port: Optional[int] = None,
) -> 'subprocess.CompletedProcess[str]':
args = ['pg', 'start']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
args = [
'pg',
'start',
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
]
if lsn is not None:
args.append(f'--lsn={lsn}')
if port is not None:
args.append(f'--port={port}')
args.append(node_name)
if timeline_spec is not None:
args.append(timeline_spec)
if node_name is not None:
args.append(node_name)
return self.raw_cli(args)
res = self.raw_cli(args)
res.check_returncode()
return res
def pg_stop(
self,
@@ -922,12 +1015,16 @@ class ZenithCli:
tenant_id: Optional[uuid.UUID] = None,
destroy=False,
) -> 'subprocess.CompletedProcess[str]':
args = ['pg', 'stop']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
args = [
'pg',
'stop',
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
]
if destroy:
args.append('--destroy')
args.append(node_name)
if node_name is not None:
args.append(node_name)
return self.raw_cli(args)
@@ -1002,8 +1099,7 @@ class ZenithPageserver(PgProtocol):
env: ZenithEnv,
port: PageserverPort,
remote_storage: Optional[RemoteStorage] = None,
config_override: Optional[str] = None,
enable_auth=False):
config_override: Optional[str] = None):
super().__init__(host='localhost', port=port.pg, username='zenith_admin')
self.env = env
self.running = False
@@ -1031,7 +1127,6 @@ class ZenithPageserver(PgProtocol):
if self.running:
self.env.zenith_cli.pageserver_stop(immediate)
self.running = False
return self
def __enter__(self):
@@ -1092,7 +1187,7 @@ class PgBin:
self.env = os.environ.copy()
self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib')
def _fixpath(self, command: List[str]) -> None:
def _fixpath(self, command: List[str]):
if '/' not in command[0]:
command[0] = os.path.join(self.pg_bin_path, command[0])
@@ -1103,7 +1198,7 @@ class PgBin:
env.update(env_add)
return env
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None) -> None:
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
"""
Run one of the postgres binaries.
@@ -1153,18 +1248,18 @@ class VanillaPostgres(PgProtocol):
self.running = False
self.pg_bin.run_capture(['initdb', '-D', pgdatadir])
def configure(self, options: List[str]) -> None:
def configure(self, options: List[str]):
"""Append lines into postgresql.conf file."""
assert not self.running
with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file:
conf_file.writelines(options)
def start(self) -> None:
def start(self):
assert not self.running
self.running = True
self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'start'])
def stop(self) -> None:
def stop(self):
assert self.running
self.running = False
self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'stop'])
@@ -1247,8 +1342,9 @@ class Postgres(PgProtocol):
def create(
self,
node_name: str,
branch: Optional[str] = None,
branch_name: str,
node_name: Optional[str] = None,
lsn: Optional[str] = None,
config_lines: Optional[List[str]] = None,
) -> 'Postgres':
"""
@@ -1259,19 +1355,21 @@ class Postgres(PgProtocol):
if not config_lines:
config_lines = []
if branch is None:
branch = node_name
self.env.zenith_cli.pg_create(node_name,
self.node_name = node_name or f'{branch_name}_pg_node'
self.env.zenith_cli.pg_create(branch_name,
node_name=self.node_name,
tenant_id=self.tenant_id,
port=self.port,
timeline_spec=branch)
self.node_name = node_name
lsn=lsn,
port=self.port)
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name
self.pgdata_dir = os.path.join(self.env.repo_dir, path)
if config_lines is None:
config_lines = []
# set small 'max_replication_write_lag' to enable backpressure
# and make tests more stable.
config_lines = ['max_replication_write_lag=15MB'] + config_lines
self.config(config_lines)
return self
@@ -1358,7 +1456,7 @@ class Postgres(PgProtocol):
if self.running:
assert self.node_name is not None
self.env.zenith_cli.pg_stop(self.node_name, tenant_id=self.tenant_id)
self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id)
self.running = False
return self
@@ -1370,15 +1468,16 @@ class Postgres(PgProtocol):
"""
assert self.node_name is not None
self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, destroy=True)
self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True)
self.node_name = None
return self
def create_start(
self,
node_name: str,
branch: Optional[str] = None,
branch_name: str,
node_name: Optional[str] = None,
lsn: Optional[str] = None,
config_lines: Optional[List[str]] = None,
) -> 'Postgres':
"""
@@ -1388,9 +1487,10 @@ class Postgres(PgProtocol):
"""
self.create(
branch_name=branch_name,
node_name=node_name,
branch=branch,
config_lines=config_lines,
lsn=lsn,
).start()
return self
@@ -1410,9 +1510,10 @@ class PostgresFactory:
self.instances: List[Postgres] = []
def create_start(self,
node_name: str = "main",
branch: Optional[str] = None,
branch_name: str,
node_name: Optional[str] = None,
tenant_id: Optional[uuid.UUID] = None,
lsn: Optional[str] = None,
config_lines: Optional[List[str]] = None) -> Postgres:
pg = Postgres(
@@ -1424,15 +1525,17 @@ class PostgresFactory:
self.instances.append(pg)
return pg.create_start(
branch_name=branch_name,
node_name=node_name,
branch=branch,
config_lines=config_lines,
lsn=lsn,
)
def create(self,
node_name: str = "main",
branch: Optional[str] = None,
branch_name: str,
node_name: Optional[str] = None,
tenant_id: Optional[uuid.UUID] = None,
lsn: Optional[str] = None,
config_lines: Optional[List[str]] = None) -> Postgres:
pg = Postgres(
@@ -1445,8 +1548,9 @@ class PostgresFactory:
self.instances.append(pg)
return pg.create(
branch_name=branch_name,
node_name=node_name,
branch=branch,
lsn=lsn,
config_lines=config_lines,
)
@@ -1549,7 +1653,7 @@ class SafekeeperMetrics:
class SafekeeperHttpClient(requests.Session):
def __init__(self, port: int) -> None:
def __init__(self, port: int):
super().__init__()
self.port = port
@@ -1667,7 +1771,7 @@ def list_files_to_compare(pgdata_dir: str):
# pg is the existing and running compute node, that we want to compare with a basebackup
def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres):
# Get the timeline ID of our branch. We need it for the 'basebackup' command
# Get the timeline ID. We need it for the 'basebackup' command
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SHOW zenith.zenith_timeline")

View File

@@ -30,21 +30,16 @@ def test_bulk_tenant_create(
for i in range(tenants_count):
start = timeit.default_timer()
tenant = env.create_tenant()
env.zenith_cli.create_branch(
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
"main",
tenant_id=tenant)
tenant = env.zenith_cli.create_tenant()
env.zenith_cli.create_timeline(
f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant)
# FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now?
#if use_wal_acceptors == 'with_wa':
# wa_factory.start_n_new(3)
pg_tenant = env.postgres.create_start(
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
None, # branch name, None means same as node name
tenant,
)
f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant)
end = timeit.default_timer()
time_slices.append(end - start)

View File

@@ -1,4 +1,4 @@
use anyhow::{bail, Context, Result};
use anyhow::{anyhow, bail, Context, Result};
use clap::{App, AppSettings, Arg, ArgMatches};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env;
@@ -9,7 +9,7 @@ use pageserver::config::defaults::{
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
};
use std::collections::HashMap;
use std::collections::{BTreeSet, HashMap};
use std::process::exit;
use std::str::FromStr;
use walkeeper::defaults::{
@@ -17,15 +17,17 @@ use walkeeper::defaults::{
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
use zenith_utils::auth::{Claims, Scope};
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId};
use zenith_utils::GIT_VERSION;
use pageserver::branches::BranchInfo;
use pageserver::timelines::TimelineInfo;
// Default id of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1);
const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
fn default_conf() -> String {
format!(
@@ -53,13 +55,15 @@ http_port = {safekeeper_http_port}
}
///
/// Branches tree element used as a value in the HashMap.
/// Timelines tree element used as a value in the HashMap.
///
struct BranchTreeEl {
/// `BranchInfo` received from the `pageserver` via the `branch_list` libpq API call.
pub info: BranchInfo,
/// Holds all direct children of this branch referenced using `timeline_id`.
pub children: Vec<String>,
struct TimelineTreeEl {
/// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call.
pub info: TimelineInfo,
/// Name, recovered from zenith config mappings
pub name: Option<String>,
/// Holds all direct children of this timeline referenced using `timeline_id`.
pub children: BTreeSet<ZTimelineId>,
}
// Main entry point for the 'zenith' CLI utility
@@ -70,29 +74,28 @@ struct BranchTreeEl {
// * Providing CLI api to the pageserver
// * TODO: export/import to/from usual postgres
fn main() -> Result<()> {
#[rustfmt::skip] // rustfmt squashes these into a single line otherwise
let pg_node_arg = Arg::new("node")
.index(1)
.help("Node name")
.required(true);
#[rustfmt::skip]
let safekeeper_id_arg = Arg::new("id")
.index(1)
.help("safekeeper id")
let branch_name_arg = Arg::new("branch-name")
.long("branch-name")
.takes_value(true)
.help("Name of the branch to be created or used as an alias for other services")
.required(false);
let timeline_arg = Arg::new("timeline")
.index(2)
.help("Branch name or a point-in time specification")
.required(false);
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
let tenantid_arg = Arg::new("tenantid")
.long("tenantid")
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
let tenant_id_arg = Arg::new("tenant-id")
.long("tenant-id")
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
.takes_value(true)
.required(false);
let timeline_id_arg = Arg::new("timeline-id")
.long("timeline-id")
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
.takes_value(true)
.required(false);
let port_arg = Arg::new("port")
.long("port")
.required(false)
@@ -114,6 +117,12 @@ fn main() -> Result<()> {
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
.takes_value(true)
.required(false);
let matches = App::new("Zenith CLI")
.setting(AppSettings::ArgRequiredElseHelp)
.version(GIT_VERSION)
@@ -121,6 +130,7 @@ fn main() -> Result<()> {
App::new("init")
.about("Initialize a new Zenith repository")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
@@ -129,17 +139,32 @@ fn main() -> Result<()> {
)
)
.subcommand(
App::new("branch")
.about("Create a new branch")
.arg(Arg::new("branchname").required(false).index(1))
.arg(Arg::new("start-point").required(false).index(2))
.arg(tenantid_arg.clone()),
App::new("timeline")
.about("Manage timelines")
.subcommand(App::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
.subcommand(App::new("branch")
.about("Create a new timeline, using another timeline as a base, copying its data")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true)
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true)
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
.subcommand(App::new("create")
.about("Create a new blank timeline")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone()))
).subcommand(
App::new("tenant")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage tenants")
.subcommand(App::new("list"))
.subcommand(App::new("create").arg(Arg::new("tenantid").required(false).index(1)))
.subcommand(App::new("create")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
)
)
.subcommand(
App::new("pageserver")
@@ -174,12 +199,13 @@ fn main() -> Result<()> {
App::new("pg")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage postgres instances")
.subcommand(App::new("list").arg(tenantid_arg.clone()))
.subcommand(App::new("list").arg(tenant_id_arg.clone()))
.subcommand(App::new("create")
.about("Create a postgres compute node")
.arg(pg_node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone())
.arg(
Arg::new("config-only")
@@ -190,20 +216,21 @@ fn main() -> Result<()> {
.subcommand(App::new("start")
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
.arg(pg_node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(timeline_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone()))
.subcommand(
App::new("stop")
.arg(pg_node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
.long("destroy")
.required(false)
)
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
.long("destroy")
.required(false)
)
)
)
@@ -225,75 +252,89 @@ fn main() -> Result<()> {
};
// Check for 'zenith init' command first.
let subcmd_result = if sub_name == "init" {
handle_init(sub_args)
let subcommand_result = if sub_name == "init" {
handle_init(sub_args).map(Some)
} else {
// all other commands need an existing config
let env = match LocalEnv::load_config() {
Ok(conf) => conf,
Err(e) => {
eprintln!("Error loading config: {}", e);
exit(1);
}
};
let mut env = LocalEnv::load_config().context("Error loading config")?;
let original_env = env.clone();
match sub_name {
"tenant" => handle_tenant(sub_args, &env),
"branch" => handle_branch(sub_args, &env),
let subcommand_result = match sub_name {
"tenant" => handle_tenant(sub_args, &mut env),
"timeline" => handle_timeline(sub_args, &mut env),
"start" => handle_start_all(sub_args, &env),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => handle_pageserver(sub_args, &env),
"pg" => handle_pg(sub_args, &env),
"safekeeper" => handle_safekeeper(sub_args, &env),
_ => bail!("unexpected subcommand {}", sub_name),
};
if original_env != env {
subcommand_result.map(|()| Some(env))
} else {
subcommand_result.map(|()| None)
}
};
if let Err(e) = subcmd_result {
eprintln!("command failed: {:#}", e);
exit(1);
}
match subcommand_result {
Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
Ok(None) => (),
Err(e) => {
eprintln!("command failed: {:?}", e);
exit(1);
}
}
Ok(())
}
///
/// Prints branches list as a tree-like structure.
/// Prints timelines list as a tree-like structure.
///
fn print_branches_tree(branches: Vec<BranchInfo>) -> Result<()> {
let mut branches_hash: HashMap<String, BranchTreeEl> = HashMap::new();
fn print_timelines_tree(
timelines: Vec<TimelineInfo>,
mut timeline_name_mappings: HashMap<ZTenantTimelineId, String>,
) -> Result<()> {
let mut timelines_hash = timelines
.iter()
.map(|t| {
(
t.timeline_id(),
TimelineTreeEl {
info: t.clone(),
children: BTreeSet::new(),
name: timeline_name_mappings
.remove(&ZTenantTimelineId::new(t.tenant_id(), t.timeline_id())),
},
)
})
.collect::<HashMap<_, _>>();
// Form a hash table of branch timeline_id -> BranchTreeEl.
for branch in &branches {
branches_hash.insert(
branch.timeline_id.to_string(),
BranchTreeEl {
info: branch.clone(),
children: Vec::new(),
},
);
}
// Memorize all direct children of each branch.
for branch in &branches {
if let Some(tid) = &branch.ancestor_id {
branches_hash
// Memorize all direct children of each timeline.
for timeline in &timelines {
if let TimelineInfo::Local {
ancestor_timeline_id: Some(tid),
..
} = timeline
{
timelines_hash
.get_mut(tid)
.context("missing branch info in the HashMap")?
.context("missing timeline info in the HashMap")?
.children
.push(branch.timeline_id.to_string());
.insert(timeline.timeline_id());
}
}
// Sort children by tid to bring some minimal order.
for branch in &mut branches_hash.values_mut() {
branch.children.sort();
}
for branch in branches_hash.values() {
// Start with root branches (no ancestors) first.
// Now there is 'main' branch only, but things may change.
if branch.info.ancestor_id.is_none() {
print_branch(0, &Vec::from([true]), branch, &branches_hash)?;
for timeline in timelines_hash.values() {
// Start with root local timelines (no ancestors) first.
if let TimelineInfo::Local {
ancestor_timeline_id,
..
} = &timeline.info
{
if ancestor_timeline_id.is_none() {
print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?;
}
}
}
@@ -301,27 +342,32 @@ fn print_branches_tree(branches: Vec<BranchInfo>) -> Result<()> {
}
///
/// Recursively prints branch info with all its children.
/// Recursively prints timeline info with all its children.
///
fn print_branch(
fn print_timeline(
nesting_level: usize,
is_last: &[bool],
branch: &BranchTreeEl,
branches: &HashMap<String, BranchTreeEl>,
timeline: &TimelineTreeEl,
timelines: &HashMap<ZTimelineId, TimelineTreeEl>,
) -> Result<()> {
let local_or_remote = match timeline.info {
TimelineInfo::Local { .. } => "(L)",
TimelineInfo::Remote { .. } => "(R)",
};
// Draw main padding
print!(" ");
print!("{} ", local_or_remote);
if nesting_level > 0 {
let lsn = branch
.info
.ancestor_lsn
.as_ref()
.context("missing branch info in the HashMap")?;
let lsn_string = match &timeline.info {
TimelineInfo::Local { ancestor_lsn, .. } => ancestor_lsn
.map(|lsn| lsn.to_string())
.unwrap_or_else(|| "Unknown local Lsn".to_string()),
TimelineInfo::Remote { .. } => "unknown Lsn (remote)".to_string(),
};
let mut br_sym = "┣━";
// Draw each nesting padding with proper style
// depending on whether its branch ended or not.
// depending on whether its timeline ended or not.
if nesting_level > 1 {
for l in &is_last[1..is_last.len() - 1] {
if *l {
@@ -332,73 +378,92 @@ fn print_branch(
}
}
// We are the last in this sub-branch
// We are the last in this sub-timeline
if *is_last.last().unwrap() {
br_sym = "┗━";
}
print!("{} @{}: ", br_sym, lsn);
print!("{} @{}: ", br_sym, lsn_string);
}
// Finally print a branch name with new line
println!("{}", branch.info.name);
// Finally print a timeline id and name with new line
println!(
"{} [{}]",
timeline.name.as_deref().unwrap_or("_no_name_"),
timeline.info.timeline_id()
);
let len = branch.children.len();
let len = timeline.children.len();
let mut i: usize = 0;
let mut is_last_new = Vec::from(is_last);
is_last_new.push(false);
for child in &branch.children {
for child in &timeline.children {
i += 1;
// Mark that the last padding is the end of the branch
// Mark that the last padding is the end of the timeline
if i == len {
if let Some(last) = is_last_new.last_mut() {
*last = true;
}
}
print_branch(
print_timeline(
nesting_level + 1,
&is_last_new,
branches
timelines
.get(child)
.context("missing branch info in the HashMap")?,
branches,
.context("missing timeline info in the HashMap")?,
timelines,
)?;
}
Ok(())
}
/// Returns a map of timeline IDs to branch_name@lsn strings.
/// Returns a map of timeline IDs to the corresponding `TimelineInfo`.
/// Connects to the pageserver to query this information.
fn get_branch_infos(
fn get_timeline_infos(
env: &local_env::LocalEnv,
tenantid: &ZTenantId,
) -> Result<HashMap<ZTimelineId, BranchInfo>> {
let page_server = PageServerNode::from_env(env);
let branch_infos: Vec<BranchInfo> = page_server.branch_list(tenantid)?;
let branch_infos: HashMap<ZTimelineId, BranchInfo> = branch_infos
tenant_id: &ZTenantId,
) -> Result<HashMap<ZTimelineId, TimelineInfo>> {
Ok(PageServerNode::from_env(env)
.timeline_list(tenant_id)?
.into_iter()
.map(|branch_info| (branch_info.timeline_id, branch_info))
.collect();
Ok(branch_infos)
.map(|timeline_info| (timeline_info.timeline_id(), timeline_info))
.collect())
}
// Helper function to parse --tenantid option, or get the default from config file
fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<ZTenantId> {
if let Some(tenantid_cmd) = sub_match.value_of("tenantid") {
Ok(ZTenantId::from_str(tenantid_cmd)?)
} else if let Some(tenantid_conf) = env.default_tenantid {
// Helper function to parse the --tenant-id option, or get the default from the config file
fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<ZTenantId> {
if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() {
tenant_id_from_arguments
} else if let Some(tenantid_conf) = env.default_tenant_id {
Ok(ZTenantId::from(tenantid_conf))
} else {
bail!("No tenantid. Use --tenantid, or set 'default_tenantid' in the config file");
bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
}
}
fn handle_init(init_match: &ArgMatches) -> Result<()> {
/// Reads the optional `tenant-id` argument from the given matches.
///
/// Returns `Ok(None)` when the argument is absent, `Ok(Some(id))` when it is
/// present and parses as a `ZTenantId`, and an error carrying extra context
/// when it is present but cannot be parsed.
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<ZTenantId>> {
    match sub_match.value_of("tenant-id") {
        // Argument not given: not an error, the caller decides on a fallback.
        None => Ok(None),
        Some(raw_tenant_id) => ZTenantId::from_str(raw_tenant_id)
            .map(Some)
            .context("Failed to parse tenant id from the argument string"),
    }
}
/// Reads the optional `timeline-id` argument from the given matches.
///
/// Returns `Ok(None)` when the argument is absent, `Ok(Some(id))` when it is
/// present and parses as a `ZTimelineId`, and an error carrying extra context
/// when it is present but cannot be parsed.
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<ZTimelineId>> {
    match sub_match.value_of("timeline-id") {
        // Argument not given: not an error, the caller decides on a fallback.
        None => Ok(None),
        Some(raw_timeline_id) => ZTimelineId::from_str(raw_timeline_id)
            .map(Some)
            .context("Failed to parse timeline id from the argument string"),
    }
}
fn handle_init(init_match: &ArgMatches) -> Result<LocalEnv> {
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
// Create config file
let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
// load and parse the file
@@ -414,18 +479,29 @@ fn handle_init(init_match: &ArgMatches) -> Result<()> {
env.init()
.context("Failed to initialize zenith repository")?;
// default_tenant_id was generated by the `env.init()` call above
let initial_tenant_id = ZTenantId::from(env.default_tenant_id.unwrap());
// Call 'pageserver init'.
let pageserver = PageServerNode::from_env(&env);
if let Err(e) = pageserver.init(
// default_tenantid was generated by the `env.init()` call above
Some(&ZTenantId::from(env.default_tenantid.unwrap()).to_string()),
&pageserver_config_overrides(init_match),
) {
eprintln!("pageserver init failed: {}", e);
exit(1);
}
let initial_timeline_id = pageserver
.init(
Some(initial_tenant_id),
initial_timeline_id_arg,
&pageserver_config_overrides(init_match),
)
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {}", e);
exit(1);
});
Ok(())
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_owned(),
initial_tenant_id,
initial_timeline_id,
)?;
Ok(env)
}
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
@@ -436,7 +512,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
.collect()
}
fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
match tenant_match.subcommand() {
Some(("list", _)) => {
@@ -445,13 +521,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
}
}
Some(("create", create_match)) => {
let tenantid = match create_match.value_of("tenantid") {
Some(tenantid) => ZTenantId::from_str(tenantid)?,
None => ZTenantId::generate(),
};
println!("using tenant id {}", tenantid);
pageserver.tenant_create(tenantid)?;
println!("tenant successfully created on the pageserver");
let initial_tenant_id = parse_tenant_id(create_match)?;
let new_tenant_id = pageserver
.tenant_create(initial_tenant_id)?
.ok_or_else(|| {
anyhow!("Tenant with id {:?} was already created", initial_tenant_id)
})?;
println!(
"tenant {} successfully created on the pageserver",
new_tenant_id
);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
@@ -459,24 +538,94 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
Ok(())
}
fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
let tenantid = get_tenantid(branch_match, env)?;
match timeline_match.subcommand() {
Some(("list", list_match)) => {
let tenant_id = get_tenant_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_id)?;
print_timelines_tree(timelines, env.timeline_name_mappings())?;
}
Some(("create", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
let new_branch_name = create_match
.value_of("branch-name")
.ok_or(anyhow!("No branch name provided"))?;
let timeline = pageserver
.timeline_create(tenant_id, None, None, None)?
.ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?;
let new_timeline_id = timeline.timeline_id();
if let Some(branchname) = branch_match.value_of("branchname") {
let startpoint_str = branch_match
.value_of("start-point")
.context("Missing start-point")?;
let branch = pageserver.branch_create(branchname, startpoint_str, &tenantid)?;
println!(
"Created branch '{}' at {:?} for tenant: {}",
branch.name, branch.latest_valid_lsn, tenantid,
);
} else {
// No arguments, list branches for tenant
let branches = pageserver.branch_list(&tenantid)?;
print_branches_tree(branches)?;
let last_record_lsn = match timeline {
TimelineInfo::Local {
last_record_lsn, ..
} => last_record_lsn,
TimelineInfo::Remote { .. } => {
bail!(
"Timeline {} was created as remote, not local",
new_timeline_id
)
}
};
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
println!(
"Created timeline '{}' at Lsn {} for tenant: {}",
timeline.timeline_id(),
last_record_lsn,
tenant_id,
);
}
Some(("branch", branch_match)) => {
let tenant_id = get_tenant_id(branch_match, env)?;
let new_branch_name = branch_match
.value_of("branch-name")
.ok_or(anyhow!("No branch name provided"))?;
let ancestor_branch_name = branch_match
.value_of("ancestor-branch-name")
.unwrap_or(DEFAULT_BRANCH_NAME);
let ancestor_timeline_id = env
.get_branch_timeline_id(ancestor_branch_name, tenant_id)
.ok_or_else(|| {
anyhow!(
"Found no timeline id for branch name '{}'",
ancestor_branch_name
)
})?;
let start_lsn = branch_match
.value_of("ancestor-start-lsn")
.map(Lsn::from_str)
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline = pageserver
.timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))?
.ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?;
let new_timeline_id = timeline.timeline_id();
let last_record_lsn = match timeline {
TimelineInfo::Local {
last_record_lsn, ..
} => last_record_lsn,
TimelineInfo::Remote { .. } => bail!(
"Timeline {} was created as remote, not local",
new_timeline_id
),
};
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
println!(
"Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'",
timeline.timeline_id(),
last_record_lsn,
tenant_id,
ancestor_branch_name,
);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
}
Ok(())
@@ -490,63 +639,90 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let mut cplane = ComputeControlPlane::load(env.clone())?;
// All subcommands take an optional --tenantid option
let tenantid = get_tenantid(sub_args, env)?;
// All subcommands take an optional --tenant-id option
let tenant_id = get_tenant_id(sub_args, env)?;
match sub_name {
"list" => {
let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| {
eprintln!("Failed to load branch info: {}", e);
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
});
println!("NODE\tADDRESS\t\tBRANCH\tLSN\t\tSTATUS");
let timeline_name_mappings = env.timeline_name_mappings();
println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS");
for ((_, node_name), node) in cplane
.nodes
.iter()
.filter(|((node_tenantid, _), _)| node_tenantid == &tenantid)
.filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id)
{
// FIXME: This shows the LSN at the end of the timeline. It's not the
// right thing to do for read-only nodes that might be anchored at an
// older point in time, or following but lagging behind the primary.
let lsn_str = branch_infos
.get(&node.timelineid)
.map(|bi| bi.latest_valid_lsn.to_string())
.unwrap_or_else(|| "?".to_string());
let lsn_str = timeline_infos
.get(&node.timeline_id)
.map(|bi| match bi {
TimelineInfo::Local {
last_record_lsn, ..
} => last_record_lsn.to_string(),
TimelineInfo::Remote { .. } => "? (remote)".to_string(),
})
.unwrap_or_else(|| '?'.to_string());
let branch_name = timeline_name_mappings
.get(&ZTenantTimelineId::new(tenant_id, node.timeline_id))
.map(|name| name.as_str())
.unwrap_or("?");
println!(
"{}\t{}\t{}\t{}\t{}",
"{}\t{}\t{}\t{}\t{}\t{}",
node_name,
node.address,
node.timelineid, // FIXME: resolve human-friendly branch name
node.timeline_id,
branch_name,
lsn_str,
node.status(),
);
}
}
"create" => {
let node_name = sub_args.value_of("node").unwrap_or("main");
let timeline_name = sub_args.value_of("timeline").unwrap_or(node_name);
let branch_name = sub_args
.value_of("branch-name")
.unwrap_or(DEFAULT_BRANCH_NAME);
let node_name = sub_args
.value_of("node")
.map(ToString::to_string)
.unwrap_or_else(|| format!("{}_node", branch_name));
let lsn = sub_args
.value_of("lsn")
.map(Lsn::from_str)
.transpose()
.context("Failed to parse Lsn from the request")?;
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?;
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
cplane.new_node(tenantid, node_name, timeline_name, port)?;
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?;
}
"start" => {
let node_name = sub_args.value_of("node").unwrap_or("main");
let timeline_name = sub_args.value_of("timeline");
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
let node_name = sub_args
.value_of("node")
.ok_or_else(|| anyhow!("No node name was provided to start"))?;
let node = cplane.nodes.get(&(tenantid, node_name.to_owned()));
let node = cplane.nodes.get(&(tenant_id, node_name.to_owned()));
let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) {
let claims = Claims::new(Some(tenantid), Scope::Tenant);
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
Some(env.generate_auth_token(&claims)?)
} else {
@@ -554,40 +730,49 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
};
if let Some(node) = node {
if timeline_name.is_some() {
println!("timeline name ignored because node exists already");
}
println!("Starting existing postgres {}...", node_name);
node.start(&auth_token)?;
} else {
let branch_name = sub_args
.value_of("branch-name")
.unwrap_or(DEFAULT_BRANCH_NAME);
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| {
anyhow!("Found no timeline id for branch name '{}'", branch_name)
})?;
let lsn = sub_args
.value_of("lsn")
.map(Lsn::from_str)
.transpose()
.context("Failed to parse Lsn from the request")?;
// when used with custom port this results in non obvious behaviour
// port is remembered from first start command, i e
// start --port X
// stop
// start <-- will also use port X even without explicit port argument
let timeline_name = timeline_name.unwrap_or(node_name);
println!(
"Starting new postgres {} on {}...",
node_name, timeline_name
"Starting new postgres {} on timeline {} ...",
node_name, timeline_id
);
let node = cplane.new_node(tenantid, node_name, timeline_name, port)?;
let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?;
node.start(&auth_token)?;
}
}
"stop" => {
let node_name = sub_args.value_of("node").unwrap_or("main");
let node_name = sub_args
.value_of("node")
.ok_or_else(|| anyhow!("No node name was provided to stop"))?;
let destroy = sub_args.is_present("destroy");
let node = cplane
.nodes
.get(&(tenantid, node_name.to_owned()))
.get(&(tenant_id, node_name.to_owned()))
.with_context(|| format!("postgres {} is not found", node_name))?;
node.stop(destroy)?;
}
_ => {
bail!("Unexpected pg subcommand '{}'", sub_name)
}
_ => bail!("Unexpected pg subcommand '{}'", sub_name),
}
Ok(())

View File

@@ -37,3 +37,8 @@ bytes = "1.0.1"
hex-literal = "0.3"
tempfile = "3.2"
webpki = "0.21"
criterion = "0.3"
[[bench]]
name = "benchmarks"
harness = false

View File

@@ -0,0 +1,22 @@
#![allow(unused)]
use criterion::{criterion_group, criterion_main, Criterion};
use zenith_utils::zid;
/// Benchmarks converting the tenant id of a freshly generated
/// `ZTenantTimelineId` to its string form, registered under the name
/// `"zid.to_string"`.
pub fn bench_zid_stringify(c: &mut Criterion) {
    // Can only use public methods.
    let ztl = zid::ZTenantTimelineId::generate();

    c.bench_function("zid.to_string", |b| {
        // FIXME measurement overhead? A single conversion per iteration is very
        // short; running it in an inner loop (e.g. 1000 calls) was considered.
        //
        // The string is returned from the closure rather than discarded, so the
        // benchmark harness receives the result and the optimizer cannot treat
        // the conversion as dead code.
        b.iter(|| ztl.tenant_id.to_string())
    });
}
criterion_group!(benches, bench_zid_stringify);
criterion_main!(benches);

View File

@@ -112,6 +112,17 @@ impl ZId {
rand::thread_rng().fill(&mut tli_buf);
ZId::from(tli_buf)
}
/// Encodes the id's bytes as a lowercase hexadecimal string.
///
/// Each byte produces two characters, high nibble first, so the result is
/// twice as long as the underlying byte array.
fn hex_encode(&self) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut encoded = Vec::with_capacity(self.0.len() * 2);
    for &byte in self.0.iter() {
        encoded.push(HEX_DIGITS[usize::from(byte >> 4)]);
        encoded.push(HEX_DIGITS[usize::from(byte & 0x0f)]);
    }

    // SAFETY: every byte pushed above is taken from `HEX_DIGITS`, which holds
    // only ASCII characters, so `encoded` is always valid UTF-8.
    unsafe { String::from_utf8_unchecked(encoded) }
}
}
impl FromStr for ZId {
@@ -147,13 +158,13 @@ impl From<[u8; 16]> for ZId {
impl fmt::Display for ZId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(&hex::encode(self.0))
f.write_str(&self.hex_encode())
}
}
impl fmt::Debug for ZId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(&hex::encode(self.0))
f.write_str(&self.hex_encode())
}
}