Compare commits


3 Commits

Author           SHA1        Message                    Date
Bojan Serafimov  1c40c26313  Parse search_path option   2022-03-07 18:50:52 -05:00
Bojan Serafimov  a6ace609a7  Fix typo                   2022-03-07 17:56:12 -05:00
Bojan Serafimov  29d72e8955  Add proxy test             2022-03-07 14:32:24 -05:00
73 changed files with 2079 additions and 2577 deletions


@@ -440,14 +440,8 @@ jobs:
command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
DOCKER_TAG=$(git log --oneline|wc -l)
docker build \
--pull \
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest .
docker push zenithdb/zenith:${DOCKER_TAG}
docker push zenithdb/zenith:latest
docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
docker tag zenithdb/zenith:latest zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG}
# Build zenithdb/compute-node:latest image and push it to Docker hub
docker-image-compute:
@@ -474,9 +468,8 @@ jobs:
command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
DOCKER_TAG=$(git log --oneline|wc -l)
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres
docker push zenithdb/compute-node:${DOCKER_TAG}
docker push zenithdb/compute-node:latest
docker build -t zenithdb/compute-node:latest vendor/postgres && docker push zenithdb/compute-node:latest
docker tag zenithdb/compute-node:latest zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG}
# Build production zenithdb/zenith:release image and push it to Docker hub
docker-image-release:
@@ -494,14 +487,8 @@ jobs:
command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
DOCKER_TAG="release-$(git log --oneline|wc -l)"
docker build \
--pull \
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release .
docker push zenithdb/zenith:${DOCKER_TAG}
docker push zenithdb/zenith:release
docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:release . && docker push zenithdb/zenith:release
docker tag zenithdb/zenith:release zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG}
# Build production zenithdb/compute-node:release image and push it to Docker hub
docker-image-compute-release:
@@ -528,9 +515,8 @@ jobs:
command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
DOCKER_TAG="release-$(git log --oneline|wc -l)"
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres
docker push zenithdb/compute-node:${DOCKER_TAG}
docker push zenithdb/compute-node:release
docker build -t zenithdb/compute-node:release vendor/postgres && docker push zenithdb/compute-node:release
docker tag zenithdb/compute-node:release zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG}
deploy-staging:
docker:


@@ -48,7 +48,7 @@ jobs:
echo Python
python3 --version
poetry run python3 --version
echo Poetry
echo Pipenv
poetry --version
echo Pgbench
$PG_BIN/pgbench --version

Cargo.lock (generated)

@@ -424,7 +424,6 @@ dependencies = [
"thiserror",
"toml",
"url",
"walkeeper",
"workspace_hack",
"zenith_utils",
]


@@ -1,62 +1,62 @@
# Build Postgres
#
#FROM zimg/rust:1.56 AS pg-build
FROM zenithdb/build:buster-20220309 AS pg-build
WORKDIR /pg
USER root
COPY vendor/postgres vendor/postgres
COPY Makefile Makefile
# Docker image for console integration testing.
#
#
# Build Postgres separately --- this layer will be rebuilt only if one of
# mentioned paths will get any changes.
#
FROM zimg/rust:1.56 AS pg-build
WORKDIR /zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
ENV BUILD_TYPE release
RUN set -e \
&& make -j $(nproc) -s postgres \
&& rm -rf tmp_install/build \
&& tar -C tmp_install -czf /postgres_install.tar.gz .
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
RUN rm -rf postgres_install/build
#
# Build zenith binaries
#
#FROM zimg/rust:1.56 AS build
FROM zenithdb/build:buster-20220309 AS build
ARG GIT_VERSION=local
# TODO: build cargo deps as separate layer. We used cargo-chef before but that was
# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
#
FROM zimg/rust:1.56 AS build
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
ARG AWS_SECRET_ACCESS_KEY
#ENV RUSTC_WRAPPER cachepot
ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot
ARG GIT_VERSION
RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi
WORKDIR /zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY . .
RUN GIT_VERSION=$GIT_VERSION cargo build --release
RUN cargo build --release
# Build final image
#
# Copy binaries to resulting image.
#
FROM debian:bullseye-slim
WORKDIR /data
RUN set -e \
&& apt-get update \
&& apt-get install -y \
libreadline-dev \
libseccomp-dev \
openssl \
ca-certificates \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& useradd -d /data zenith \
&& chown -R zenith:zenith /data
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin
COPY --from=pg-build /pg/tmp_install/ /usr/local/
COPY --from=pg-build /postgres_install.tar.gz /data/
RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \
mkdir zenith_install
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install postgres_install
COPY docker-entrypoint.sh /docker-entrypoint.sh
# Remove build artifacts (~ 500 MB)
RUN rm -rf postgres_install/build && \
# 'Install' Postgres binaries locally
cp -r postgres_install/* /usr/local/ && \
# Prepare an archive of Postgres binaries (should be around 11 MB)
# and keep it inside container for an ease of deploy pipeline.
cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \
rm -rf postgres_install
RUN useradd -d /data zenith && chown -R zenith:zenith /data
VOLUME ["/data"]
USER zenith
EXPOSE 6400


@@ -1,23 +0,0 @@
FROM rust:1.56.1-slim-buster
WORKDIR /home/circleci/project
RUN set -e \
&& apt-get update \
&& apt-get -yq install \
automake \
libtool \
build-essential \
bison \
flex \
libreadline-dev \
zlib1g-dev \
libxml2-dev \
libseccomp-dev \
pkg-config \
libssl-dev \
clang
RUN set -e \
&& rustup component add clippy \
&& cargo install cargo-audit \
&& cargo install --git https://github.com/paritytech/cachepot


@@ -57,12 +57,12 @@ pageserver init succeeded
Starting pageserver at 'localhost:64000' in '.zenith'
Pageserver started
initializing for single for 7676
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
Safekeeper started
# start postgres compute node
> ./target/debug/zenith pg start main
Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
Starting new postgres main on main...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
waiting for server to start.... done
@@ -70,8 +70,8 @@ server started
# check list of running postgres instances
> ./target/debug/zenith pg list
NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS
main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running
BRANCH ADDRESS LSN STATUS
main 127.0.0.1:55432 0/1609610 running
```
4. Now it is possible to connect to postgres and run some queries:
@@ -91,13 +91,13 @@ postgres=# select * from t;
5. And create branches and run postgres on them:
```sh
# create branch named migration_check
> ./target/debug/zenith timeline branch --branch-name migration_check
Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
> ./target/debug/zenith branch migration_check main
Created branch 'migration_check' at 0/1609610
# check branches tree
> ./target/debug/zenith timeline list
main [5b014a9e41b4b63ce1a1febc04503636]
┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
> ./target/debug/zenith branch
main
┗━ @0/1609610: migration_check
# start postgres on that branch
> ./target/debug/zenith pg start migration_check


@@ -17,6 +17,5 @@ url = "2.2.2"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }


@@ -37,7 +37,7 @@ impl ComputeControlPlane {
// pgdatadirs
// |- tenants
// | |- <tenant_id>
// | | |- <node name>
// | | |- <branch name>
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
let pageserver = Arc::new(PageServerNode::from_env(&env));
@@ -52,7 +52,7 @@ impl ComputeControlPlane {
.with_context(|| format!("failed to list {}", tenant_dir.path().display()))?
{
let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?;
nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node));
nodes.insert((node.tenantid, node.name.clone()), Arc::new(node));
}
}
@@ -73,14 +73,40 @@ impl ComputeControlPlane {
.unwrap_or(self.base_port)
}
// FIXME: see also parse_point_in_time in branches.rs.
fn parse_point_in_time(
&self,
tenantid: ZTenantId,
s: &str,
) -> Result<(ZTimelineId, Option<Lsn>)> {
let mut strings = s.split('@');
let name = strings.next().unwrap();
let lsn = strings
.next()
.map(Lsn::from_str)
.transpose()
.context("invalid LSN in point-in-time specification")?;
// Resolve the timeline ID, given the human-readable branch name
let timeline_id = self
.pageserver
.branch_get_by_name(&tenantid, name)?
.timeline_id;
Ok((timeline_id, lsn))
}
pub fn new_node(
&mut self,
tenant_id: ZTenantId,
tenantid: ZTenantId,
name: &str,
timeline_id: ZTimelineId,
lsn: Option<Lsn>,
timeline_spec: &str,
port: Option<u16>,
) -> Result<Arc<PostgresNode>> {
// Resolve the human-readable timeline spec into timeline ID and LSN
let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?;
let port = port.unwrap_or_else(|| self.get_port());
let node = Arc::new(PostgresNode {
name: name.to_owned(),
@@ -88,9 +114,9 @@ impl ComputeControlPlane {
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test: false,
timeline_id,
timelineid,
lsn,
tenant_id,
tenantid,
uses_wal_proposer: false,
});
@@ -98,7 +124,7 @@ impl ComputeControlPlane {
node.setup_pg_conf(self.env.pageserver.auth_type)?;
self.nodes
.insert((tenant_id, node.name.clone()), Arc::clone(&node));
.insert((tenantid, node.name.clone()), Arc::clone(&node));
Ok(node)
}
@@ -113,9 +139,9 @@ pub struct PostgresNode {
pub env: LocalEnv,
pageserver: Arc<PageServerNode>,
is_test: bool,
pub timeline_id: ZTimelineId,
pub timelineid: ZTimelineId,
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
pub tenant_id: ZTenantId,
pub tenantid: ZTenantId,
uses_wal_proposer: bool,
}
@@ -147,8 +173,8 @@ impl PostgresNode {
// Read a few options from the config file
let context = format!("in config file {}", cfg_path_str);
let port: u16 = conf.parse_field("port", &context)?;
let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
let uses_wal_proposer = conf.get("wal_acceptors").is_some();
// parse recovery_target_lsn, if any
@@ -162,9 +188,9 @@ impl PostgresNode {
env: env.clone(),
pageserver: Arc::clone(pageserver),
is_test: false,
timeline_id,
timelineid,
lsn: recovery_target_lsn,
tenant_id,
tenantid,
uses_wal_proposer,
})
}
@@ -215,9 +241,9 @@ impl PostgresNode {
);
let sql = if let Some(lsn) = lsn {
format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn)
} else {
format!("basebackup {} {}", self.tenant_id, self.timeline_id)
format!("basebackup {} {}", self.tenantid, self.timelineid)
};
let mut client = self
@@ -303,8 +329,8 @@ impl PostgresNode {
conf.append("shared_preload_libraries", "zenith");
conf.append_line("");
conf.append("zenith.page_server_connstring", &pageserver_connstr);
conf.append("zenith.zenith_tenant", &self.tenant_id.to_string());
conf.append("zenith.zenith_timeline", &self.timeline_id.to_string());
conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
if let Some(lsn) = self.lsn {
conf.append("recovery_target_lsn", &lsn.to_string());
}
@@ -382,7 +408,7 @@ impl PostgresNode {
}
pub fn pgdata(&self) -> PathBuf {
self.env.pg_data_dir(&self.tenant_id, &self.name)
self.env.pg_data_dir(&self.tenantid, &self.name)
}
pub fn status(&self) -> &str {

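On the branch-name side of this change, `new_node` takes a human-readable `timeline_spec` ("branch" or "branch@lsn") and resolves it through `parse_point_in_time`. A minimal sketch of a possible call site; `env`, `tenantid`, the node names and the LSN are illustrative values borrowed from the README example above, not part of this diff:

```rust
// Sketch only: assumes a LocalEnv `env` and a ZTenantId `tenantid` already exist.
let mut cplane = ComputeControlPlane::load(env)?;

// Primary compute node on the tip of the "main" branch; the port is picked automatically.
let primary = cplane.new_node(tenantid, "main", "main", None)?;
println!("primary runs on timeline {}", primary.timelineid);

// Read-only node pinned at a specific LSN on the same branch.
let pinned = cplane.new_node(tenantid, "main_readonly", "main@0/16F5830", None)?;
println!("pinned node lsn: {:?}", pinned.lsn);
```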

@@ -3,18 +3,16 @@
//! Now it also provides init method which acts like a stub for proper installation
//! script which will use local paths.
use anyhow::{bail, ensure, Context};
use anyhow::{bail, Context};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::env;
use std::fmt::Write;
use std::fs;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{
HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId,
};
use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId};
use crate::safekeeper::SafekeeperNode;
@@ -25,7 +23,7 @@ use crate::safekeeper::SafekeeperNode;
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
// compute nodes).
@@ -50,7 +48,7 @@ pub struct LocalEnv {
// Default tenant ID to use with the 'zenith' command line utility, when
// --tenantid is not explicitly specified.
#[serde(default)]
pub default_tenant_id: Option<HexZTenantId>,
pub default_tenantid: Option<HexZTenantId>,
// used to issue tokens during e.g pg start
#[serde(default)]
@@ -60,16 +58,9 @@ pub struct LocalEnv {
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
#[serde(default)]
// A `HashMap<String, HashMap<ZTenantId, ZTimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
branch_name_mappings: HashMap<String, Vec<(HexZTenantId, HexZTimelineId)>>,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(default)]
pub struct PageServerConf {
// node id
@@ -97,7 +88,7 @@ impl Default for PageServerConf {
}
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(default)]
pub struct SafekeeperConf {
pub id: ZNodeId,
@@ -153,72 +144,6 @@ impl LocalEnv {
self.base_data_dir.join("safekeepers").join(data_dir_name)
}
pub fn register_branch_mapping(
&mut self,
branch_name: String,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
) -> anyhow::Result<()> {
let existing_values = self
.branch_name_mappings
.entry(branch_name.clone())
.or_default();
let tenant_id = HexZTenantId::from(tenant_id);
let timeline_id = HexZTimelineId::from(timeline_id);
let existing_ids = existing_values
.iter()
.find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
if let Some((_, old_timeline_id)) = existing_ids {
if old_timeline_id == &timeline_id {
Ok(())
} else {
bail!(
"branch '{}' is already mapped to timeline {}, cannot map to another timeline {}",
branch_name,
old_timeline_id,
timeline_id
);
}
} else {
existing_values.push((tenant_id, timeline_id));
Ok(())
}
}
pub fn get_branch_timeline_id(
&self,
branch_name: &str,
tenant_id: ZTenantId,
) -> Option<ZTimelineId> {
let tenant_id = HexZTenantId::from(tenant_id);
self.branch_name_mappings
.get(branch_name)?
.iter()
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
.map(|&(_, timeline_id)| timeline_id)
.map(ZTimelineId::from)
}
pub fn timeline_name_mappings(&self) -> HashMap<ZTenantTimelineId, String> {
self.branch_name_mappings
.iter()
.flat_map(|(name, tenant_timelines)| {
tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
(
ZTenantTimelineId::new(
ZTenantId::from(tenant_id),
ZTimelineId::from(timeline_id),
),
name.clone(),
)
})
})
.collect()
}
/// Create a LocalEnv from a config file.
///
/// Unlike 'load_config', this function fills in any defaults that are missing
@@ -258,8 +183,8 @@ impl LocalEnv {
}
// If no initial tenant ID was given, generate it.
if env.default_tenant_id.is_none() {
env.default_tenant_id = Some(HexZTenantId::from(ZTenantId::generate()));
if env.default_tenantid.is_none() {
env.default_tenantid = Some(HexZTenantId::from(ZTenantId::generate()));
}
env.base_data_dir = base_path();
@@ -289,39 +214,6 @@ impl LocalEnv {
Ok(env)
}
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
// Currently, the user first passes a config file with 'zenith init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
let mut conf_content = r#"# This file describes a locale deployment of the page server
# and safekeeeper node. It is read by the 'zenith' command-line
# utility.
"#
.to_string();
// Convert the LocalEnv to a toml file.
//
// This could be as simple as this:
//
// conf_content += &toml::to_string_pretty(env)?;
//
// But it results in a "values must be emitted before tables". I'm not sure
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
// Maybe rust reorders the fields to squeeze avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
let target_config_path = base_path.join("config");
fs::write(&target_config_path, conf_content).with_context(|| {
format!(
"Failed to write config file into path '{}'",
target_config_path.display()
)
})
}
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
let private_key_path = if self.private_key_path.is_absolute() {
@@ -340,15 +232,15 @@ impl LocalEnv {
pub fn init(&mut self) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
ensure!(
base_path != Path::new(""),
"repository base path is missing"
);
ensure!(
!base_path.exists(),
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if base_path == Path::new("") {
bail!("repository base path is missing");
}
if base_path.exists() {
bail!(
"directory '{}' already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
);
}
fs::create_dir(&base_path)?;
@@ -400,7 +292,36 @@ impl LocalEnv {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
}
self.persist_config(base_path)
let mut conf_content = String::new();
// Currently, the user first passes a config file with 'zenith init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
write!(
&mut conf_content,
r#"# This file describes a locale deployment of the page server
# and safekeeeper node. It is read by the 'zenith' command-line
# utility.
"#
)?;
// Convert the LocalEnv to a toml file.
//
// This could be as simple as this:
//
// conf_content += &toml::to_string_pretty(env)?;
//
// But it results in a "values must be emitted before tables". I'm not sure
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
// Maybe rust reorders the fields to squeeze avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
fs::write(base_path.join("config"), conf_content)?;
Ok(())
}
}


@@ -14,9 +14,8 @@ use postgres::Config;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use walkeeper::http::models::TimelineCreateRequest;
use zenith_utils::http::error::HttpErrorBody;
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
use zenith_utils::zid::ZNodeId;
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::storage::PageServerNode;
@@ -262,25 +261,4 @@ impl SafekeeperNode {
.error_from_body()?;
Ok(())
}
pub fn timeline_create(
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
peer_ids: Vec<ZNodeId>,
) -> Result<()> {
Ok(self
.http_request(
Method::POST,
format!("{}/{}", self.http_base_url, "timeline"),
)
.json(&TimelineCreateRequest {
tenant_id,
timeline_id,
peer_ids,
})
.send()?
.error_from_body()?
.json()?)
}
}


@@ -1,4 +1,3 @@
use std::convert::TryFrom;
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
@@ -6,23 +5,22 @@ use std::process::Command;
use std::time::Duration;
use std::{io, result, thread};
use anyhow::{bail, Context};
use anyhow::bail;
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse};
use pageserver::timelines::TimelineInfo;
use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use zenith_utils::http::error::HttpErrorBody;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{HexZTenantId, HexZTimelineId, ZTenantId, ZTimelineId};
use zenith_utils::zid::ZTenantId;
use crate::local_env::LocalEnv;
use crate::{fill_rust_env_vars, read_pidfile};
use pageserver::branches::BranchInfo;
use pageserver::tenant_mgr::TenantInfo;
use zenith_utils::connstring::connection_address;
@@ -100,10 +98,9 @@ impl PageServerNode {
pub fn init(
&self,
create_tenant: Option<ZTenantId>,
initial_timeline_id: Option<ZTimelineId>,
create_tenant: Option<&str>,
config_overrides: &[&str],
) -> anyhow::Result<ZTimelineId> {
) -> anyhow::Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let id = format!("id={}", self.env.pageserver.id);
@@ -140,24 +137,19 @@ impl PageServerNode {
]);
}
let create_tenant = create_tenant.map(|id| id.to_string());
if let Some(tenant_id) = create_tenant.as_deref() {
args.extend(["--create-tenant", tenant_id])
if let Some(tenantid) = create_tenant {
args.extend(["--create-tenant", tenantid])
}
let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
let initial_timeline_id_string = initial_timeline_id.to_string();
args.extend(["--initial-timeline-id", &initial_timeline_id_string]);
let status = fill_rust_env_vars(cmd.args(args))
.status()
.expect("pageserver init failed");
let init_output = fill_rust_env_vars(cmd.args(args))
.output()
.context("pageserver init failed")?;
if !init_output.status.success() {
if !status.success() {
bail!("pageserver init failed");
}
Ok(initial_timeline_id)
Ok(())
}
pub fn repo_path(&self) -> PathBuf {
@@ -318,7 +310,7 @@ impl PageServerNode {
}
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/status", self.http_base_url))
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
.send()?
.error_from_body()?;
Ok(())
@@ -326,76 +318,64 @@ impl PageServerNode {
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
Ok(self
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
.send()?
.error_from_body()?
.json()?)
}
pub fn tenant_create(
&self,
new_tenant_id: Option<ZTenantId>,
) -> anyhow::Result<Option<ZTenantId>> {
let tenant_id_string = self
.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<()> {
Ok(self
.http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant"))
.json(&TenantCreateRequest {
new_tenant_id: new_tenant_id.map(HexZTenantId::from),
tenant_id: tenantid,
})
.send()?
.error_from_body()?
.json::<Option<String>>()?;
tenant_id_string
.map(|id| {
id.parse().with_context(|| {
format!(
"Failed to parse tennat creation response as tenant id: {}",
id
)
})
})
.transpose()
.json()?)
}
pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
let timeline_infos: Vec<TimelineInfoResponse> = self
pub fn branch_list(&self, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
Ok(self
.http_request(
Method::GET,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
format!("{}/branch/{}", self.http_base_url, tenantid),
)
.send()?
.error_from_body()?
.json()?;
timeline_infos
.into_iter()
.map(TimelineInfo::try_from)
.collect()
.json()?)
}
pub fn timeline_create(
pub fn branch_create(
&self,
tenant_id: ZTenantId,
new_timeline_id: Option<ZTimelineId>,
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<ZTimelineId>,
) -> anyhow::Result<Option<TimelineInfo>> {
let timeline_info_response = self
.http_request(
Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)
.json(&TimelineCreateRequest {
new_timeline_id: new_timeline_id.map(HexZTimelineId::from),
ancestor_start_lsn,
ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from),
branch_name: &str,
startpoint: &str,
tenantid: &ZTenantId,
) -> Result<BranchInfo> {
Ok(self
.http_request(Method::POST, format!("{}/branch", self.http_base_url))
.json(&BranchCreateRequest {
tenant_id: tenantid.to_owned(),
name: branch_name.to_owned(),
start_point: startpoint.to_owned(),
})
.send()?
.error_from_body()?
.json::<Option<TimelineInfoResponse>>()?;
.json()?)
}
timeline_info_response
.map(TimelineInfo::try_from)
.transpose()
pub fn branch_get_by_name(
&self,
tenantid: &ZTenantId,
branch_name: &str,
) -> Result<BranchInfo> {
Ok(self
.http_request(
Method::GET,
format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name),
)
.send()?
.error_for_status()?
.json()?)
}
}
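The branch-oriented client methods above wrap the pageserver's HTTP `/branch` endpoints. A hedged sketch of how the CLI side might drive them, assuming a `PageServerNode` (here `pageserver`) built from the local environment and an existing `tenantid`:

```rust
// Sketch only: `pageserver` and `tenantid` are assumed to be set up elsewhere.
let branch = pageserver.branch_create("migration_check", "main", &tenantid)?;
println!("created branch {} on timeline {}", branch.name, branch.timeline_id);

// List all branches for the tenant.
for info in pageserver.branch_list(&tenantid)? {
    println!("{} @ {}", info.name, info.latest_valid_lsn);
}

// Resolve a branch name back to its timeline.
let main = pageserver.branch_get_by_name(&tenantid, "main")?;
println!("main is timeline {}", main.timeline_id);
```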


@@ -2,14 +2,7 @@
use std::{env, path::Path, str::FromStr};
use tracing::*;
use zenith_utils::{
auth::JwtAuth,
logging,
postgres_backend::AuthType,
tcp_listener,
zid::{ZTenantId, ZTimelineId},
GIT_VERSION,
};
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};
use anyhow::{bail, Context, Result};
@@ -17,10 +10,11 @@ use clap::{App, Arg};
use daemonize::Daemonize;
use pageserver::{
branches,
config::{defaults::*, PageServerConf},
http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr,
thread_mgr::ThreadKind,
timelines, virtual_file, LOG_FILE_NAME,
virtual_file, LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend;
@@ -43,7 +37,7 @@ fn main() -> Result<()> {
Arg::new("init")
.long("init")
.takes_value(false)
.help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"),
.help("Initialize pageserver repo"),
)
.arg(
Arg::new("workdir")
@@ -59,13 +53,6 @@ fn main() -> Result<()> {
.help("Create tenant during init")
.requires("init"),
)
.arg(
Arg::new("initial-timeline-id")
.long("initial-timeline-id")
.takes_value(true)
.help("Use a specific timeline id during init and tenant creation")
.requires("create-tenant"),
)
// See `settings.md` for more details on the extra configuration patameters pageserver can process
.arg(
Arg::new("config-override")
@@ -85,16 +72,7 @@ fn main() -> Result<()> {
let cfg_file_path = workdir.join("pageserver.toml");
let init = arg_matches.is_present("init");
let create_tenant = arg_matches
.value_of("create-tenant")
.map(ZTenantId::from_str)
.transpose()
.context("Failed to parse tenant id from the arguments")?;
let initial_timeline_id = arg_matches
.value_of("initial-timeline-id")
.map(ZTimelineId::from_str)
.transpose()
.context("Failed to parse timeline id from the arguments")?;
let create_tenant = arg_matches.value_of("create-tenant");
// Set CWD to workdir for non-daemon modes
env::set_current_dir(&workdir).with_context(|| {
@@ -165,8 +143,7 @@ fn main() -> Result<()> {
// Create repo and exit if init was requested
if init {
timelines::init_pageserver(conf, create_tenant, initial_timeline_id)
.context("Failed to init pageserver")?;
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
// write the config file
std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
format!(


@@ -1,221 +0,0 @@
//! Pageserver benchmark tool
//!
//! Usually it's easier to write python perf tests, but here the performance
//! of the tester matters, and the API is easier to work with from rust.
use std::{collections::HashMap, io::{BufRead, BufReader, Cursor}, net::SocketAddr, ops::AddAssign};
use byteorder::ReadBytesExt;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use bytes::{BufMut, Bytes, BytesMut};
use clap::{App, Arg};
use std::fs::File;
use zenith_utils::{GIT_VERSION, pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage}};
use anyhow::Result;
pub fn read_lines_buffered(file_name: &str) -> impl Iterator<Item = String> {
BufReader::new(File::open(file_name).unwrap())
.lines()
.map(|result| result.unwrap())
}
pub async fn get_page(
pagestream: &mut tokio::net::TcpStream,
lsn: &Lsn,
page: &Page,
) -> anyhow::Result<Vec<u8>> {
let msg = {
let query = {
let mut query = BytesMut::new();
query.put_u8(2); // Specifies get_page query
query.put_u8(0); // Specifies this is not a "latest page" query
query.put_u64(lsn.0);
page.write(&mut query).await?;
query.freeze()
};
let mut buf = BytesMut::new();
let copy_msg = BeMessage::CopyData(&query);
BeMessage::write(&mut buf, &copy_msg)?;
buf.freeze()
};
pagestream.write(&msg).await?;
let response = match FeMessage::read_fut(pagestream).await? {
Some(FeMessage::CopyData(page)) => page,
r => panic!("Expected CopyData message, got: {:?}", r),
};
let page = {
let mut cursor = Cursor::new(response);
let tag = AsyncReadExt::read_u8(&mut cursor).await?;
match tag {
102 => {
let mut page = Vec::<u8>::new();
cursor.read_to_end(&mut page).await?;
dbg!(page.len());
if page.len() != 8 * 1024 {
panic!("Expected 8kb page, got: {:?}", page.len());
}
page
},
103 => {
let mut bytes = Vec::<u8>::new();
cursor.read_to_end(&mut bytes).await?;
let message = String::from_utf8(bytes)?;
panic!("Got error message: {}", message);
},
_ => panic!("Unhandled tag {:?}", tag)
}
};
Ok(page)
}
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)]
pub struct Lsn(pub u64);
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)]
pub struct Page {
spcnode: u32,
dbnode: u32,
relnode: u32,
forknum: u8,
blkno: u32,
}
impl Page {
async fn read<Reader>(buf: &mut Reader) -> Result<Page>
where
Reader: tokio::io::AsyncRead + Unpin,
{
let spcnode = buf.read_u32().await?;
let dbnode = buf.read_u32().await?;
let relnode = buf.read_u32().await?;
let forknum = buf.read_u8().await?;
let blkno = buf.read_u32().await?;
Ok(Page { spcnode, dbnode, relnode, forknum, blkno })
}
async fn write(&self, buf: &mut BytesMut) -> Result<()> {
buf.put_u32(self.spcnode);
buf.put_u32(self.dbnode);
buf.put_u32(self.relnode);
buf.put_u8(self.forknum);
buf.put_u32(self.blkno);
Ok(())
}
}
#[tokio::main]
async fn main() -> Result<()> {
let arg_matches = App::new("LALALA")
.about("lalala")
.version(GIT_VERSION)
.arg(
Arg::new("path")
.help("Path to file to dump")
.required(true)
.index(1),
)
.arg(
Arg::new("ps_connstr")
.help("Connection string to pageserver")
.required(true)
.index(2),
)
.arg(
Arg::new("tenant_hex")
.help("TODO")
.required(true)
.index(3),
)
.arg(
Arg::new("timeline")
.help("TODO")
.required(true)
.index(4),
)
.get_matches();
let log_file = arg_matches.value_of("path").unwrap();
let ps_connstr = arg_matches.value_of("ps_connstr").unwrap();
let tenant_hex = arg_matches.value_of("tenant_hex").unwrap();
let timeline = arg_matches.value_of("timeline").unwrap();
// Parse log lines
let relevant = read_lines_buffered(log_file) .filter_map(|line| line.strip_prefix("wal-at-lsn-modified-page ").map(|x| x.to_string()));
let mut lsn_page_pairs = Vec::<(Lsn, Page)>::new();
for line in relevant {
let (lsn, page) = line.split_once(" ").unwrap();
let lsn = hex::decode(lsn)?;
let lsn = Lsn(AsyncReadExt::read_u64(&mut Cursor::new(lsn)).await?);
let page = hex::decode(page)?;
let page = Page::read(&mut Cursor::new(page)).await?;
lsn_page_pairs.push((lsn, page))
}
// Organize write info
let mut writes_per_entry = HashMap::<Lsn, Vec<Page>>::new();
for (lsn, page) in lsn_page_pairs.clone() {
writes_per_entry.entry(lsn).or_insert(vec![]).push(page);
}
// Print some stats
let mut updates_per_page = HashMap::<Page, usize>::new();
for (_, page) in lsn_page_pairs.clone() {
updates_per_page.entry(page).or_insert(0).add_assign(1);
}
let mut updates_per_page: Vec<(&usize, &Page)> = updates_per_page
.iter().map(|(k, v)| (v, k)).collect();
updates_per_page.sort();
updates_per_page.reverse();
dbg!(&updates_per_page);
let hottest_page = updates_per_page[0].1;
let first_update = lsn_page_pairs
.iter()
.filter(|(_lsn, page)| page == hottest_page)
.map(|(lsn, _page)| lsn)
.min()
.unwrap();
// Get raw TCP connection to the pageserver postgres protocol port
let mut socket = tokio::net::TcpStream::connect("localhost:15000").await?;
let (client, conn) = tokio_postgres::Config::new()
.host("127.0.0.1")
.port(15000)
.dbname("postgres")
.user("zenith_admin")
.connect_raw(&mut socket, tokio_postgres::NoTls)
.await?;
// Enter pagestream protocol
let init_query = format!("pagestream {} {}", tenant_hex, timeline);
tokio::select! {
_ = conn => panic!("AAAA"),
_ = client.query(init_query.as_str(), &[]) => (),
};
// TODO merge with LSM branch. Nothing to test otherwise, too many images.
// - I get error: tried to request a page version that was garbage collected
// TODO be mindful of caching, take multiple measurements, use monotonic time.
// TODO make harder test case. More writes, fewer images.
// TODO concurrent requests: multiple reads, also writes.
use std::time::Instant;
for (lsn, _pages) in writes_per_entry {
if lsn >= *first_update {
println!("Running get_page {:?} at {:?}", hottest_page, lsn);
let start = Instant::now();
let _page = get_page(&mut socket, &lsn, &hottest_page).await?;
let duration = start.elapsed();
println!("Time: {:?}", duration);
}
}
Ok(())
}

pageserver/src/branches.rs (new file, 427 lines)

@@ -0,0 +1,427 @@
//!
//! Branch management code
//!
// TODO: move all paths construction to conf impl
//
use anyhow::{bail, Context, Result};
use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize};
use std::{
fs,
path::Path,
process::{Command, Stdio},
str::FromStr,
sync::Arc,
};
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{crashsafe_dir, logging};
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{config::PageServerConf, repository::Repository};
use crate::{import_datadir, LOG_FILE_NAME};
use crate::{repository::RepositoryTimeline, tenant_mgr};
#[derive(Serialize, Deserialize, Clone)]
pub struct BranchInfo {
pub name: String,
#[serde(with = "hex")]
pub timeline_id: ZTimelineId,
pub latest_valid_lsn: Lsn,
pub ancestor_id: Option<String>,
pub ancestor_lsn: Option<String>,
pub current_logical_size: usize,
pub current_logical_size_non_incremental: Option<usize>,
}
impl BranchInfo {
pub fn from_path<T: AsRef<Path>>(
path: T,
repo: &Arc<dyn Repository>,
include_non_incremental_logical_size: bool,
) -> Result<Self> {
let path = path.as_ref();
let name = path.file_name().unwrap().to_string_lossy().to_string();
let timeline_id = std::fs::read_to_string(path)
.with_context(|| {
format!(
"Failed to read branch file contents at path '{}'",
path.display()
)
})?
.parse::<ZTimelineId>()?;
let timeline = match repo.get_timeline(timeline_id)? {
RepositoryTimeline::Local(local_entry) => local_entry,
RepositoryTimeline::Remote { .. } => {
bail!("Timeline {} is remote, no branches to display", timeline_id)
}
};
// we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() {
Some(ancestor_id) => (
Some(ancestor_id.to_string()),
Some(timeline.get_ancestor_lsn().to_string()),
),
None => (None, None),
};
// non incremental size calculation can be heavy, so let it be optional
// needed for tests to check size calculation
let current_logical_size_non_incremental = include_non_incremental_logical_size
.then(|| {
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
})
.transpose()?;
Ok(BranchInfo {
name,
timeline_id,
latest_valid_lsn: timeline.get_last_record_lsn(),
ancestor_id,
ancestor_lsn,
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental,
})
}
}
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
pub timelineid: ZTimelineId,
pub lsn: Lsn,
}
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
// Initialize logger
// use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
let _log_file = logging::init(LOG_FILE_NAME, true)?;
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
// process during repository initialization.
//
// FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
// initdb in the background, and it kept running even after the "zenith init" had exited.
// In tests, we started the page server immediately after that, so that initdb was still
// running in the background, and we failed to run initdb again in the same directory. This
// has been solved for the rapid init+start case now, but the general race condition remains
// if you restart the server quickly. The WAL redo manager doesn't use a separate thread
// anymore, but I think that could still happen.
let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {});
if let Some(tenantid) = create_tenant {
let tenantid = ZTenantId::from_str(tenantid)?;
println!("initializing tenantid {}", tenantid);
create_repo(conf, tenantid, dummy_redo_mgr).context("failed to create repo")?;
}
crashsafe_dir::create_dir_all(conf.tenants_path())?;
println!("pageserver init succeeded");
Ok(())
}
pub fn create_repo(
conf: &'static PageServerConf,
tenantid: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Arc<dyn Repository>> {
let repo_dir = conf.tenant_path(&tenantid);
if repo_dir.exists() {
bail!("repo for {} already exists", tenantid)
}
// top-level dir may exist if we are creating it through CLI
crashsafe_dir::create_dir_all(&repo_dir)
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
info!("created directory structure in {}", repo_dir.display());
// create a new timeline directory
let timeline_id = ZTimelineId::generate();
let timelinedir = conf.timeline_path(&timeline_id, &tenantid);
crashsafe_dir::create_dir(&timelinedir)?;
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
conf,
wal_redo_manager,
tenantid,
conf.remote_storage_config.is_some(),
));
// Load data into pageserver
// TODO To implement zenith import we need to
// move data loading out of create_repo()
bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
Ok(repo)
}
// Returns checkpoint LSN from controlfile
fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
// Read control file to extract the LSN
let controlfile_path = path.join("global").join("pg_control");
let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
let lsn = controlfile.checkPoint;
Ok(Lsn(lsn))
}
// Create the cluster temporarily in 'initdbpath' directory inside the repository
// to get bootstrap data for timeline initialization.
//
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
info!("running initdb in {}... ", initdbpath.display());
let initdb_path = conf.pg_bin_dir().join("initdb");
let initdb_output = Command::new(initdb_path)
.args(&["-D", initdbpath.to_str().unwrap()])
.args(&["-U", &conf.superuser])
.args(&["-E", "utf8"])
.arg("--no-instructions")
// This is only used for a temporary installation that is deleted shortly after,
// so no need to fsync it
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.stdout(Stdio::null())
.output()
.context("failed to execute initdb")?;
if !initdb_output.status.success() {
anyhow::bail!(
"initdb failed: '{}'",
String::from_utf8_lossy(&initdb_output.stderr)
);
}
Ok(())
}
//
// - run initdb to init temporary instance and get bootstrap data
// - after initialization complete, remove the temp dir.
//
fn bootstrap_timeline(
conf: &'static PageServerConf,
tenantid: ZTenantId,
tli: ZTimelineId,
repo: &dyn Repository,
) -> Result<()> {
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
// Init temporarily repo to get bootstrap data
run_initdb(conf, &initdb_path)?;
let pgdata_path = initdb_path;
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
// Import the contents of the data directory at the initial checkpoint
// LSN, and any WAL after that.
// Initdb lsn will be equal to last_record_lsn which will be set after import.
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
let timeline = repo.create_empty_timeline(tli, lsn)?;
import_datadir::import_timeline_from_postgres_datadir(
&pgdata_path,
timeline.writer().as_ref(),
lsn,
)?;
timeline.checkpoint(CheckpointConfig::Forced)?;
println!(
"created initial timeline {} timeline.lsn {}",
tli,
timeline.get_last_record_lsn()
);
let data = tli.to_string();
fs::write(conf.branch_path("main", &tenantid), data)?;
println!("created main branch");
// Remove temp dir. We don't need it anymore
fs::remove_dir_all(pgdata_path)?;
Ok(())
}
pub(crate) fn get_branches(
conf: &PageServerConf,
tenantid: &ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<BranchInfo>> {
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
// Each branch has a corresponding record (text file) in the refs/branches
// with timeline_id.
let branches_dir = conf.branches_path(tenantid);
std::fs::read_dir(&branches_dir)
.with_context(|| {
format!(
"Found no branches directory '{}' for tenant {}",
branches_dir.display(),
tenantid
)
})?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res.with_context(|| {
format!(
"Failed to list branches directory '{}' content for tenant {}",
branches_dir.display(),
tenantid
)
})?;
BranchInfo::from_path(
dir_entry.path(),
&repo,
include_non_incremental_logical_size,
)
})
.collect()
}
pub(crate) fn create_branch(
conf: &PageServerConf,
branchname: &str,
startpoint_str: &str,
tenantid: &ZTenantId,
) -> Result<BranchInfo> {
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
if conf.branch_path(branchname, tenantid).exists() {
anyhow::bail!("branch {} already exists", branchname);
}
let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
let timeline = repo
.get_timeline(startpoint.timelineid)?
.local_timeline()
.context("Cannot branch off the timeline that's not present locally")?;
if startpoint.lsn == Lsn(0) {
// Find end of WAL on the old timeline
let end_of_wal = timeline.get_last_record_lsn();
info!("branching at end of WAL: {}", end_of_wal);
startpoint.lsn = end_of_wal;
} else {
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
timeline.wait_lsn(startpoint.lsn)?;
}
startpoint.lsn = startpoint.lsn.align();
if timeline.get_ancestor_lsn() > startpoint.lsn {
// can we safely just branch from the ancestor instead?
anyhow::bail!(
"invalid startpoint {} for the branch {}: less than timeline ancestor lsn {:?}",
startpoint.lsn,
branchname,
timeline.get_ancestor_lsn()
);
}
let new_timeline_id = ZTimelineId::generate();
// Forward entire timeline creation routine to repository
// backend, so it can do all needed initialization
repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?;
// Remember the human-readable branch name for the new timeline.
// FIXME: there's a race condition, if you create a branch with the same
// name concurrently.
let data = new_timeline_id.to_string();
fs::write(conf.branch_path(branchname, tenantid), data)?;
Ok(BranchInfo {
name: branchname.to_string(),
timeline_id: new_timeline_id,
latest_valid_lsn: startpoint.lsn,
ancestor_id: Some(startpoint.timelineid.to_string()),
ancestor_lsn: Some(startpoint.lsn.to_string()),
current_logical_size: 0,
current_logical_size_non_incremental: Some(0),
})
}
//
// Parse user-given string that represents a point-in-time.
//
// We support multiple variants:
//
// Raw timeline id in hex, meaning the end of that timeline:
// bc62e7d612d0e6fe8f99a6dd2f281f9d
//
// A specific LSN on a timeline:
// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
//
// Same, with a human-friendly branch name:
// main
// main@2/15D3DD8
//
// Human-friendly tag name:
// mytag
//
//
fn parse_point_in_time(
conf: &PageServerConf,
s: &str,
tenantid: &ZTenantId,
) -> Result<PointInTime> {
let mut strings = s.split('@');
let name = strings.next().unwrap();
let lsn = strings
.next()
.map(Lsn::from_str)
.transpose()
.context("invalid LSN in point-in-time specification")?;
// Check if it's a tag
if lsn.is_none() {
let tagpath = conf.tag_path(name, tenantid);
if tagpath.exists() {
let pointstr = fs::read_to_string(tagpath)?;
return parse_point_in_time(conf, &pointstr, tenantid);
}
}
// Check if it's a branch
// Check if it's branch @ LSN
let branchpath = conf.branch_path(name, tenantid);
if branchpath.exists() {
let pointstr = fs::read_to_string(branchpath)?;
let mut result = parse_point_in_time(conf, &pointstr, tenantid)?;
result.lsn = lsn.unwrap_or(Lsn(0));
return Ok(result);
}
// Check if it's a timelineid
// Check if it's timelineid @ LSN
if let Ok(timelineid) = ZTimelineId::from_str(name) {
let tlipath = conf.timeline_path(&timelineid, tenantid);
if tlipath.exists() {
return Ok(PointInTime {
timelineid,
lsn: lsn.unwrap_or(Lsn(0)),
});
}
}
bail!("could not parse point-in-time {}", s);
}
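Putting the new file together: `create_branch` accepts any of the point-in-time spellings documented above as its `startpoint_str`, and `get_branches` reads the results back from the `refs/branches` files. A minimal in-crate sketch; `conf` and `tenantid` are assumed to come from the pageserver setup and are not shown in this diff:

```rust
// Sketch only: `conf: &'static PageServerConf` and `tenantid: ZTenantId` assumed.
// Branch from the current end of WAL on "main".
let tip = create_branch(conf, "migration_check", "main", &tenantid)?;
println!("{} starts at {}", tip.name, tip.latest_valid_lsn);

// Branch from an explicit LSN (same grammar parse_point_in_time accepts).
let pinned = create_branch(conf, "pinned_copy", "main@0/16F5830", &tenantid)?;
println!("{} starts at {}", pinned.name, pinned.latest_valid_lsn);

// List branches, skipping the heavier non-incremental size calculation.
for b in get_branches(conf, &tenantid, false)? {
    println!("{} -> timeline {}", b.name, b.timeline_id);
}
```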


@@ -392,6 +392,22 @@ impl PageServerConf {
self.tenants_path().join(tenantid.to_string())
}
pub fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join("refs").join("tags")
}
pub fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf {
self.tags_path(tenantid).join(tag_name)
}
pub fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join("refs").join("branches")
}
pub fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf {
self.branches_path(tenantid).join(branch_name)
}
pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
}
@@ -400,6 +416,10 @@ impl PageServerConf {
self.timelines_path(tenantid).join(timelineid.to_string())
}
pub fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
self.timeline_path(timelineid, tenantid).join("ancestor")
}
//
// Postgres distribution paths
//


@@ -1,121 +1,20 @@
use crate::timelines::TimelineInfo;
use anyhow::{anyhow, bail, Context};
use serde::{Deserialize, Serialize};
use zenith_utils::{
lsn::Lsn,
zid::{HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTimelineId},
};
use crate::ZTenantId;
use zenith_utils::zid::ZNodeId;
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub new_timeline_id: Option<HexZTimelineId>,
pub ancestor_timeline_id: Option<HexZTimelineId>,
pub ancestor_start_lsn: Option<Lsn>,
pub struct BranchCreateRequest {
#[serde(with = "hex")]
pub tenant_id: ZTenantId,
pub name: String,
pub start_point: String,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateRequest {
pub new_tenant_id: Option<HexZTenantId>,
}
#[derive(Serialize, Deserialize)]
pub struct TimelineInfoResponse {
pub kind: String,
#[serde(with = "hex")]
timeline_id: ZTimelineId,
#[serde(with = "hex")]
tenant_id: ZTenantId,
disk_consistent_lsn: String,
last_record_lsn: Option<String>,
prev_record_lsn: Option<String>,
ancestor_timeline_id: Option<HexZTimelineId>,
ancestor_lsn: Option<String>,
current_logical_size: Option<usize>,
current_logical_size_non_incremental: Option<usize>,
}
impl From<TimelineInfo> for TimelineInfoResponse {
fn from(other: TimelineInfo) -> Self {
match other {
TimelineInfo::Local {
timeline_id,
tenant_id,
last_record_lsn,
prev_record_lsn,
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn,
current_logical_size,
current_logical_size_non_incremental,
} => TimelineInfoResponse {
kind: "Local".to_owned(),
timeline_id,
tenant_id,
disk_consistent_lsn: disk_consistent_lsn.to_string(),
last_record_lsn: Some(last_record_lsn.to_string()),
prev_record_lsn: Some(prev_record_lsn.to_string()),
ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from),
ancestor_lsn: ancestor_lsn.map(|lsn| lsn.to_string()),
current_logical_size: Some(current_logical_size),
current_logical_size_non_incremental,
},
TimelineInfo::Remote {
timeline_id,
tenant_id,
disk_consistent_lsn,
} => TimelineInfoResponse {
kind: "Remote".to_owned(),
timeline_id,
tenant_id,
disk_consistent_lsn: disk_consistent_lsn.to_string(),
last_record_lsn: None,
prev_record_lsn: None,
ancestor_timeline_id: None,
ancestor_lsn: None,
current_logical_size: None,
current_logical_size_non_incremental: None,
},
}
}
}
impl TryFrom<TimelineInfoResponse> for TimelineInfo {
type Error = anyhow::Error;
fn try_from(other: TimelineInfoResponse) -> anyhow::Result<Self> {
let parse_lsn_hex_string = |lsn_string: String| {
lsn_string
.parse::<Lsn>()
.with_context(|| format!("Failed to parse Lsn as hex string from '{}'", lsn_string))
};
let disk_consistent_lsn = parse_lsn_hex_string(other.disk_consistent_lsn)?;
Ok(match other.kind.as_str() {
"Local" => TimelineInfo::Local {
timeline_id: other.timeline_id,
tenant_id: other.tenant_id,
last_record_lsn: other
.last_record_lsn
.ok_or(anyhow!("Local timeline should have last_record_lsn"))
.and_then(parse_lsn_hex_string)?,
prev_record_lsn: other
.prev_record_lsn
.ok_or(anyhow!("Local timeline should have prev_record_lsn"))
.and_then(parse_lsn_hex_string)?,
ancestor_timeline_id: other.ancestor_timeline_id.map(ZTimelineId::from),
ancestor_lsn: other.ancestor_lsn.map(parse_lsn_hex_string).transpose()?,
disk_consistent_lsn,
current_logical_size: other.current_logical_size.ok_or(anyhow!("No "))?,
current_logical_size_non_incremental: other.current_logical_size_non_incremental,
},
"Remote" => TimelineInfo::Remote {
timeline_id: other.timeline_id,
tenant_id: other.tenant_id,
disk_consistent_lsn,
},
unknown => bail!("Unknown timeline kind: {}", unknown),
})
}
pub tenant_id: ZTenantId,
}
#[derive(Serialize)]


@@ -22,7 +22,7 @@ paths:
properties:
id:
type: integer
/v1/tenant/{tenant_id}/timeline:
/v1/timeline/{tenant_id}:
parameters:
- name: tenant_id
in: path
@@ -30,22 +30,19 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get timelines for tenant
description: List tenant timelines
responses:
"200":
description: TimelineInfo
description: array of brief timeline descriptions
content:
application/json:
schema:
type: array
items:
$ref: "#/components/schemas/TimelineInfo"
# currently, just a timeline id string, but when remote index gets to be accessed
# remote/local timeline field would be added at least
type: string
"400":
description: Error when no tenant id found in path
content:
@@ -70,7 +67,7 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
/v1/timeline/{tenant_id}/{timeline_id}:
parameters:
- name: tenant_id
in: path
@@ -84,13 +81,8 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get info about the timeline
description: Get timeline info for tenant's remote timeline
responses:
"200":
description: TimelineInfo
@@ -99,7 +91,7 @@ paths:
schema:
$ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path or no timeline id
description: Error when no tenant id found in path or no branch name
content:
application/json:
schema:
@@ -122,7 +114,7 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/:
/v1/branch/{tenant_id}:
parameters:
- name: tenant_id
in: path
@@ -130,33 +122,24 @@ paths:
schema:
type: string
format: hex
post:
description: |
Create a timeline. Returns new timeline id on success.\
If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
requestBody:
content:
application/json:
schema:
type: object
properties:
new_timeline_id:
type: string
format: hex
ancestor_timeline_id:
type: string
format: hex
ancestor_start_lsn:
type: string
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
"201":
description: TimelineInfo
"200":
description: BranchInfo
content:
application/json:
schema:
$ref: "#/components/schemas/TimelineInfo"
type: array
items:
$ref: "#/components/schemas/BranchInfo"
"400":
description: Malformed timeline create request
description: Error when no tenant id found in path
content:
application/json:
schema:
@@ -173,12 +156,108 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409":
description: Timeline already exists, creation skipped
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/AlreadyExistsError"
$ref: "#/components/schemas/Error"
/v1/branch/{tenant_id}/{branch_name}:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: branch_name
in: path
required: true
schema:
type: string
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
"200":
description: BranchInfo
content:
application/json:
schema:
$ref: "#/components/schemas/BranchInfo"
"400":
description: Error when no tenant id found in path or no branch name
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/branch/:
post:
description: Create branch
requestBody:
content:
application/json:
schema:
type: object
required:
- "tenant_id"
- "name"
- "start_point"
properties:
tenant_id:
type: string
format: hex
name:
type: string
start_point:
type: string
responses:
"201":
description: BranchInfo
content:
application/json:
schema:
$ref: "#/components/schemas/BranchInfo"
"400":
description: Malformed branch create request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
@@ -216,26 +295,27 @@ paths:
schema:
$ref: "#/components/schemas/Error"
post:
description: |
Create a tenant. Returns new tenant id on success.\
If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant.
description: Create tenant
requestBody:
content:
application/json:
schema:
type: object
required:
- "tenant_id"
properties:
new_tenant_id:
tenant_id:
type: string
format: hex
responses:
"201":
description: New tenant created successfully
description: CREATED
content:
application/json:
schema:
type: string
format: hex
type: array
items:
type: string
"400":
description: Malformed tenant create request
content:
@@ -254,12 +334,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409":
description: Tenant already exists, creation skipped
content:
application/json:
schema:
$ref: "#/components/schemas/AlreadyExistsError"
"500":
description: Generic operation error
content:
@@ -284,11 +358,38 @@ components:
type: string
state:
type: string
BranchInfo:
type: object
required:
- name
- timeline_id
- latest_valid_lsn
- current_logical_size
properties:
name:
type: string
timeline_id:
type: string
format: hex
ancestor_id:
type: string
format: hex
ancestor_lsn:
type: string
current_logical_size:
type: integer
current_logical_size_non_incremental:
type: integer
latest_valid_lsn:
type: integer
TimelineInfo:
type: object
required:
- timeline_id
- tenant_id
- last_record_lsn
- prev_record_lsn
- start_lsn
- disk_consistent_lsn
properties:
timeline_id:
@@ -297,21 +398,19 @@ components:
tenant_id:
type: string
format: hex
ancestor_timeline_id:
type: string
format: hex
last_record_lsn:
type: string
prev_record_lsn:
type: string
ancestor_timeline_id:
type: string
format: hex
ancestor_lsn:
start_lsn:
type: string
disk_consistent_lsn:
type: string
current_logical_size:
type: integer
current_logical_size_non_incremental:
type: integer
timeline_state:
type: string
Error:
type: object
@@ -327,13 +426,6 @@ components:
properties:
msg:
type: string
AlreadyExistsError:
type: object
required:
- msg
properties:
msg:
type: string
ForbiddenError:
type: object
required:

View File

@@ -1,8 +1,9 @@
use std::sync::Arc;
use anyhow::Result;
use anyhow::{Context, Result};
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use serde::Serialize;
use tracing::*;
use zenith_utils::auth::JwtAuth;
use zenith_utils::http::endpoint::attach_openapi_ui;
@@ -13,17 +14,21 @@ use zenith_utils::http::{
endpoint,
error::HttpErrorBody,
json::{json_request, json_response},
request::get_request_param,
request::parse_request_param,
};
use zenith_utils::http::{RequestExt, RouterBuilder};
use zenith_utils::zid::{HexZTenantId, ZTimelineId};
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::HexZTimelineId;
use zenith_utils::zid::ZTimelineId;
use super::models::{
StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse,
};
use super::models::BranchCreateRequest;
use super::models::StatusResponse;
use super::models::TenantCreateRequest;
use crate::branches::BranchInfo;
use crate::repository::RepositoryTimeline;
use crate::timelines::TimelineInfo;
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
use crate::repository::TimelineSyncState;
use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId};
#[derive(Debug)]
struct State {
@@ -68,45 +73,23 @@ async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiErr
)?)
}
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let request_data: BranchCreateRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_id))?;
check_permission(&request, Some(request_data.tenant_id))?;
let new_timeline_info = tokio::task::spawn_blocking(move || {
let _enter = info_span!("/timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered();
timelines::create_timeline(
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
branches::create_branch(
get_config(&request),
tenant_id,
request_data.new_timeline_id.map(ZTimelineId::from),
request_data.ancestor_timeline_id.map(ZTimelineId::from),
request_data.ancestor_start_lsn,
&request_data.name,
&request_data.start_point,
&request_data.tenant_id,
)
})
.await
.map_err(ApiError::from_err)??;
Ok(match new_timeline_info {
Some(info) => json_response(StatusCode::CREATED, TimelineInfoResponse::from(info))?,
None => json_response(StatusCode::CONFLICT, ())?,
})
}
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let response_data: Vec<TimelineInfoResponse> = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
crate::timelines::get_timelines(tenant_id, include_non_incremental_logical_size)
})
.await
.map_err(ApiError::from_err)??
.into_iter()
.map(TimelineInfoResponse::from)
.collect();
Ok(json_response(StatusCode::OK, response_data)?)
Ok(json_response(StatusCode::CREATED, response_data)?)
}
// Gate non incremental logical size calculation behind a flag
@@ -124,6 +107,112 @@ fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
.unwrap_or(false)
}
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
check_permission(&request, Some(tenantid))?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
crate::branches::get_branches(
get_config(&request),
&tenantid,
include_non_incremental_logical_size,
)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
let conf = get_state(&request).conf;
let path = conf.branch_path(&branch_name, &tenantid);
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
BranchInfo::from_path(path, &repo, include_non_incremental_logical_size)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let conf = get_state(&request).conf;
let timelines_dir = conf.timelines_path(&tenant_id);
let mut timelines_dir_contents =
tokio::fs::read_dir(&timelines_dir).await.with_context(|| {
format!(
"Failed to list timelines dir '{}' contents",
timelines_dir.display()
)
})?;
let mut local_timelines = Vec::new();
while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| {
format!(
"Failed to list timelines dir '{}' contents",
timelines_dir.display()
)
})? {
let entry_path = entry.path();
let entry_type = entry.file_type().await.with_context(|| {
format!(
"Failed to get file type of timeline dirs' entry '{}'",
entry_path.display()
)
})?;
if entry_type.is_dir() {
match entry.file_name().to_string_lossy().parse::<ZTimelineId>() {
Ok(timeline_id) => local_timelines.push(timeline_id.to_string()),
Err(e) => error!(
"Failed to get parse timeline id from timeline dirs' entry '{}': {}",
entry_path.display(),
e
),
}
}
}
Ok(json_response(StatusCode::OK, local_timelines)?)
}
#[derive(Debug, Serialize)]
#[serde(tag = "type")]
enum TimelineInfo {
Local {
#[serde(with = "hex")]
timeline_id: ZTimelineId,
#[serde(with = "hex")]
tenant_id: ZTenantId,
ancestor_timeline_id: Option<HexZTimelineId>,
last_record_lsn: Lsn,
prev_record_lsn: Lsn,
disk_consistent_lsn: Lsn,
timeline_state: Option<TimelineSyncState>,
},
Remote {
#[serde(with = "hex")]
timeline_id: ZTimelineId,
#[serde(with = "hex")]
tenant_id: ZTenantId,
},
}
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -135,17 +224,26 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let include_non_incremental_logical_size =
get_include_non_incremental_logical_size(&request);
Ok::<_, anyhow::Error>(TimelineInfo::from_repo_timeline(
tenant_id,
repo.get_timeline(timeline_id)?,
include_non_incremental_logical_size,
))
Ok::<_, anyhow::Error>(match repo.get_timeline(timeline_id)?.local_timeline() {
None => TimelineInfo::Remote {
timeline_id,
tenant_id,
},
Some(timeline) => TimelineInfo::Local {
timeline_id,
tenant_id,
ancestor_timeline_id: timeline
.get_ancestor_timeline_id()
.map(HexZTimelineId::from),
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(),
timeline_state: repo.get_timeline_state(timeline_id),
},
})
})
.await
.map_err(ApiError::from_err)?
.map(TimelineInfoResponse::from)?;
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
@@ -162,7 +260,7 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
match repo.get_timeline(timeline_id)? {
RepositoryTimeline::Local { .. } => {
RepositoryTimeline::Local(_) => {
anyhow::bail!("Timeline with id {} is already local", timeline_id)
}
RepositoryTimeline::Remote {
@@ -222,20 +320,13 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let new_tenant_id = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = ?request_data.new_tenant_id).entered();
tenant_mgr::create_tenant_repository(
get_config(&request),
request_data.new_tenant_id.map(ZTenantId::from),
)
tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
})
.await
.map_err(ApiError::from_err)??;
Ok(match new_tenant_id {
Some(id) => json_response(StatusCode::CREATED, HexZTenantId::from(id))?,
None => json_response(StatusCode::CONFLICT, ())?,
})
Ok(json_response(StatusCode::CREATED, ())?)
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -265,21 +356,23 @@ pub fn make_router(
router
.data(Arc::new(State::new(conf, auth)))
.get("/v1/status", status_handler)
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
.get("/v1/timeline/:tenant_id", timeline_list_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
"/v1/timeline/:tenant_id/:timeline_id",
timeline_detail_handler,
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
"/v1/timeline/:tenant_id/:timeline_id/attach",
timeline_attach_handler,
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
"/v1/timeline/:tenant_id/:timeline_id/detach",
timeline_detach_handler,
)
.get("/v1/branch/:tenant_id", branch_list_handler)
.get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
.post("/v1/branch", branch_create_handler)
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)
.any(handler_404)
}
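For reference, the new branch routes registered above can be exercised over plain HTTP. Below is a minimal client-side sketch of creating a branch; the pageserver address, tenant id and branch names are invented for illustration, the `reqwest` and `serde_json` crates are assumed, and the request body mirrors the BranchCreateRequest fields tenant_id, name and start_point.

    // Hedged sketch only: host, port, tenant id and branch names are made up.
    use serde_json::json;

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        let client = reqwest::Client::new();
        let resp = client
            .post("http://127.0.0.1:9898/v1/branch")
            .json(&json!({
                "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
                "name": "new1",
                "start_point": "main"
            }))
            .send()
            .await?;
        // branch_create_handler above replies 201 CREATED with a BranchInfo body.
        assert_eq!(resp.status(), reqwest::StatusCode::CREATED);
        println!("{}", resp.text().await?);
        Ok(())
    }

Listing and inspecting branches would then go through GET /v1/branch/{tenant_id} and GET /v1/branch/{tenant_id}/{branch_name}, matching the routes registered above.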

View File

@@ -137,20 +137,19 @@ pub struct LayeredRepository {
/// Public interface
impl Repository for LayeredRepository {
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline> {
Ok(RepositoryTimeline::from(self.get_or_init_timeline(
timelineid,
&mut self.timelines.lock().unwrap(),
)?))
}
fn list_timelines(&self) -> Result<Vec<RepositoryTimeline>> {
Ok(self
.timelines
.lock()
.unwrap()
.values()
.map(|timeline_entry| RepositoryTimeline::from(timeline_entry.clone()))
.collect())
let mut timelines = self.timelines.lock().unwrap();
Ok(
match self.get_or_init_timeline(timelineid, &mut timelines)? {
LayeredTimelineEntry::Local(local) => RepositoryTimeline::Local(local),
LayeredTimelineEntry::Remote {
id,
disk_consistent_lsn,
} => RepositoryTimeline::Remote {
id,
disk_consistent_lsn,
},
},
)
}
fn create_empty_timeline(
@@ -429,24 +428,6 @@ impl LayeredTimelineEntry {
}
}
impl From<LayeredTimelineEntry> for RepositoryTimeline {
fn from(layered_timeline: LayeredTimelineEntry) -> Self {
match layered_timeline {
LayeredTimelineEntry::Local(timeline) => RepositoryTimeline::Local {
id: timeline.timelineid,
timeline,
},
LayeredTimelineEntry::Remote {
id,
disk_consistent_lsn,
} => RepositoryTimeline::Remote {
id,
disk_consistent_lsn,
},
}
}
}
/// Private functions
impl LayeredRepository {
// Implementation of the public `get_timeline` function. This differs from the public

View File

@@ -1,4 +1,5 @@
pub mod basebackup;
pub mod branches;
pub mod config;
pub mod http;
pub mod import_datadir;
@@ -11,7 +12,6 @@ pub mod repository;
pub mod tenant_mgr;
pub mod tenant_threads;
pub mod thread_mgr;
pub mod timelines;
pub mod virtual_file;
pub mod walingest;
pub mod walreceiver;

View File

@@ -62,3 +62,11 @@ Based on previous evaluation, even `rusoto-s3` could be a better choice over thi
So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage.
Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.
* branches implementation could be improved
Currently, there's code to sync the branches along with the timeline files: on upload, every local branch file that is missing remotely is uploaded;
on timeline download, missing remote branch files are downloaded.
A branch is a per-tenant entity, yet the current implementation requires synchronizing a timeline first to get the branch files locally.
Currently, there's no other way to learn about the remote branch files, and the file contents are neither verified nor updated.

View File

@@ -14,6 +14,13 @@
//! Only GC removes local timeline files, the GC support is not added to sync currently,
//! yet downloading extra files is not critically bad at this stage, GC can remove those again.
//!
//! Along with the timeline files, branch files are uploaded and downloaded every time a corresponding sync task is processed.
//! For simplicity, branch files are also treated as immutable: only missing files are uploaded or downloaded; no removals, amendments or file content checks are done.
//! Also, the branches are copied as separate files, with no extra compression applied.
//! Despite branch information currently belonging to tenants, a tenant's timeline sync is required to upload or download the branch files; also, there's no way to know
//! the branch sync state outside of the sync loop.
//! This implementation is currently considered temporary and is subject to change later.
//!
//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via listing the remote storage contents.
//! It's enough to poll the remote state once on startup only, due to agreement that the pageserver has
//! an exclusive write access to the remote storage: new files appear in the storage only after the same
@@ -59,6 +66,7 @@
//! NOTE: No real contents or checksum check happens right now and is a subject to improve later.
//!
//! After the whole timeline is downloaded, the [`crate::tenant_mgr::set_timeline_states`] function is used to update the pageserver memory state for the processed timeline.
//! No extra branch registration is done.
//!
//! When the pageserver signals shutdown, the current sync task is finished and the loop exits.
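As a rough illustration of the "only missing files are transferred" rule from the module docs above, the branch file sync amounts to a set difference between local and remote relative paths. A simplified sketch with stand-in types (not the actual pageserver API):

    use std::collections::HashSet;
    use std::path::PathBuf;

    // Stand-in for RelativePath: branch file paths relative to the tenant's branches dir.
    type BranchPath = PathBuf;

    // Which branch files to upload and which to download, given local and remote listings.
    // No removals, amendments or content checks, matching the immutability assumption above.
    fn branch_sync_plan(
        local: &HashSet<BranchPath>,
        remote: &HashSet<BranchPath>,
    ) -> (Vec<BranchPath>, Vec<BranchPath>) {
        let to_upload: Vec<_> = local.difference(remote).cloned().collect();
        let to_download: Vec<_> = remote.difference(local).cloned().collect();
        (to_upload, to_download)
    }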
@@ -69,7 +77,7 @@ pub mod index;
mod upload;
use std::{
collections::{BTreeSet, HashMap, VecDeque},
collections::{BTreeSet, HashMap, HashSet, VecDeque},
num::{NonZeroU32, NonZeroUsize},
path::{Path, PathBuf},
sync::Arc,
@@ -79,6 +87,7 @@ use anyhow::{bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use lazy_static::lazy_static;
use tokio::{
fs,
runtime::Runtime,
sync::{
mpsc::{self, UnboundedReceiver},
@@ -92,7 +101,8 @@ use self::{
compression::ArchiveHeader,
download::{download_timeline, DownloadedTimeline},
index::{
ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry,
ArchiveDescription, ArchiveId, RelativePath, RemoteTimeline, RemoteTimelineIndex,
TimelineIndexEntry,
},
upload::upload_timeline_checkpoint,
};
@@ -833,6 +843,28 @@ async fn download_archive_header<
Ok(header)
}
async fn tenant_branch_files(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
) -> anyhow::Result<HashSet<RelativePath>> {
let branches_dir = conf.branches_path(&tenant_id);
if !branches_dir.exists() {
return Ok(HashSet::new());
}
let mut branch_entries = fs::read_dir(&branches_dir)
.await
.context("Failed to list tenant branches dir contents")?;
let mut branch_files = HashSet::new();
while let Some(branch_entry) = branch_entries.next_entry().await? {
if branch_entry.file_type().await?.is_file() {
branch_files.insert(RelativePath::new(&branches_dir, branch_entry.path())?);
}
}
Ok(branch_files)
}
#[cfg(test)]
mod test_utils {
use std::{
@@ -939,9 +971,30 @@ mod test_utils {
"Index contains unexpected sync ids"
);
let mut actual_branches = BTreeMap::new();
let mut expected_branches = BTreeMap::new();
let mut actual_timeline_entries = BTreeMap::new();
let mut expected_timeline_entries = BTreeMap::new();
for sync_id in actual_sync_ids {
actual_branches.insert(
sync_id.tenant_id,
index_read
.branch_files(sync_id.tenant_id)
.into_iter()
.flat_map(|branch_paths| branch_paths.iter())
.cloned()
.collect::<BTreeSet<_>>(),
);
expected_branches.insert(
sync_id.tenant_id,
expected_index_with_descriptions
.branch_files(sync_id.tenant_id)
.into_iter()
.flat_map(|branch_paths| branch_paths.iter())
.cloned()
.collect::<BTreeSet<_>>(),
);
actual_timeline_entries.insert(
sync_id,
index_read.timeline_entry(&sync_id).unwrap().clone(),
@@ -956,6 +1009,11 @@ mod test_utils {
}
drop(index_read);
assert_eq!(
actual_branches, expected_branches,
"Index contains unexpected branches"
);
for (sync_id, actual_timeline_entry) in actual_timeline_entries {
let expected_timeline_description = expected_timeline_entries
.remove(&sync_id)

View File

@@ -1,8 +1,10 @@
//! Timeline synchronization logic to put files from archives on remote storage into the pageserver's local directory.
//! Currently, tenant branch files are also downloaded, but this does not appear final.
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
use anyhow::{ensure, Context};
use futures::{stream::FuturesUnordered, StreamExt};
use tokio::{fs, sync::RwLock};
use tracing::{debug, error, trace, warn};
use zenith_utils::{lsn::Lsn, zid::ZTenantId};
@@ -12,8 +14,8 @@ use crate::{
layered_repository::metadata::{metadata_path, TimelineMetadata},
remote_storage::{
storage_sync::{
compression, index::TimelineIndexEntry, sync_queue, update_index_description, SyncKind,
SyncTask,
compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files,
update_index_description, SyncKind, SyncTask,
},
RemoteStorage, ZTenantTimelineId,
},
@@ -40,6 +42,8 @@ pub(super) enum DownloadedTimeline {
/// Timeline files that already exist locally are skipped during the download, but the local metadata file is
/// updated at the end of every checkpoint archive extraction.
///
/// Before any archives are considered, the branch files are checked locally and remotely, and all remote-only files are downloaded.
///
/// On an error, bumps the retries count and reschedules the download, with updated archive skip list
/// (for any new successful archive downloads and extractions).
pub(super) async fn download_timeline<
@@ -109,6 +113,22 @@ pub(super) async fn download_timeline<
}
};
if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.tenant_id).await
{
error!(
"Failed to download missing branches for sync id {}: {:?}",
sync_id, e
);
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Download(download),
));
return DownloadedTimeline::FailedAndRescheduled {
disk_consistent_lsn,
};
}
debug!("Downloading timeline archives");
let archives_to_download = remote_timeline
.checkpoints()
@@ -230,6 +250,82 @@ async fn read_local_metadata(
.context("Failed to read local metadata files bytes")?)
}
async fn download_missing_branches<
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
conf: &'static PageServerConf,
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
tenant_id: ZTenantId,
) -> anyhow::Result<()> {
let local_branches = tenant_branch_files(conf, tenant_id)
.await
.context("Failed to list local branch files for the tenant")?;
let local_branches_dir = conf.branches_path(&tenant_id);
if !local_branches_dir.exists() {
fs::create_dir_all(&local_branches_dir)
.await
.with_context(|| {
format!(
"Failed to create local branches directory at path '{}'",
local_branches_dir.display()
)
})?;
}
if let Some(remote_branches) = index.read().await.branch_files(tenant_id) {
let mut remote_only_branches_downloads = remote_branches
.difference(&local_branches)
.map(|remote_only_branch| async move {
let branches_dir = conf.branches_path(&tenant_id);
let remote_branch_path = remote_only_branch.as_path(&branches_dir);
let storage_path =
storage.storage_path(&remote_branch_path).with_context(|| {
format!(
"Failed to derive a storage path for branch with local path '{}'",
remote_branch_path.display()
)
})?;
let mut target_file = fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(&remote_branch_path)
.await
.with_context(|| {
format!(
"Failed to create local branch file at '{}'",
remote_branch_path.display()
)
})?;
storage
.download(&storage_path, &mut target_file)
.await
.with_context(|| {
format!(
"Failed to download branch file from the remote path {:?}",
storage_path
)
})?;
Ok::<_, anyhow::Error>(())
})
.collect::<FuturesUnordered<_>>();
let mut branch_downloads_failed = false;
while let Some(download_result) = remote_only_branches_downloads.next().await {
if let Err(e) = download_result {
branch_downloads_failed = true;
error!("Failed to download a branch file: {:?}", e);
}
}
ensure!(
!branch_downloads_failed,
"Failed to download all branch files"
);
}
Ok(())
}
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;

View File

@@ -5,7 +5,7 @@
//! This way in the future, the index could be restored fast from its serialized stored form.
use std::{
collections::{BTreeMap, BTreeSet, HashMap},
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
path::{Path, PathBuf},
};
@@ -49,9 +49,10 @@ impl RelativePath {
}
/// An index to track tenant files that exist on the remote storage.
/// Currently, timeline archive files are tracked only.
/// Currently, timeline archives and branch files are tracked.
#[derive(Debug, Clone)]
pub struct RemoteTimelineIndex {
branch_files: HashMap<ZTenantId, HashSet<RelativePath>>,
timeline_files: HashMap<ZTenantTimelineId, TimelineIndexEntry>,
}
@@ -64,6 +65,7 @@ impl RemoteTimelineIndex {
paths: impl Iterator<Item = P>,
) -> Self {
let mut index = Self {
branch_files: HashMap::new(),
timeline_files: HashMap::new(),
};
for path in paths {
@@ -96,6 +98,17 @@ impl RemoteTimelineIndex {
pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
self.timeline_files.keys().copied()
}
pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) {
self.branch_files
.entry(tenant_id)
.or_insert_with(HashSet::new)
.insert(path);
}
pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet<RelativePath>> {
self.branch_files.get(&tenant_id)
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -293,9 +306,20 @@ fn try_parse_index_entry(
.parse::<ZTenantId>()
.with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
let branches_path = conf.branches_path(&tenant_id);
let timelines_path = conf.timelines_path(&tenant_id);
match path.strip_prefix(&timelines_path) {
Ok(timelines_subpath) => {
match (
RelativePath::new(&branches_path, &path),
path.strip_prefix(&timelines_path),
) {
(Ok(_), Ok(_)) => bail!(
"Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes",
path.display(),
branches_path.display(),
timelines_path.display()
),
(Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry),
(Err(_), Ok(timelines_subpath)) => {
let mut segments = timelines_subpath.iter();
let timeline_id = segments
.next()
@@ -351,10 +375,11 @@ fn try_parse_index_entry(
}
}
}
Err(timelines_strip_error) => {
(Err(branches_error), Err(timelines_strip_error)) => {
bail!(
"Path '{}' is not an archive entry '{}'",
"Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'",
path.display(),
branches_error,
timelines_strip_error,
)
}

View File

@@ -1,10 +1,13 @@
//! Timeline synchronization logic to compress all new timeline files from the checkpoints and upload them to the remote storage.
//! Currently, tenant branch files are also uploaded, but this does not appear final.
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
use anyhow::ensure;
use tokio::sync::RwLock;
use anyhow::{ensure, Context};
use futures::{stream::FuturesUnordered, StreamExt};
use tokio::{fs, sync::RwLock};
use tracing::{debug, error, warn};
use zenith_utils::zid::ZTenantId;
use crate::{
config::PageServerConf,
@@ -12,7 +15,7 @@ use crate::{
storage_sync::{
compression,
index::{RemoteTimeline, TimelineIndexEntry},
sync_queue, update_index_description, SyncKind, SyncTask,
sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask,
},
RemoteStorage, ZTenantTimelineId,
},
@@ -23,6 +26,8 @@ use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoin
/// Attempts to compress and upload given checkpoint files.
/// No extra checks for overlapping files are made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
///
/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely.
///
/// On an error, bumps the retries count and reschedules the entire task.
/// On success, populates index data with new downloads.
pub(super) async fn upload_timeline_checkpoint<
@@ -36,6 +41,19 @@ pub(super) async fn upload_timeline_checkpoint<
retries: u32,
) -> Option<bool> {
debug!("Uploading checkpoint for sync id {}", sync_id);
if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.tenant_id).await
{
error!(
"Failed to upload missing branches for sync id {}: {:?}",
sync_id, e
);
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Upload(new_checkpoint),
));
return Some(false);
}
let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();
let index = &remote_assets.1;
@@ -183,6 +201,76 @@ async fn try_upload_checkpoint<
.map(|(header, header_size, _)| (header, header_size))
}
async fn upload_missing_branches<
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
config: &'static PageServerConf,
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
tenant_id: ZTenantId,
) -> anyhow::Result<()> {
let local_branches = tenant_branch_files(config, tenant_id)
.await
.context("Failed to list local branch files for the tenant")?;
let index_read = index.read().await;
let remote_branches = index_read
.branch_files(tenant_id)
.cloned()
.unwrap_or_default();
drop(index_read);
let mut branch_uploads = local_branches
.difference(&remote_branches)
.map(|local_only_branch| async move {
let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id));
let storage_path = storage.storage_path(&local_branch_path).with_context(|| {
format!(
"Failed to derive a storage path for branch with local path '{}'",
local_branch_path.display()
)
})?;
let local_branch_file = fs::OpenOptions::new()
.read(true)
.open(&local_branch_path)
.await
.with_context(|| {
format!(
"Failed to open local branch file {} for reading",
local_branch_path.display()
)
})?;
storage
.upload(local_branch_file, &storage_path)
.await
.with_context(|| {
format!(
"Failed to upload branch file to the remote path {:?}",
storage_path
)
})?;
Ok::<_, anyhow::Error>(local_only_branch)
})
.collect::<FuturesUnordered<_>>();
let mut branch_uploads_failed = false;
while let Some(upload_result) = branch_uploads.next().await {
match upload_result {
Ok(local_only_branch) => index
.write()
.await
.add_branch_file(tenant_id, local_only_branch.clone()),
Err(e) => {
error!("Failed to upload branch file: {:?}", e);
branch_uploads_failed = true;
}
}
}
ensure!(!branch_uploads_failed, "Failed to upload all branch files");
Ok(())
}
#[cfg(test)]
mod tests {
use tempfile::tempdir;

View File

@@ -36,10 +36,6 @@ pub trait Repository: Send + Sync {
/// Get Timeline handle for given zenith timeline ID.
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline>;
/// Lists timelines the repository contains.
/// Up to repository's implementation to omit certain timelines that ar not considered ready for use.
fn list_timelines(&self) -> Result<Vec<RepositoryTimeline>>;
/// Create a new, empty timeline. The caller is responsible for loading data into it
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
fn create_empty_timeline(
@@ -76,10 +72,7 @@ pub trait Repository: Send + Sync {
pub enum RepositoryTimeline {
/// Timeline, with its files present locally in pageserver's working directory.
/// Loaded into pageserver's memory and ready to be used.
Local {
id: ZTimelineId,
timeline: Arc<dyn Timeline>,
},
Local(Arc<dyn Timeline>),
/// Timeline, found on the pageserver's remote storage, but not yet downloaded locally.
Remote {
id: ZTimelineId,
@@ -90,24 +83,17 @@ pub enum RepositoryTimeline {
impl RepositoryTimeline {
pub fn local_timeline(&self) -> Option<Arc<dyn Timeline>> {
if let Self::Local { timeline, .. } = self {
Some(Arc::clone(timeline))
if let Self::Local(local_timeline) = self {
Some(Arc::clone(local_timeline))
} else {
None
}
}
pub fn id(&self) -> ZTimelineId {
match self {
Self::Local { id, .. } => *id,
Self::Remote { id, .. } => *id,
}
}
}
/// A state of the timeline synchronization with the remote storage.
/// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
pub enum TimelineSyncState {
/// No further downloads from the remote storage are needed.
/// The timeline state is up-to-date or ahead of the remote storage one,
@@ -404,6 +390,7 @@ pub mod repo_harness {
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
fs::create_dir_all(conf.branches_path(&tenant_id))?;
Ok(Self { conf, tenant_id })
}

View File

@@ -1,19 +1,19 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use crate::branches;
use crate::config::PageServerConf;
use crate::layered_repository::LayeredRepository;
use crate::repository::{Repository, Timeline, TimelineSyncState};
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
use crate::timelines;
use crate::walredo::PostgresRedoManager;
use crate::CheckpointConfig;
use anyhow::{Context, Result};
use anyhow::{bail, Context, Result};
use lazy_static::lazy_static;
use log::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::collections::{hash_map, HashMap};
use std::fmt;
use std::sync::{Arc, Mutex, MutexGuard};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
@@ -177,27 +177,24 @@ pub fn shutdown_all_tenants() {
}
}
pub fn create_tenant_repository(
pub fn create_repository_for_tenant(
conf: &'static PageServerConf,
new_tenant_id: Option<ZTenantId>,
) -> Result<Option<ZTenantId>> {
let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate);
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id));
match timelines::create_repo(conf, new_tenant_id, wal_redo_manager)? {
Some(repo) => {
access_tenants()
.entry(new_tenant_id)
.or_insert_with(|| Tenant {
state: TenantState::Idle,
repo,
});
Ok(Some(new_tenant_id))
}
None => {
debug!("repository already exists for tenant {}", new_tenant_id);
Ok(None)
tenantid: ZTenantId,
) -> Result<()> {
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
match access_tenants().entry(tenantid) {
hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid),
hash_map::Entry::Vacant(v) => {
v.insert(Tenant {
state: TenantState::Idle,
repo,
});
}
}
Ok(())
}
pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {

View File

@@ -1,408 +0,0 @@
//!
//! Timeline management code
//
use anyhow::{anyhow, bail, Context, Result};
use postgres_ffi::ControlFileData;
use std::{
fs,
path::Path,
process::{Command, Stdio},
sync::Arc,
};
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{crashsafe_dir, logging};
use crate::{config::PageServerConf, repository::Repository};
use crate::{import_datadir, LOG_FILE_NAME};
use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager};
use crate::{repository::RepositoryTimeline, tenant_mgr};
use crate::{repository::Timeline, CheckpointConfig};
#[derive(Clone)]
pub enum TimelineInfo {
Local {
timeline_id: ZTimelineId,
tenant_id: ZTenantId,
last_record_lsn: Lsn,
prev_record_lsn: Lsn,
ancestor_timeline_id: Option<ZTimelineId>,
ancestor_lsn: Option<Lsn>,
disk_consistent_lsn: Lsn,
current_logical_size: usize,
current_logical_size_non_incremental: Option<usize>,
},
Remote {
timeline_id: ZTimelineId,
tenant_id: ZTenantId,
disk_consistent_lsn: Lsn,
},
}
impl TimelineInfo {
pub fn from_repo_timeline(
tenant_id: ZTenantId,
repo_timeline: RepositoryTimeline,
include_non_incremental_logical_size: bool,
) -> Self {
match repo_timeline {
RepositoryTimeline::Local { id, timeline } => {
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
let ancestor_lsn = if ancestor_timeline_id.is_some() {
Some(timeline.get_ancestor_lsn())
} else {
None
};
Self::Local {
timeline_id: id,
tenant_id,
last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(),
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental: get_current_logical_size_non_incremental(
include_non_incremental_logical_size,
timeline.as_ref(),
),
}
}
RepositoryTimeline::Remote {
id,
disk_consistent_lsn,
} => Self::Remote {
timeline_id: id,
tenant_id,
disk_consistent_lsn,
},
}
}
pub fn from_dyn_timeline(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
timeline: &dyn Timeline,
include_non_incremental_logical_size: bool,
) -> Self {
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
let ancestor_lsn = if ancestor_timeline_id.is_some() {
Some(timeline.get_ancestor_lsn())
} else {
None
};
Self::Local {
timeline_id,
tenant_id,
last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(),
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental: get_current_logical_size_non_incremental(
include_non_incremental_logical_size,
timeline,
),
}
}
pub fn timeline_id(&self) -> ZTimelineId {
match *self {
TimelineInfo::Local { timeline_id, .. } => timeline_id,
TimelineInfo::Remote { timeline_id, .. } => timeline_id,
}
}
pub fn tenant_id(&self) -> ZTenantId {
match *self {
TimelineInfo::Local { tenant_id, .. } => tenant_id,
TimelineInfo::Remote { tenant_id, .. } => tenant_id,
}
}
}
fn get_current_logical_size_non_incremental(
include_non_incremental_logical_size: bool,
timeline: &dyn Timeline,
) -> Option<usize> {
if !include_non_incremental_logical_size {
return None;
}
match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) {
Ok(size) => Some(size),
Err(e) => {
error!("Failed to get non-incremental logical size: {:?}", e);
None
}
}
}
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
pub timeline_id: ZTimelineId,
pub lsn: Lsn,
}
pub fn init_pageserver(
conf: &'static PageServerConf,
create_tenant: Option<ZTenantId>,
initial_timeline_id: Option<ZTimelineId>,
) -> anyhow::Result<()> {
// Initialize logger
// use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
let _log_file = logging::init(LOG_FILE_NAME, true)?;
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
// process during repository initialization.
//
// FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
// initdb in the background, and it kept running even after the "zenith init" had exited.
// In tests, we started the page server immediately after that, so that initdb was still
// running in the background, and we failed to run initdb again in the same directory. This
// has been solved for the rapid init+start case now, but the general race condition remains
// if you restart the server quickly. The WAL redo manager doesn't use a separate thread
// anymore, but I think that could still happen.
let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {});
crashsafe_dir::create_dir_all(conf.tenants_path())?;
if let Some(tenant_id) = create_tenant {
println!("initializing tenantid {}", tenant_id);
let repo = create_repo(conf, tenant_id, dummy_redo_mgr)
.context("failed to create repo")?
.ok_or_else(|| anyhow!("For newely created pageserver, found already existing repository for tenant {}", tenant_id))?;
let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())
.context("failed to create initial timeline")?;
println!("initial timeline {} created", new_timeline_id)
} else if initial_timeline_id.is_some() {
println!("Ignoring initial timeline parameter, due to no tenant id to create given");
}
println!("pageserver init succeeded");
Ok(())
}
pub fn create_repo(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Option<Arc<dyn Repository>>> {
let repo_dir = conf.tenant_path(&tenant_id);
if repo_dir.exists() {
debug!("repo for {} already exists", tenant_id);
return Ok(None);
}
// top-level dir may exist if we are creating it through CLI
crashsafe_dir::create_dir_all(&repo_dir)
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?;
info!("created directory structure in {}", repo_dir.display());
Ok(Some(Arc::new(LayeredRepository::new(
conf,
wal_redo_manager,
tenant_id,
conf.remote_storage_config.is_some(),
))))
}
// Returns checkpoint LSN from controlfile
fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
// Read control file to extract the LSN
let controlfile_path = path.join("global").join("pg_control");
let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
let lsn = controlfile.checkPoint;
Ok(Lsn(lsn))
}
// Create the cluster temporarily in 'initdbpath' directory inside the repository
// to get bootstrap data for timeline initialization.
//
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
info!("running initdb in {}... ", initdbpath.display());
let initdb_path = conf.pg_bin_dir().join("initdb");
let initdb_output = Command::new(initdb_path)
.args(&["-D", initdbpath.to_str().unwrap()])
.args(&["-U", &conf.superuser])
.args(&["-E", "utf8"])
.arg("--no-instructions")
// This is only used for a temporary installation that is deleted shortly after,
// so no need to fsync it
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.stdout(Stdio::null())
.output()
.context("failed to execute initdb")?;
if !initdb_output.status.success() {
bail!(
"initdb failed: '{}'",
String::from_utf8_lossy(&initdb_output.stderr)
);
}
Ok(())
}
//
// - run initdb to init temporary instance and get bootstrap data
// - after initialization complete, remove the temp dir.
//
fn bootstrap_timeline(
conf: &'static PageServerConf,
tenantid: ZTenantId,
tli: ZTimelineId,
repo: &dyn Repository,
) -> Result<Arc<dyn Timeline>> {
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
// Init temporarily repo to get bootstrap data
run_initdb(conf, &initdb_path)?;
let pgdata_path = initdb_path;
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
// Import the contents of the data directory at the initial checkpoint
// LSN, and any WAL after that.
// Initdb lsn will be equal to last_record_lsn which will be set after import.
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
let timeline = repo.create_empty_timeline(tli, lsn)?;
import_datadir::import_timeline_from_postgres_datadir(
&pgdata_path,
timeline.writer().as_ref(),
lsn,
)?;
timeline.checkpoint(CheckpointConfig::Forced)?;
println!(
"created initial timeline {} timeline.lsn {}",
tli,
timeline.get_last_record_lsn()
);
// Remove temp dir. We don't need it anymore
fs::remove_dir_all(pgdata_path)?;
Ok(timeline)
}
pub(crate) fn get_timelines(
tenant_id: ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<TimelineInfo>> {
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
.with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
Ok(repo
.list_timelines()
.with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))?
.into_iter()
.filter_map(|timeline| match timeline {
RepositoryTimeline::Local { timeline, id } => Some((id, timeline)),
RepositoryTimeline::Remote { .. } => None,
})
.map(|(timeline_id, timeline)| {
TimelineInfo::from_dyn_timeline(
tenant_id,
timeline_id,
timeline.as_ref(),
include_non_incremental_logical_size,
)
})
.collect())
}
pub(crate) fn create_timeline(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
new_timeline_id: Option<ZTimelineId>,
ancestor_timeline_id: Option<ZTimelineId>,
ancestor_start_lsn: Option<Lsn>,
) -> Result<Option<TimelineInfo>> {
let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
if conf.timeline_path(&new_timeline_id, &tenant_id).exists() {
match repo.get_timeline(new_timeline_id)? {
RepositoryTimeline::Local { id, .. } => {
debug!("timeline {} already exists", id);
return Ok(None);
}
RepositoryTimeline::Remote { id, .. } => bail!(
"timeline {} already exists in pageserver's remote storage",
id
),
}
}
let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
let new_timeline_info = match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = repo
.get_timeline(ancestor_timeline_id)
.with_context(|| format!("Cannot get ancestor timeline {}", ancestor_timeline_id))?
.local_timeline()
.with_context(|| {
format!(
"Cannot branch off the timeline {} that's not present locally",
ancestor_timeline_id
)
})?;
if start_lsn == Lsn(0) {
// Find end of WAL on the old timeline
let end_of_wal = ancestor_timeline.get_last_record_lsn();
info!("branching at end of WAL: {}", end_of_wal);
start_lsn = end_of_wal;
} else {
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline.wait_lsn(start_lsn)?;
}
start_lsn = start_lsn.align();
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > start_lsn {
// can we safely just branch from the ancestor instead?
anyhow::bail!(
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
start_lsn,
ancestor_timeline_id,
ancestor_ancestor_lsn,
);
}
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
// load the timeline into memory
let loaded_timeline = repo.get_timeline(new_timeline_id)?;
TimelineInfo::from_repo_timeline(tenant_id, loaded_timeline, false)
}
None => {
let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
TimelineInfo::from_dyn_timeline(
tenant_id,
new_timeline_id,
new_timeline.as_ref(),
false,
)
}
};
Ok(Some(new_timeline_info))
}

View File

@@ -21,7 +21,6 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoken Rust code.
use chrono::format::format;
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
use std::cmp::min;
@@ -271,25 +270,6 @@ impl WalIngest {
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
let lsn_hex = {
use bytes::BufMut;
let mut bytes = BytesMut::new();
bytes.put_u64(lsn.0);
hex::encode(bytes.freeze())
};
let page_hex = {
use bytes::BufMut;
let mut page = BytesMut::new();
page.put_u32(blk.rnode_spcnode);
page.put_u32(blk.rnode_dbnode);
page.put_u32(blk.rnode_relnode);
page.put_u8(blk.forknum);
page.put_u32(blk.blkno);
hex::encode(page.freeze())
};
println!("wal-at-lsn-modified-page {} {}", lsn_hex, page_hex);
self.ingest_decoded_block(timeline, lsn, &decoded, blk)?;
}

View File

@@ -7,11 +7,13 @@ use std::collections::HashMap;
use tokio::io::{AsyncRead, AsyncWrite};
use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage, FeMessage as Fe};
// TODO rename the struct to ClientParams or something
/// Various client credentials which we use for authentication.
#[derive(Debug, PartialEq, Eq)]
pub struct ClientCredentials {
pub user: String,
pub dbname: String,
pub options: Option<String>,
}
impl TryFrom<HashMap<String, String>> for ClientCredentials {
@@ -25,9 +27,22 @@ impl TryFrom<HashMap<String, String>> for ClientCredentials {
};
let user = get_param("user")?;
let db = get_param("database")?;
let dbname = get_param("database")?;
Ok(Self { user, dbname: db })
// TODO see what other options should be recognized, possibly all.
let options = match get_param("search_path") {
Ok(path) => Some(format!("-c search_path={}", path)),
Err(_) => None,
};
// TODO investigate why "" is always a key
// TODO warn on unrecognized options?
Ok(Self {
user,
dbname,
options,
})
}
}
@@ -85,6 +100,7 @@ async fn handle_static(
dbname: creds.dbname.clone(),
user: creds.user.clone(),
password: Some(cleartext_password.into()),
options: creds.options,
};
client
@@ -117,15 +133,22 @@ async fn handle_existing_user(
.ok_or_else(|| anyhow!("unexpected password message"))?;
let cplane = CPlaneApi::new(&config.auth_endpoint);
let db_info = cplane
.authenticate_proxy_request(creds, md5_response, &md5_salt, &psql_session_id)
let db_info_response = cplane
.authenticate_proxy_request(&creds, md5_response, &md5_salt, &psql_session_id)
.await?;
client
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
Ok(db_info)
Ok(DatabaseInfo {
host: db_info_response.host,
port: db_info_response.port,
dbname: db_info_response.dbname,
user: db_info_response.user,
password: db_info_response.password,
options: creds.options,
})
}
async fn handle_new_user(
@@ -135,7 +158,7 @@ async fn handle_new_user(
let psql_session_id = new_psql_session_id();
let greeting = hello_message(&config.redirect_uri, &psql_session_id);
let db_info = cplane_api::with_waiter(psql_session_id, |waiter| async {
let db_info_response = cplane_api::with_waiter(psql_session_id, |waiter| async {
// Give user a URL to spawn a new database
client
.write_message_noflush(&Be::AuthenticationOk)?
@@ -150,7 +173,14 @@ async fn handle_new_user(
client.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?;
Ok(db_info)
Ok(DatabaseInfo {
host: db_info_response.host,
port: db_info_response.port,
dbname: db_info_response.dbname,
user: db_info_response.user,
password: db_info_response.password,
options: None,
})
}
fn hello_message(redirect_uri: &str, session_id: &str) -> String {

View File

@@ -10,6 +10,7 @@ pub struct DatabaseInfo {
pub dbname: String,
pub user: String,
pub password: Option<String>,
pub options: Option<String>,
}
impl DatabaseInfo {
@@ -33,6 +34,10 @@ impl From<DatabaseInfo> for tokio_postgres::Config {
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(options) = db_info.options {
config.options(&options);
}
if let Some(password) = db_info.password {
config.password(password);
}
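To make the effect of the new `options` field concrete: a `search_path` startup parameter from the client is turned into a `-c search_path=...` entry, which the conversion above passes to `tokio_postgres::Config::options`. A minimal sketch with made-up connection values (assumes the `tokio-postgres` crate):

    // Hedged sketch of how the parsed option would land on the compute connection config.
    fn example_config() -> tokio_postgres::Config {
        let search_path = "myschema"; // would come from the client's startup packet
        let options = format!("-c search_path={}", search_path);

        let mut config = tokio_postgres::Config::new();
        config
            .host("127.0.0.1") // invented compute node address
            .port(5432)
            .dbname("postgres")
            .user("zenith_admin")
            .options(&options);
        config
    }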

View File

@@ -1,25 +1,37 @@
use crate::auth::ClientCredentials;
use crate::compute::DatabaseInfo;
use crate::waiters::{Waiter, Waiters};
use anyhow::{anyhow, bail};
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
/// Part of the legacy cplane responses
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct DatabaseInfoResponse {
pub host: String,
pub port: u16,
pub dbname: String,
pub user: String,
pub password: Option<String>,
}
lazy_static! {
static ref CPLANE_WAITERS: Waiters<Result<DatabaseInfo, String>> = Default::default();
static ref CPLANE_WAITERS: Waiters<Result<DatabaseInfoResponse, String>> = Default::default();
}
/// Give caller an opportunity to wait for cplane's reply.
pub async fn with_waiter<F, R, T>(psql_session_id: impl Into<String>, f: F) -> anyhow::Result<T>
where
F: FnOnce(Waiter<'static, Result<DatabaseInfo, String>>) -> R,
F: FnOnce(Waiter<'static, Result<DatabaseInfoResponse, String>>) -> R,
R: std::future::Future<Output = anyhow::Result<T>>,
{
let waiter = CPLANE_WAITERS.register(psql_session_id.into())?;
f(waiter).await
}
pub fn notify(psql_session_id: &str, msg: Result<DatabaseInfo, String>) -> anyhow::Result<()> {
pub fn notify(
psql_session_id: &str,
msg: Result<DatabaseInfoResponse, String>,
) -> anyhow::Result<()> {
CPLANE_WAITERS.notify(psql_session_id, msg)
}
@@ -37,11 +49,11 @@ impl<'a> CPlaneApi<'a> {
impl CPlaneApi<'_> {
pub async fn authenticate_proxy_request(
&self,
creds: ClientCredentials,
creds: &ClientCredentials,
md5_response: &[u8],
salt: &[u8; 4],
psql_session_id: &str,
) -> anyhow::Result<DatabaseInfo> {
) -> anyhow::Result<DatabaseInfoResponse> {
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
url.query_pairs_mut()
.append_pair("login", &creds.user)
@@ -77,7 +89,7 @@ impl CPlaneApi<'_> {
#[derive(Serialize, Deserialize, Debug)]
#[serde(untagged)]
enum ProxyAuthResponse {
Ready { conn_info: DatabaseInfo },
Ready { conn_info: DatabaseInfoResponse },
Error { error: String },
NotReady { ready: bool }, // TODO: get rid of `ready`
}
@@ -92,13 +104,13 @@ mod tests {
// Ready
let auth: ProxyAuthResponse = serde_json::from_value(json!({
"ready": true,
"conn_info": DatabaseInfo::default(),
"conn_info": DatabaseInfoResponse::default(),
}))
.unwrap();
assert!(matches!(
auth,
ProxyAuthResponse::Ready {
conn_info: DatabaseInfo { .. }
conn_info: DatabaseInfoResponse { .. }
}
));

View File

@@ -1,4 +1,4 @@
use crate::{compute::DatabaseInfo, cplane_api};
use crate::cplane_api;
use anyhow::Context;
use serde::Deserialize;
use std::{
@@ -75,7 +75,7 @@ struct PsqlSessionResponse {
#[derive(Deserialize)]
enum PsqlSessionResult {
Success(DatabaseInfo),
Success(cplane_api::DatabaseInfoResponse),
Failure(String),
}

View File

@@ -1,4 +1,4 @@
use crate::auth;
use crate::auth::{self, ClientCredentials};
use crate::cancellation::{self, CancelClosure, CancelMap};
use crate::compute::DatabaseInfo;
use crate::config::{ProxyConfig, TlsConfig};
@@ -138,7 +138,6 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
stream.write_message(&Be::ErrorResponse(msg)).await?;
bail!(msg);
}
break Ok(Some((stream, params.try_into()?)));
}
CancelRequest(cancel_key_data) => {

View File

@@ -1,8 +1,8 @@
from contextlib import closing
from typing import Iterator
from uuid import UUID, uuid4
import psycopg2
from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithPageserverApiException
from requests.exceptions import HTTPError
import pytest
@@ -25,31 +25,25 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
ps.safe_psql("set FOO", password=tenant_token)
ps.safe_psql("set FOO", password=management_token)
new_timeline_id = env.zenith_cli.create_branch('test_pageserver_auth',
tenant_id=env.initial_tenant)
# tenant can create branches
tenant_http_client.timeline_create(tenant_id=env.initial_tenant,
ancestor_timeline_id=new_timeline_id)
tenant_http_client.branch_create(env.initial_tenant, 'new1', 'main')
# console can create branches for tenant
management_http_client.timeline_create(tenant_id=env.initial_tenant,
ancestor_timeline_id=new_timeline_id)
management_http_client.branch_create(env.initial_tenant, 'new2', 'main')
# fail to create branch using token with different tenant_id
with pytest.raises(ZenithPageserverApiException,
match='Forbidden: Tenant id mismatch. Permission denied'):
invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant,
ancestor_timeline_id=new_timeline_id)
invalid_tenant_http_client.branch_create(env.initial_tenant, "new3", "main")
# create tenant using management token
management_http_client.tenant_create()
management_http_client.tenant_create(uuid4())
# fail to create tenant using tenant token
with pytest.raises(
ZenithPageserverApiException,
match='Forbidden: Attempt to access management api with tenant scope. Permission denied'
):
tenant_http_client.tenant_create()
tenant_http_client.tenant_create(uuid4())
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
@@ -59,8 +53,9 @@ def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_w
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
branch = f'test_compute_auth_to_pageserver{with_wal_acceptors}'
env.zenith_cli.create_branch(branch)
branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
env.zenith_cli.create_branch(branch, "main")
pg = env.postgres.create_start(branch)
with closing(pg.connect()) as conn:

View File

@@ -95,7 +95,7 @@ def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
# Create a branch for us
env.zenith_cli.create_branch('test_backpressure')
env.zenith_cli.create_branch("test_backpressure", "main")
pg = env.postgres.create_start('test_backpressure',
config_lines=['max_replication_write_lag=30MB'])

View File

@@ -22,7 +22,8 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
env = zenith_env_builder.init_start()
# Branch at the point where only 100 rows were inserted
env.zenith_cli.create_branch('test_branch_behind')
env.zenith_cli.create_branch("test_branch_behind", "main")
pgmain = env.postgres.create_start('test_branch_behind')
log.info("postgres is running on 'test_branch_behind' branch")
@@ -59,9 +60,7 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
log.info(f'LSN after 200100 rows: {lsn_b}')
# Branch at the point where only 100 rows were inserted
env.zenith_cli.create_branch('test_branch_behind_hundred',
'test_branch_behind',
ancestor_start_lsn=lsn_a)
env.zenith_cli.create_branch("test_branch_behind_hundred", "test_branch_behind@" + lsn_a)
# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute('''
@@ -76,12 +75,10 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
log.info(f'LSN after 400100 rows: {lsn_c}')
# Branch at the point where only 200100 rows were inserted
env.zenith_cli.create_branch('test_branch_behind_more',
'test_branch_behind',
ancestor_start_lsn=lsn_b)
env.zenith_cli.create_branch("test_branch_behind_more", "test_branch_behind@" + lsn_b)
pg_hundred = env.postgres.create_start('test_branch_behind_hundred')
pg_more = env.postgres.create_start('test_branch_behind_more')
pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
pg_more = env.postgres.create_start("test_branch_behind_more")
# On the 'hundred' branch, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -102,23 +99,19 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
# Check bad lsn's for branching
# branch at segment boundary
env.zenith_cli.create_branch('test_branch_segment_boundary',
'test_branch_behind',
ancestor_start_lsn="0/3000000")
pg = env.postgres.create_start('test_branch_segment_boundary')
env.zenith_cli.create_branch("test_branch_segment_boundary", "test_branch_behind@0/3000000")
pg = env.postgres.create_start("test_branch_segment_boundary")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# branch at pre-initdb lsn
with pytest.raises(Exception, match="invalid branch start lsn"):
env.zenith_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42")
env.zenith_cli.create_branch("test_branch_preinitdb", "main@0/42")
# branch at pre-ancestor lsn
with pytest.raises(Exception, match="less than timeline ancestor lsn"):
env.zenith_cli.create_branch('test_branch_preinitdb',
'test_branch_behind',
ancestor_start_lsn="0/42")
env.zenith_cli.create_branch("test_branch_preinitdb", "test_branch_behind@0/42")
# check that we cannot create branch based on garbage collected data
with closing(env.pageserver.connect()) as psconn:
@@ -130,9 +123,7 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
with pytest.raises(Exception, match="invalid branch start lsn"):
# this gced_lsn is pretty random, so if gc is disabled this wouldn't fail
env.zenith_cli.create_branch('test_branch_create_fail',
'test_branch_behind',
ancestor_start_lsn=gced_lsn)
env.zenith_cli.create_branch("test_branch_create_fail", f"test_branch_behind@{gced_lsn}")
# check that after gc everything is still there
hundred_cur.execute('SELECT count(*) FROM foo')
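The hunks above replace the separate ancestor/LSN keyword arguments with a single starting-point string. A minimal hypothetical test, written against the same fixtures, showing the two spellings of that argument (the LSN value is illustrative only):

from fixtures.zenith_fixtures import ZenithEnv

def test_branch_spec_sketch(zenith_simple_env: ZenithEnv):
    env = zenith_simple_env
    # branch from the current head of 'main'
    env.zenith_cli.create_branch("sketch_at_head", "main")
    # branch from 'main' as of a specific LSN, using the "<branch>@<lsn>" form
    env.zenith_cli.create_branch("sketch_at_lsn", "main@0/3000000")
    pg = env.postgres.create_start("sketch_at_lsn")
    cur = pg.connect().cursor()
    cur.execute('SELECT 1')
    assert cur.fetchone() == (1, )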

View File

@@ -12,7 +12,7 @@ from fixtures.log_helper import log
#
def test_clog_truncate(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch('test_clog_truncate', 'empty')
env.zenith_cli.create_branch("test_clog_truncate", "empty")
# set aggressive autovacuum to make sure that truncation will happen
config = [
@@ -62,9 +62,9 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv):
# create new branch after clog truncation and start a compute node on it
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
env.zenith_cli.create_branch('test_clog_truncate_new',
'test_clog_truncate',
ancestor_start_lsn=lsn_after_truncation)
env.zenith_cli.create_branch("test_clog_truncate_new",
"test_clog_truncate@" + lsn_after_truncation)
pg2 = env.postgres.create_start('test_clog_truncate_new')
log.info('postgres is running on test_clog_truncate_new branch')

View File

@@ -11,7 +11,7 @@ from fixtures.log_helper import log
#
def test_createdb(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch('test_createdb', 'empty')
env.zenith_cli.create_branch("test_createdb", "empty")
pg = env.postgres.create_start('test_createdb')
log.info("postgres is running on 'test_createdb' branch")
@@ -27,7 +27,8 @@ def test_createdb(zenith_simple_env: ZenithEnv):
lsn = cur.fetchone()[0]
# Create a branch
env.zenith_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn)
env.zenith_cli.create_branch("test_createdb2", "test_createdb@" + lsn)
pg2 = env.postgres.create_start('test_createdb2')
# Test that you can connect to the new database on both branches
@@ -40,7 +41,8 @@ def test_createdb(zenith_simple_env: ZenithEnv):
#
def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
env.zenith_cli.create_branch('test_dropdb', 'empty')
env.zenith_cli.create_branch("test_dropdb", "empty")
pg = env.postgres.create_start('test_dropdb')
log.info("postgres is running on 'test_dropdb' branch")
@@ -64,14 +66,10 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
lsn_after_drop = cur.fetchone()[0]
# Create two branches before and after database drop.
env.zenith_cli.create_branch('test_before_dropdb',
'test_dropdb',
ancestor_start_lsn=lsn_before_drop)
env.zenith_cli.create_branch("test_before_dropdb", "test_dropdb@" + lsn_before_drop)
pg_before = env.postgres.create_start('test_before_dropdb')
env.zenith_cli.create_branch('test_after_dropdb',
'test_dropdb',
ancestor_start_lsn=lsn_after_drop)
env.zenith_cli.create_branch("test_after_dropdb", "test_dropdb@" + lsn_after_drop)
pg_after = env.postgres.create_start('test_after_dropdb')
# Test that database exists on the branch before drop

View File

@@ -9,7 +9,8 @@ from fixtures.log_helper import log
#
def test_createuser(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch('test_createuser', 'empty')
env.zenith_cli.create_branch("test_createuser", "empty")
pg = env.postgres.create_start('test_createuser')
log.info("postgres is running on 'test_createuser' branch")
@@ -24,7 +25,8 @@ def test_createuser(zenith_simple_env: ZenithEnv):
lsn = cur.fetchone()[0]
# Create a branch
env.zenith_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn)
env.zenith_cli.create_branch("test_createuser2", "test_createuser@" + lsn)
pg2 = env.postgres.create_start('test_createuser2')
# Test that you can connect to new branch as a new user

View File

@@ -10,7 +10,7 @@ from fixtures.log_helper import log
#
def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
env.zenith_cli.create_branch('test_multixact', 'empty')
env.zenith_cli.create_branch("test_multixact", "empty")
pg = env.postgres.create_start('test_multixact')
log.info("postgres is running on 'test_multixact' branch")
@@ -60,7 +60,7 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
assert int(next_multixact_id) > int(next_multixact_id_old)
# Branch at this point
env.zenith_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn)
env.zenith_cli.create_branch("test_multixact_new", "test_multixact@" + lsn)
pg_new = env.postgres.create_start('test_multixact_new')
log.info("postgres is running on 'test_multixact_new' branch")

View File

@@ -23,26 +23,22 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID):
client.tenant_create(tenant_id)
assert tenant_id.hex in {t['id'] for t in client.tenant_list()}
timelines = client.timeline_list(tenant_id)
assert len(timelines) == 0, "initial tenant should not have any timelines"
# create timeline
timeline_id = uuid4()
client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id)
# check its timelines
timelines = client.timeline_list(tenant_id)
assert len(timelines) > 0
# check it is there
assert timeline_id.hex in {b['timeline_id'] for b in client.timeline_list(tenant_id)}
for timeline in timelines:
timeline_id_str = str(timeline['timeline_id'])
timeline_details = client.timeline_detail(tenant_id=tenant_id,
timeline_id=UUID(timeline_id_str))
assert timeline_details['kind'] == 'Local'
for timeline_id_str in timelines:
timeline_details = client.timeline_detail(tenant_id, UUID(timeline_id_str))
assert timeline_details['type'] == 'Local'
assert timeline_details['tenant_id'] == tenant_id.hex
assert timeline_details['timeline_id'] == timeline_id_str
# create branch
branch_name = uuid4().hex
client.branch_create(tenant_id, branch_name, "main")
# check it is there
assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
env = zenith_simple_env

View File

@@ -16,7 +16,7 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down')
env.zenith_cli.create_branch("test_pageserver_catchup_while_compute_down", "main")
pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down')
pg_conn = pg.connect()

View File

@@ -15,7 +15,7 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_pageserver_restart')
env.zenith_cli.create_branch("test_pageserver_restart", "main")
pg = env.postgres.create_start('test_pageserver_restart')
pg_conn = pg.connect()

View File

@@ -1,5 +1,7 @@
from io import BytesIO
import asyncio
import asyncpg
import subprocess
from fixtures.zenith_fixtures import ZenithEnv, Postgres
from fixtures.log_helper import log

View File

@@ -5,7 +5,6 @@ def test_proxy_select_1(static_proxy):
static_proxy.safe_psql("select 1;")
@pytest.mark.xfail # Proxy eats the extra connection options
def test_proxy_options(static_proxy):
schema_name = "tmp_schema_1"
with static_proxy.connect(schema=schema_name) as conn:

View File

@@ -11,7 +11,8 @@ from fixtures.zenith_fixtures import ZenithEnv
#
def test_readonly_node(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch('test_readonly_node', 'empty')
env.zenith_cli.create_branch("test_readonly_node", "empty")
pgmain = env.postgres.create_start('test_readonly_node')
log.info("postgres is running on 'test_readonly_node' branch")
@@ -52,14 +53,12 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
log.info('LSN after 400100 rows: ' + lsn_c)
# Create first read-only node at the point where only 100 rows were inserted
pg_hundred = env.postgres.create_start(branch_name='test_readonly_node',
node_name='test_readonly_node_hundred',
lsn=lsn_a)
pg_hundred = env.postgres.create_start("test_readonly_node_hundred",
branch=f'test_readonly_node@{lsn_a}')
# And another at the point where 200100 rows were inserted
pg_more = env.postgres.create_start(branch_name='test_readonly_node',
node_name='test_readonly_node_more',
lsn=lsn_b)
pg_more = env.postgres.create_start("test_readonly_node_more",
branch=f'test_readonly_node@{lsn_b}')
# On the 'hundred' node, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -78,9 +77,8 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
assert main_cur.fetchone() == (400100, )
# Check creating a node at segment boundary
pg = env.postgres.create_start(branch_name='test_readonly_node',
node_name='test_branch_segment_boundary',
lsn='0/3000000')
pg = env.postgres.create_start("test_branch_segment_boundary",
branch="test_readonly_node@0/3000000")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
@@ -88,6 +86,5 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
# Create node at pre-initdb lsn
with pytest.raises(Exception, match="invalid basebackup lsn"):
# compute node startup with invalid LSN should fail
env.postgres.create_start(branch_name='test_readonly_node',
node_name='test_readonly_node_preinitdb',
lsn='0/42')
env.zenith_cli.pg_start("test_readonly_node_preinitdb",
timeline_spec="test_readonly_node@0/42")

View File

@@ -43,7 +43,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
##### First start, insert secret data and upload it to the remote storage
env = zenith_env_builder.init_start()
pg = env.postgres.create_start('main')
pg = env.postgres.create_start()
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
@@ -85,7 +85,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id))
assert timeline_details['timeline_id'] == timeline_id
assert timeline_details['tenant_id'] == tenant_id
if timeline_details['kind'] == 'Local':
if timeline_details['type'] == 'Local':
log.info("timeline downloaded, checking its data")
break
attempts += 1
@@ -94,7 +94,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
log.debug("still waiting")
time.sleep(1)
pg = env.postgres.create_start('main')
pg = env.postgres.create_start()
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};')

View File

@@ -15,7 +15,8 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_restart_compute')
env.zenith_cli.create_branch("test_restart_compute", "main")
pg = env.postgres.create_start('test_restart_compute')
log.info("postgres is running on 'test_restart_compute' branch")

View File

@@ -127,14 +127,16 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
# create folder for remote storage mock
remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage'
tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
tenant = env.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
log.info("tenant to relocate %s", tenant)
env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant)
env.zenith_cli.create_branch("test_tenant_relocation", "main", tenant_id=tenant)
tenant_pg = env.postgres.create_start(branch_name='main',
node_name='test_tenant_relocation',
tenant_id=tenant)
tenant_pg = env.postgres.create_start(
"test_tenant_relocation",
"main", # branch name, None means same as node name
tenant_id=tenant,
)
# insert some data
with closing(tenant_pg.connect()) as conn:

View File

@@ -12,21 +12,25 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce
env = zenith_env_builder.init_start()
"""Tests tenants with and without wal acceptors"""
tenant_1 = env.zenith_cli.create_tenant()
tenant_2 = env.zenith_cli.create_tenant()
tenant_1 = env.create_tenant()
tenant_2 = env.create_tenant()
env.zenith_cli.create_timeline(
f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_1)
env.zenith_cli.create_timeline(
f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_2)
env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
tenant_id=tenant_1)
env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
tenant_id=tenant_2)
pg_tenant1 = env.postgres.create_start(
f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}',
tenant_id=tenant_1,
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_1,
)
pg_tenant2 = env.postgres.create_start(
f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}',
tenant_id=tenant_2,
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_2,
)
for pg in [pg_tenant1, pg_tenant2]:

View File

@@ -10,10 +10,10 @@ import time
def test_timeline_size(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Branch at the point where only 100 rows were inserted
new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty')
env.zenith_cli.create_branch("test_timeline_size", "empty")
client = env.pageserver.http_client()
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
res = client.branch_detail(env.initial_tenant, "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
pgmain = env.postgres.create_start("test_timeline_size")
@@ -31,11 +31,11 @@ def test_timeline_size(zenith_simple_env: ZenithEnv):
FROM generate_series(1, 10) g
""")
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
res = client.branch_detail(env.initial_tenant, "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
cur.execute("TRUNCATE foo")
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
res = client.branch_detail(env.initial_tenant, "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
@@ -68,16 +68,17 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60
def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota')
env.zenith_cli.create_branch("test_timeline_size_quota", "main")
client = env.pageserver.http_client()
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
res = client.branch_detail(env.initial_tenant, "test_timeline_size_quota")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
pgmain = env.postgres.create_start(
"test_timeline_size_quota",
# Set small limit for the test
config_lines=['zenith.max_cluster_size=30MB'])
config_lines=['zenith.max_cluster_size=30MB'],
)
log.info("postgres is running on 'test_timeline_size_quota' branch")
with closing(pgmain.connect()) as conn:

View File

@@ -10,6 +10,7 @@ from fixtures.log_helper import log
def test_twophase(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_twophase", "empty")
pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
log.info("postgres is running on 'test_twophase' branch")

View File

@@ -13,7 +13,7 @@ from dataclasses import dataclass, field
from multiprocessing import Process, Value
from pathlib import Path
from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol
from fixtures.utils import lsn_to_hex, mkdir_if_needed, lsn_from_hex
from fixtures.utils import lsn_to_hex, mkdir_if_needed
from fixtures.log_helper import log
from typing import List, Optional, Any
@@ -24,7 +24,8 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_wal_acceptors_normal_work')
env.zenith_cli.create_branch("test_wal_acceptors_normal_work", "main")
pg = env.postgres.create_start('test_wal_acceptors_normal_work')
with closing(pg.connect()) as conn:
@@ -38,9 +39,9 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
@dataclass
class TimelineMetrics:
timeline_id: str
last_record_lsn: int
class BranchMetrics:
name: str
latest_valid_lsn: int
# One entry per each Safekeeper, order is the same
flush_lsns: List[int] = field(default_factory=list)
commit_lsns: List[int] = field(default_factory=list)
@@ -54,32 +55,23 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
n_timelines = 3
branch_names = [
"test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)
]
# pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418')
# that's not really human readable, so the branch names are introduced in Zenith CLI.
# Zenith CLI stores its branch <-> timeline mapping in its internals,
# but we need this to collect metrics from other servers, related to the timeline.
branch_names_to_timeline_ids = {}
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
# start postgres on each timeline
pgs = []
for branch_name in branch_names:
new_timeline_id = env.zenith_cli.create_branch(branch_name)
pgs.append(env.postgres.create_start(branch_name))
branch_names_to_timeline_ids[branch_name] = new_timeline_id
for branch in branches:
env.zenith_cli.create_branch(branch, "main")
pgs.append(env.postgres.create_start(branch))
tenant_id = env.initial_tenant
def collect_metrics(message: str) -> List[TimelineMetrics]:
def collect_metrics(message: str) -> List[BranchMetrics]:
with env.pageserver.http_client() as pageserver_http:
timeline_details = [
pageserver_http.timeline_detail(
tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name])
for branch_name in branch_names
branch_details = [
pageserver_http.branch_detail(tenant_id=tenant_id, name=branch)
for branch in branches
]
# All changes visible to pageserver (last_record_lsn) should be
# All changes visible to pageserver (latest_valid_lsn) should be
# confirmed by safekeepers first. As we cannot atomically get
# state of both pageserver and safekeepers, we should start with
# pageserver. Looking at outdated data from pageserver is ok.
@@ -88,14 +80,14 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
# safekeepers' state, it will look contradictory.
sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers]
timeline_metrics = []
branch_metrics = []
with env.pageserver.http_client() as pageserver_http:
for timeline_detail in timeline_details:
timeline_id: str = timeline_detail["timeline_id"]
for branch_detail in branch_details:
timeline_id: str = branch_detail["timeline_id"]
m = TimelineMetrics(
timeline_id=timeline_id,
last_record_lsn=lsn_from_hex(timeline_detail["last_record_lsn"]),
m = BranchMetrics(
name=branch_detail["name"],
latest_valid_lsn=branch_detail["latest_valid_lsn"],
)
for sk_m in sk_metrics:
m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)])
@@ -107,13 +99,13 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
# We only call collect_metrics() after a transaction is confirmed by
# the compute node, which only happens after a consensus of safekeepers
# has confirmed the transaction. We assume majority consensus here.
assert (2 * sum(m.last_record_lsn <= lsn
assert (2 * sum(m.latest_valid_lsn <= lsn
for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers)
assert (2 * sum(m.last_record_lsn <= lsn
assert (2 * sum(m.latest_valid_lsn <= lsn
for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers)
timeline_metrics.append(m)
log.info(f"{message}: {timeline_metrics}")
return timeline_metrics
branch_metrics.append(m)
log.info(f"{message}: {branch_metrics}")
return branch_metrics
# TODO: https://github.com/zenithdb/zenith/issues/809
# collect_metrics("before CREATE TABLE")
@@ -125,7 +117,7 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
init_m = collect_metrics("after CREATE TABLE")
# Populate data for 2/3 timelines
# Populate data for 2/3 branches
class MetricsChecker(threading.Thread):
def __init__(self) -> None:
super().__init__(daemon=True)
@@ -163,15 +155,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
collect_metrics("after INSERT INTO")
# Check data for 2/3 timelines
# Check data for 2/3 branches
for pg in pgs[:-1]:
res = pg.safe_psql("SELECT sum(key) FROM t")
assert res[0] == (5000050000, )
final_m = collect_metrics("after SELECT")
# Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly.
# Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly.
# Also assume that safekeepers will not be significantly out of sync in this test.
middle_lsn = (init_m[0].last_record_lsn + final_m[0].last_record_lsn) // 2
middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2
assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns)
assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns)
assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns)
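A tiny numeric illustration (values made up) of why the midpoint check above holds when the INSERTs advance the LSN far enough and the safekeepers stay roughly in sync:

init_latest_valid_lsn = 100            # pageserver view right after CREATE TABLE
final_latest_valid_lsn = 900           # pageserver view after the big INSERT
init_flush_lsns = [100, 105, 110]      # per-safekeeper flush LSNs at the same moments
final_flush_lsns = [900, 905, 910]
middle_lsn = (init_latest_valid_lsn + final_latest_valid_lsn) // 2   # 500
assert max(init_flush_lsns) < middle_lsn < min(final_flush_lsns)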
@@ -191,7 +183,7 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = n_acceptors
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_wal_acceptors_restarts')
env.zenith_cli.create_branch("test_wal_acceptors_restarts", "main")
pg = env.postgres.create_start('test_wal_acceptors_restarts')
# we rely upon autocommit after each statement
@@ -228,7 +220,7 @@ def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 2
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_wal_acceptors_unavailability')
env.zenith_cli.create_branch("test_wal_acceptors_unavailability", "main")
pg = env.postgres.create_start('test_wal_acceptors_unavailability')
# we rely upon autocommit after each statement
@@ -299,7 +291,7 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_wal_acceptors_race_conditions')
env.zenith_cli.create_branch("test_wal_acceptors_race_conditions", "main")
pg = env.postgres.create_start('test_wal_acceptors_race_conditions')
# we rely upon autocommit after each statement
@@ -464,7 +456,7 @@ def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_timeline_status')
env.zenith_cli.create_branch("test_timeline_status", "main")
pg = env.postgres.create_start('test_timeline_status')
wa = env.safekeepers[0]
@@ -638,7 +630,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 4
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_replace_safekeeper')
env.zenith_cli.create_branch("test_replace_safekeeper", "main")
log.info("Use only first 3 safekeepers")
env.safekeepers[3].stop()

View File

@@ -202,7 +202,7 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_wal_acceptors_restarts_under_load')
env.zenith_cli.create_branch("test_wal_acceptors_restarts_under_load", "main")
pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load')
asyncio.run(run_restarts_under_load(pg, env.safekeepers))

View File

@@ -7,46 +7,52 @@ from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserv
from typing import cast
def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpClient,
env: ZenithEnv,
initial_tenant: uuid.UUID):
def helper_compare_branch_list(pageserver_http_client: ZenithPageserverHttpClient,
env: ZenithEnv,
initial_tenant: uuid.UUID):
"""
Compare timelines list returned by CLI and directly via API.
Filters out timelines created by other tests.
Compare branches list returned by CLI and directly via API.
Filters out branches created by other tests.
"""
branches = pageserver_http_client.branch_list(initial_tenant)
branches_api = sorted(map(lambda b: cast(str, b['name']), branches))
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]
timelines_api = sorted(
map(lambda t: cast(str, t['timeline_id']),
pageserver_http_client.timeline_list(initial_tenant)))
res = env.zenith_cli.list_branches()
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
timelines_cli = env.zenith_cli.list_timelines()
assert timelines_cli == env.zenith_cli.list_timelines(initial_tenant)
res = env.zenith_cli.list_branches(tenant_id=initial_tenant)
branches_cli_with_tenant_arg = sorted(
map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli_with_tenant_arg = [
b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')
]
cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli])
assert timelines_api == cli_timeline_ids
assert branches_api == branches_cli == branches_cli_with_tenant_arg
def test_cli_timeline_list(zenith_simple_env: ZenithEnv):
def test_cli_branch_list(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
pageserver_http_client = env.pageserver.http_client()
# Initial sanity check
helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant)
# Create a branch for us
main_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_main')
helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant)
helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant)
env.zenith_cli.create_branch("test_cli_branch_list_main", "empty")
helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant)
# Create a nested branch
nested_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_nested',
'test_cli_branch_list_main')
helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant)
res = env.zenith_cli.create_branch("test_cli_branch_list_nested", "test_cli_branch_list_main")
assert res.stderr == ''
helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant)
# Check that all new branches are visible via CLI
timelines_cli = [timeline_id for (_, timeline_id) in env.zenith_cli.list_timelines()]
res = env.zenith_cli.list_branches()
assert res.stderr == ''
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
assert main_timeline_id.hex in timelines_cli
assert nested_timeline_id.hex in timelines_cli
assert 'test_cli_branch_list_main' in branches_cli
assert 'test_cli_branch_list_nested' in branches_cli
def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClient, env: ZenithEnv):
@@ -54,6 +60,7 @@ def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClien
tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants))
res = env.zenith_cli.list_tenants()
assert res.stderr == ''
tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert tenants_api == tenants_cli
@@ -66,13 +73,15 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
helper_compare_tenant_list(pageserver_http_client, env)
# Create new tenant
tenant1 = env.zenith_cli.create_tenant()
tenant1 = uuid.uuid4()
env.zenith_cli.create_tenant(tenant1)
# check tenant1 appeared
helper_compare_tenant_list(pageserver_http_client, env)
# Create new tenant
tenant2 = env.zenith_cli.create_tenant()
tenant2 = uuid.uuid4()
env.zenith_cli.create_tenant(tenant2)
# check tenant2 appeared
helper_compare_tenant_list(pageserver_http_client, env)

View File

@@ -64,8 +64,9 @@ class ZenithCompare(PgCompare):
self._pg_bin = pg_bin
# We only use one branch and one timeline
self.env.zenith_cli.create_branch(branch_name, 'empty')
self._pg = self.env.postgres.create_start(branch_name)
self.branch = branch_name
self.env.zenith_cli.create_branch(self.branch, "empty")
self._pg = self.env.postgres.create_start(self.branch)
self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0]
# Long-lived cursor, useful for flushing

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from dataclasses import field
from dataclasses import dataclass, field
import textwrap
from cached_property import cached_property
import asyncpg
@@ -29,6 +29,7 @@ from dataclasses import dataclass
from psycopg2.extensions import connection as PgConnection
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple
from typing_extensions import Literal
import pytest
import requests
import backoff # type: ignore
@@ -57,7 +58,6 @@ Fn = TypeVar('Fn', bound=Callable[..., Any])
DEFAULT_OUTPUT_DIR = 'test_output'
DEFAULT_POSTGRES_DIR = 'tmp_install'
DEFAULT_BRANCH_NAME = 'main'
BASE_PORT = 15000
WORKER_PORT_NUM = 100
@@ -219,7 +219,7 @@ def can_bind(host: str, port: int) -> bool:
class PortDistributor:
def __init__(self, base_port: int, port_number: int):
def __init__(self, base_port: int, port_number: int) -> None:
self.iterator = iter(range(base_port, base_port + port_number))
def get_port(self) -> int:
@@ -424,8 +424,7 @@ class ZenithEnvBuilder:
pageserver_config_override: Optional[str] = None,
num_safekeepers: int = 0,
pageserver_auth_enabled: bool = False,
rust_log_override: Optional[str] = None,
default_branch_name=DEFAULT_BRANCH_NAME):
rust_log_override: Optional[str] = None):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
self.port_distributor = port_distributor
@@ -433,7 +432,6 @@ class ZenithEnvBuilder:
self.pageserver_config_override = pageserver_config_override
self.num_safekeepers = num_safekeepers
self.pageserver_auth_enabled = pageserver_auth_enabled
self.default_branch_name = default_branch_name
self.env: Optional[ZenithEnv] = None
self.s3_mock_server: Optional[MockS3Server] = None
@@ -538,7 +536,7 @@ class ZenithEnv:
initial_tenant - tenant ID of the initial tenant created in the repository
zenith_cli - can be used to run the 'zenith' CLI tool
zenith_cli() - zenith_cli() can be used to run the 'zenith' CLI tool
create_tenant() - initializes a new tenant in the page server, returns
the tenant id
@@ -549,7 +547,9 @@ class ZenithEnv:
self.port_distributor = config.port_distributor
self.s3_mock_server = config.s3_mock_server
self.zenith_cli = ZenithCli(env=self)
self.postgres = PostgresFactory(self)
self.safekeepers: List[Safekeeper] = []
# generate initial tenant ID here instead of letting 'zenith init' generate it,
@@ -558,7 +558,7 @@ class ZenithEnv:
# Create a config file corresponding to the options
toml = textwrap.dedent(f"""
default_tenant_id = '{self.initial_tenant.hex}'
default_tenantid = '{self.initial_tenant.hex}'
""")
# Create config for pageserver
@@ -600,6 +600,7 @@ class ZenithEnv:
self.safekeepers.append(safekeeper)
log.info(f"Config: {toml}")
self.zenith_cli.init(toml)
def start(self):
@@ -613,14 +614,11 @@ class ZenithEnv:
""" Get list of safekeeper endpoints suitable for wal_acceptors GUC """
return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers])
def run_psbench(self, timeline):
ps_log_filename = os.path.join(self.repo_dir, "pageserver.log")
ps_connstr = self.pageserver.connstr()
psbench_binpath = os.path.join(str(zenith_binpath), 'psbench')
tenant_hex = self.initial_tenant.hex
print("AAAAAAAA", ps_connstr)
args = [psbench_binpath, ps_log_filename, ps_connstr, tenant_hex, timeline]
subprocess.run(args)
def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
if tenant_id is None:
tenant_id = uuid.uuid4()
self.zenith_cli.create_tenant(tenant_id)
return tenant_id
@cached_property
def auth_keys(self) -> AuthKeys:
@@ -645,11 +643,13 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]:
shutil.rmtree(repo_dir, ignore_errors=True)
with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder:
env = builder.init_start()
# For convenience in tests, create a branch from the freshly-initialized cluster.
env.zenith_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME)
env.zenith_cli.create_branch("empty", "main")
# Return the builder to the caller
yield env
@@ -698,7 +698,7 @@ class ZenithPageserverApiException(Exception):
class ZenithPageserverHttpClient(requests.Session):
def __init__(self, port: int, auth_token: Optional[str] = None):
def __init__(self, port: int, auth_token: Optional[str] = None) -> None:
super().__init__()
self.port = port
self.auth_token = auth_token
@@ -721,36 +721,38 @@ class ZenithPageserverHttpClient(requests.Session):
def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/attach",
)
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/attach", )
self.verbose_error(res)
def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/detach",
)
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/detach", )
self.verbose_error(res)
def timeline_create(
self,
tenant_id: uuid.UUID,
new_timeline_id: Optional[uuid.UUID] = None,
ancestor_timeline_id: Optional[uuid.UUID] = None,
ancestor_start_lsn: Optional[str] = None,
) -> Dict[Any, Any]:
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline",
def branch_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def branch_create(self, tenant_id: uuid.UUID, name: str, start_point: str) -> Dict[Any, Any]:
res = self.post(f"http://localhost:{self.port}/v1/branch",
json={
'new_timeline_id':
new_timeline_id.hex if new_timeline_id else None,
'ancestor_start_lsn':
ancestor_start_lsn,
'ancestor_timeline_id':
ancestor_timeline_id.hex if ancestor_timeline_id else None,
'tenant_id': tenant_id.hex,
'name': name,
'start_point': start_point,
})
self.verbose_error(res)
if res.status_code == 409:
raise Exception(f'could not create timeline: already exists for id {new_timeline_id}')
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict[Any, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}?include-non-incremental-logical-size=1",
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
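A hypothetical usage sketch of the branch endpoints defined above (the port value and branch name are assumptions, and a pageserver must be listening for this to run):

import uuid
from fixtures.zenith_fixtures import ZenithPageserverHttpClient

client = ZenithPageserverHttpClient(port=9898)   # port is an assumption
tenant_id = uuid.uuid4()
client.tenant_create(tenant_id)
client.branch_create(tenant_id, "sketch_branch", "main")
assert "sketch_branch" in {b['name'] for b in client.branch_list(tenant_id)}
detail = client.branch_detail(tenant_id, "sketch_branch")
assert detail['current_logical_size'] == detail['current_logical_size_non_incremental']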
@@ -762,22 +764,18 @@ class ZenithPageserverHttpClient(requests.Session):
assert isinstance(res_json, list)
return res_json
def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
def tenant_create(self, tenant_id: uuid.UUID):
res = self.post(
f"http://localhost:{self.port}/v1/tenant",
json={
'new_tenant_id': new_tenant_id.hex if new_tenant_id else None,
'tenant_id': tenant_id.hex,
},
)
self.verbose_error(res)
if res.status_code == 409:
raise Exception(f'could not create tenant: already exists for id {new_tenant_id}')
new_tenant_id = res.json()
assert isinstance(new_tenant_id, str)
return uuid.UUID(new_tenant_id)
return res.json()
def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
def timeline_list(self, tenant_id: uuid.UUID) -> List[str]:
res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
@@ -785,8 +783,7 @@ class ZenithPageserverHttpClient(requests.Session):
def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1"
)
f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
@@ -820,124 +817,54 @@ class S3Storage:
RemoteStorage = Union[LocalFsStorage, S3Storage]
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
re.MULTILINE)
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
re.MULTILINE)
TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]",
re.MULTILINE)
class ZenithCli:
"""
A typed wrapper around the `zenith` CLI tool.
Supports main commands via typed methods and a way to run arbitrary command directly via CLI.
"""
def __init__(self, env: ZenithEnv):
def __init__(self, env: ZenithEnv) -> None:
self.env = env
pass
def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
"""
Creates a new tenant, returns its id and its initial timeline's id.
"""
if tenant_id is None:
tenant_id = uuid.uuid4()
res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex])
res.check_returncode()
self.raw_cli(['tenant', 'create', tenant_id.hex])
return tenant_id
def list_tenants(self) -> 'subprocess.CompletedProcess[str]':
res = self.raw_cli(['tenant', 'list'])
res.check_returncode()
return res
def create_timeline(self,
new_branch_name: str,
tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
cmd = [
'timeline',
'create',
'--branch-name',
new_branch_name,
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
]
res = self.raw_cli(cmd)
res.check_returncode()
matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
created_timeline_id = None
if matches is not None:
created_timeline_id = matches.group('timeline_id')
return uuid.UUID(created_timeline_id)
return self.raw_cli(['tenant', 'list'])
def create_branch(self,
new_branch_name: str = DEFAULT_BRANCH_NAME,
ancestor_branch_name: Optional[str] = None,
tenant_id: Optional[uuid.UUID] = None,
ancestor_start_lsn: Optional[str] = None) -> uuid.UUID:
cmd = [
'timeline',
'branch',
'--branch-name',
new_branch_name,
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
]
if ancestor_branch_name is not None:
cmd.extend(['--ancestor-branch-name', ancestor_branch_name])
if ancestor_start_lsn is not None:
cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn])
branch_name: str,
starting_point: str,
tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]':
args = ['branch']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
args.extend([branch_name, starting_point])
res = self.raw_cli(cmd)
res.check_returncode()
return self.raw_cli(args)
matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
def list_branches(self,
tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]':
args = ['branch']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
return self.raw_cli(args)
created_timeline_id = None
if matches is not None:
created_timeline_id = matches.group('timeline_id')
if created_timeline_id is None:
raise Exception('could not find timeline id after `zenith timeline create` invocation')
else:
return uuid.UUID(created_timeline_id)
def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[Tuple[str, str]]:
"""
Returns a list of (branch_name, timeline_id) tuples out of parsed `zenith timeline list` CLI output.
"""
# (L) main [b49f7954224a0ad25cc0013ea107b54b]
# (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540]
res = self.raw_cli(
['timeline', 'list', '--tenant-id', (tenant_id or self.env.initial_tenant).hex])
timelines_cli = sorted(
map(lambda branch_and_id: (branch_and_id[0], branch_and_id[1]),
TIMELINE_DATA_EXTRACTOR.findall(res.stdout)))
return timelines_cli
def init(self,
config_toml: str,
initial_timeline_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]':
def init(self, config_toml: str) -> 'subprocess.CompletedProcess[str]':
with tempfile.NamedTemporaryFile(mode='w+') as tmp:
tmp.write(config_toml)
tmp.flush()
cmd = ['init', f'--config={tmp.name}']
if initial_timeline_id:
cmd.extend(['--timeline-id', initial_timeline_id.hex])
append_pageserver_param_overrides(cmd,
self.env.pageserver.remote_storage,
self.env.pageserver.config_override)
res = self.raw_cli(cmd)
res.check_returncode()
return res
return self.raw_cli(cmd)
def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]':
start_args = ['pageserver', 'start', *overrides]
@@ -969,54 +896,38 @@ class ZenithCli:
def pg_create(
self,
branch_name: str,
node_name: Optional[str] = None,
node_name: str,
tenant_id: Optional[uuid.UUID] = None,
lsn: Optional[str] = None,
timeline_spec: Optional[str] = None,
port: Optional[int] = None,
) -> 'subprocess.CompletedProcess[str]':
args = [
'pg',
'create',
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
'--branch-name',
branch_name,
]
if lsn is not None:
args.extend(['--lsn', lsn])
args = ['pg', 'create']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
if port is not None:
args.extend(['--port', str(port)])
if node_name is not None:
args.append(node_name)
res = self.raw_cli(args)
res.check_returncode()
return res
args.append(f'--port={port}')
args.append(node_name)
if timeline_spec is not None:
args.append(timeline_spec)
return self.raw_cli(args)
def pg_start(
self,
node_name: str,
tenant_id: Optional[uuid.UUID] = None,
lsn: Optional[str] = None,
timeline_spec: Optional[str] = None,
port: Optional[int] = None,
) -> 'subprocess.CompletedProcess[str]':
args = [
'pg',
'start',
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
]
if lsn is not None:
args.append(f'--lsn={lsn}')
args = ['pg', 'start']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
if port is not None:
args.append(f'--port={port}')
if node_name is not None:
args.append(node_name)
args.append(node_name)
if timeline_spec is not None:
args.append(timeline_spec)
res = self.raw_cli(args)
res.check_returncode()
return res
return self.raw_cli(args)
def pg_stop(
self,
@@ -1024,16 +935,12 @@ class ZenithCli:
tenant_id: Optional[uuid.UUID] = None,
destroy=False,
) -> 'subprocess.CompletedProcess[str]':
args = [
'pg',
'stop',
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
]
args = ['pg', 'stop']
if tenant_id is not None:
args.extend(['--tenantid', tenant_id.hex])
if destroy:
args.append('--destroy')
if node_name is not None:
args.append(node_name)
args.append(node_name)
return self.raw_cli(args)
@@ -1108,7 +1015,8 @@ class ZenithPageserver(PgProtocol):
env: ZenithEnv,
port: PageserverPort,
remote_storage: Optional[RemoteStorage] = None,
config_override: Optional[str] = None):
config_override: Optional[str] = None,
enable_auth=False):
super().__init__(host='localhost', port=port.pg, username='zenith_admin')
self.env = env
self.running = False
@@ -1136,6 +1044,7 @@ class ZenithPageserver(PgProtocol):
if self.running:
self.env.zenith_cli.pageserver_stop(immediate)
self.running = False
return self
def __enter__(self):
@@ -1196,7 +1105,7 @@ class PgBin:
self.env = os.environ.copy()
self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib')
def _fixpath(self, command: List[str]):
def _fixpath(self, command: List[str]) -> None:
if '/' not in command[0]:
command[0] = os.path.join(self.pg_bin_path, command[0])
@@ -1207,7 +1116,7 @@ class PgBin:
env.update(env_add)
return env
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None) -> None:
"""
Run one of the postgres binaries.
@@ -1257,18 +1166,18 @@ class VanillaPostgres(PgProtocol):
self.running = False
self.pg_bin.run_capture(['initdb', '-D', pgdatadir])
def configure(self, options: List[str]):
def configure(self, options: List[str]) -> None:
"""Append lines into postgresql.conf file."""
assert not self.running
with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file:
conf_file.writelines(options)
def start(self):
def start(self) -> None:
assert not self.running
self.running = True
self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'start'])
def stop(self):
def stop(self) -> None:
assert self.running
self.running = False
self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'stop'])
@@ -1351,9 +1260,8 @@ class Postgres(PgProtocol):
def create(
self,
branch_name: str,
node_name: Optional[str] = None,
lsn: Optional[str] = None,
node_name: str,
branch: Optional[str] = None,
config_lines: Optional[List[str]] = None,
) -> 'Postgres':
"""
@@ -1364,21 +1272,19 @@ class Postgres(PgProtocol):
if not config_lines:
config_lines = []
self.node_name = node_name or f'{branch_name}_pg_node'
self.env.zenith_cli.pg_create(branch_name,
node_name=self.node_name,
if branch is None:
branch = node_name
self.env.zenith_cli.pg_create(node_name,
tenant_id=self.tenant_id,
lsn=lsn,
port=self.port)
port=self.port,
timeline_spec=branch)
self.node_name = node_name
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name
self.pgdata_dir = os.path.join(self.env.repo_dir, path)
if config_lines is None:
config_lines = []
# set small 'max_replication_write_lag' to enable backpressure
# and make tests more stable.
config_lines = ['max_replication_write_lag=15MB'] + config_lines
self.config(config_lines)
return self
@@ -1465,7 +1371,7 @@ class Postgres(PgProtocol):
if self.running:
assert self.node_name is not None
self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id)
self.env.zenith_cli.pg_stop(self.node_name, tenant_id=self.tenant_id)
self.running = False
return self
@@ -1477,16 +1383,15 @@ class Postgres(PgProtocol):
"""
assert self.node_name is not None
self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True)
self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, destroy=True)
self.node_name = None
return self
def create_start(
self,
branch_name: str,
node_name: Optional[str] = None,
lsn: Optional[str] = None,
node_name: str,
branch: Optional[str] = None,
config_lines: Optional[List[str]] = None,
) -> 'Postgres':
"""
@@ -1496,10 +1401,9 @@ class Postgres(PgProtocol):
"""
self.create(
branch_name=branch_name,
node_name=node_name,
branch=branch,
config_lines=config_lines,
lsn=lsn,
).start()
return self
@@ -1519,10 +1423,9 @@ class PostgresFactory:
self.instances: List[Postgres] = []
def create_start(self,
branch_name: str,
node_name: Optional[str] = None,
node_name: str = "main",
branch: Optional[str] = None,
tenant_id: Optional[uuid.UUID] = None,
lsn: Optional[str] = None,
config_lines: Optional[List[str]] = None) -> Postgres:
pg = Postgres(
@@ -1534,17 +1437,15 @@ class PostgresFactory:
self.instances.append(pg)
return pg.create_start(
branch_name=branch_name,
node_name=node_name,
branch=branch,
config_lines=config_lines,
lsn=lsn,
)
def create(self,
branch_name: str,
node_name: Optional[str] = None,
node_name: str = "main",
branch: Optional[str] = None,
tenant_id: Optional[uuid.UUID] = None,
lsn: Optional[str] = None,
config_lines: Optional[List[str]] = None) -> Postgres:
pg = Postgres(
@@ -1557,9 +1458,8 @@ class PostgresFactory:
self.instances.append(pg)
return pg.create(
branch_name=branch_name,
node_name=node_name,
lsn=lsn,
branch=branch,
config_lines=config_lines,
)
@@ -1662,7 +1562,7 @@ class SafekeeperMetrics:
class SafekeeperHttpClient(requests.Session):
def __init__(self, port: int):
def __init__(self, port: int) -> None:
super().__init__()
self.port = port
@@ -1780,7 +1680,7 @@ def list_files_to_compare(pgdata_dir: str):
# pg is the existing and running compute node, that we want to compare with a basebackup
def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres):
# Get the timeline ID. We need it for the 'basebackup' command
# Get the timeline ID of our branch. We need it for the 'basebackup' command
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SHOW zenith.zenith_timeline")

View File

@@ -30,16 +30,21 @@ def test_bulk_tenant_create(
for i in range(tenants_count):
start = timeit.default_timer()
tenant = env.zenith_cli.create_tenant()
env.zenith_cli.create_timeline(
f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant)
tenant = env.create_tenant()
env.zenith_cli.create_branch(
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
"main",
tenant_id=tenant)
# FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now?
#if use_wal_acceptors == 'with_wa':
# wa_factory.start_n_new(3)
pg_tenant = env.postgres.create_start(
f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant)
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
None, # branch name, None means same as node name
tenant,
)
end = timeit.default_timer()
time_slices.append(end - start)

View File

@@ -1,32 +0,0 @@
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
def test_get_page(zenith_simple_env: ZenithEnv, zenbenchmark: ZenithBenchmarker):
env = zenith_simple_env
env.zenith_cli.create_branch("test_pageserver", "empty")
pg = env.postgres.create_start('test_pageserver')
tenant_hex = env.initial_tenant.hex
timeline = pg.safe_psql("SHOW zenith.zenith_timeline")[0][0]
# Long-lived cursor, useful for flushing
psconn = env.pageserver.connect()
pscur = psconn.cursor()
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute('create table t (i integer);')
cur.execute('insert into t values (0);')
for i in range(1000):
cur.execute(f'update t set i = {i};')
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
cur.execute("select * from t;")
res = cur.fetchall()
print("AAAA")
print(res)
env.run_psbench(timeline)

View File

@@ -11,7 +11,7 @@ use std::io::{ErrorKind, Write};
use std::path::{Path, PathBuf};
use std::thread;
use tracing::*;
use walkeeper::control_file::{self};
use walkeeper::control_file::{self, CreateControlFile};
use zenith_utils::http::endpoint;
use zenith_utils::zid::ZNodeId;
use zenith_utils::{logging, tcp_listener, GIT_VERSION};
@@ -108,7 +108,10 @@ fn main() -> Result<()> {
.get_matches();
if let Some(addr) = arg_matches.value_of("dump-control-file") {
let state = control_file::FileStorage::load_control_file(Path::new(addr))?;
let state = control_file::FileStorage::load_control_file(
Path::new(addr),
CreateControlFile::False,
)?;
let json = serde_json::to_string(&state)?;
print!("{}", json);
return Ok(());

View File

@@ -27,6 +27,13 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control";
const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
// A named boolean.
#[derive(Debug)]
pub enum CreateControlFile {
True,
False,
}
lazy_static! {
static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_persist_control_file_seconds",
@@ -87,22 +94,28 @@ impl FileStorage {
pub fn load_control_file_conf(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
create: CreateControlFile,
) -> Result<SafeKeeperState> {
let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME);
Self::load_control_file(path)
Self::load_control_file(path, create)
}
/// Read in the control file.
/// If create=false and file doesn't exist, bails out.
pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> {
pub fn load_control_file<P: AsRef<Path>>(
control_file_path: P,
create: CreateControlFile,
) -> Result<SafeKeeperState> {
info!(
"loading control file {}",
"loading control file {}, create={:?}",
control_file_path.as_ref().display(),
create,
);
let mut control_file = OpenOptions::new()
.read(true)
.write(true)
.create(matches!(create, CreateControlFile::True))
.open(&control_file_path)
.with_context(|| {
format!(
@@ -111,32 +124,41 @@ impl FileStorage {
)
})?;
let mut buf = Vec::new();
control_file
.read_to_end(&mut buf)
.context("failed to read control file")?;
// An empty file is legit on 'create'; don't try to deser from it.
let state = if control_file.metadata().unwrap().len() == 0 {
if let CreateControlFile::False = create {
bail!("control file is empty");
}
SafeKeeperState::new()
} else {
let mut buf = Vec::new();
control_file
.read_to_end(&mut buf)
.context("failed to read control file")?;
let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]);
let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]);
let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] =
buf[buf.len() - CHECKSUM_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes);
let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] =
buf[buf.len() - CHECKSUM_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes);
ensure!(
calculated_checksum == expected_checksum,
format!(
"safekeeper control file checksum mismatch: expected {} got {}",
expected_checksum, calculated_checksum
)
);
let state = FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE])
.with_context(|| {
ensure!(
calculated_checksum == expected_checksum,
format!(
"while reading control file {}",
control_file_path.as_ref().display(),
"safekeeper control file checksum mismatch: expected {} got {}",
expected_checksum, calculated_checksum
)
})?;
);
FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]).with_context(
|| {
format!(
"while reading control file {}",
control_file_path.as_ref().display(),
)
},
)?
};
Ok(state)
}
}
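The layout this verification assumes is the serialized state (with its magic/version prefix) followed by a 4-byte little-endian CRC32C of everything before it. A minimal sketch of producing that trailer on the write side, which is not shown in this hunk (body is a placeholder Vec&lt;u8&gt;):

let mut file_bytes = body.clone();
let checksum = crc32c::crc32c(&body);
file_bytes.extend_from_slice(&checksum.to_le_bytes());
// On read, crc32c over all but the last CHECKSUM_SIZE bytes must equal
// u32::from_le_bytes(<last 4 bytes>) -- exactly the ensure!() above.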
@@ -225,38 +247,31 @@ mod test {
fn load_from_control_file(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
create: CreateControlFile,
) -> Result<(FileStorage, SafeKeeperState)> {
fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir");
Ok((
FileStorage::new(zttid, conf),
FileStorage::load_control_file_conf(conf, zttid)?,
FileStorage::load_control_file_conf(conf, zttid, create)?,
))
}
fn create(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
) -> Result<(FileStorage, SafeKeeperState)> {
fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir");
let state = SafeKeeperState::empty();
let mut storage = FileStorage::new(zttid, conf);
storage.persist(&state)?;
Ok((storage, state))
}
#[test]
fn test_read_write_safekeeper_state() {
let conf = stub_conf();
let zttid = ZTenantTimelineId::generate();
{
let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state");
let (mut storage, mut state) =
load_from_control_file(&conf, &zttid, CreateControlFile::True)
.expect("failed to read state");
// change something
state.commit_lsn = Lsn(42);
state.wal_start_lsn = Lsn(42);
storage.persist(&state).expect("failed to persist state");
}
let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state");
assert_eq!(state.commit_lsn, Lsn(42));
let (_, state) = load_from_control_file(&conf, &zttid, CreateControlFile::False)
.expect("failed to read state");
assert_eq!(state.wal_start_lsn, Lsn(42));
}
#[test]
@@ -264,10 +279,11 @@ mod test {
let conf = stub_conf();
let zttid = ZTenantTimelineId::generate();
{
let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state");
let (mut storage, mut state) =
load_from_control_file(&conf, &zttid, CreateControlFile::True)
.expect("failed to read state");
// change something
state.commit_lsn = Lsn(42);
state.wal_start_lsn = Lsn(42);
storage.persist(&state).expect("failed to persist state");
}
let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME);
@@ -275,7 +291,7 @@ mod test {
data[0] += 1; // change the first byte of the file to fail checksum validation
fs::write(&control_path, &data).expect("failed to write control file");
match load_from_control_file(&conf, &zttid) {
match load_from_control_file(&conf, &zttid, CreateControlFile::False) {
Err(err) => assert!(err
.to_string()
.contains("safekeeper control file checksum mismatch")),

View File

@@ -1,6 +1,6 @@
//! Code to deal with safekeeper control file upgrades
use crate::safekeeper::{
AcceptorState, Peers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry,
AcceptorState, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry,
};
use anyhow::{bail, Result};
use serde::{Deserialize, Serialize};
@@ -26,7 +26,7 @@ struct SafeKeeperStateV1 {
/// persistent acceptor state
acceptor_state: AcceptorStateV1,
/// information about server
server: ServerInfoV2,
server: ServerInfo,
/// Unique id of the last *elected* proposer we dealt with. Not needed
/// for correctness, exists for monitoring purposes.
proposer_uuid: PgUuid,
@@ -70,39 +70,6 @@ pub struct SafeKeeperStateV2 {
pub wal_start_lsn: Lsn,
}
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ServerInfoV3 {
/// Postgres server version
pub pg_version: u32,
pub system_id: SystemId,
#[serde(with = "hex")]
pub tenant_id: ZTenantId,
/// Zenith timelineid
#[serde(with = "hex")]
pub timeline_id: ZTimelineId,
pub wal_seg_size: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SafeKeeperStateV3 {
/// persistent acceptor state
pub acceptor_state: AcceptorState,
/// information about server
pub server: ServerInfoV3,
/// Unique id of the last *elected* proposer we dealt with. Not needed
/// for correctness, exists for monitoring purposes.
#[serde(with = "hex")]
pub proposer_uuid: PgUuid,
/// part of WAL acknowledged by quorum and available locally
pub commit_lsn: Lsn,
/// minimal LSN which may be needed for recovery of some safekeeper (end_lsn
/// of last record streamed to everyone)
pub truncate_lsn: Lsn,
// Safekeeper starts receiving WAL from this LSN, zeros before it ought to
// be skipped during decoding.
pub wal_start_lsn: Lsn,
}
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState> {
// migrate to storing full term history
if version == 1 {
@@ -116,20 +83,12 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
}]),
};
return Ok(SafeKeeperState {
tenant_id: oldstate.server.tenant_id,
timeline_id: oldstate.server.ztli,
acceptor_state: ac,
server: ServerInfo {
pg_version: oldstate.server.pg_version,
system_id: oldstate.server.system_id,
wal_seg_size: oldstate.server.wal_seg_size,
},
server: oldstate.server.clone(),
proposer_uuid: oldstate.proposer_uuid,
commit_lsn: oldstate.commit_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
truncate_lsn: oldstate.truncate_lsn,
wal_start_lsn: oldstate.wal_start_lsn,
});
// migrate to hexing some zids
} else if version == 2 {
@@ -138,40 +97,17 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
let server = ServerInfo {
pg_version: oldstate.server.pg_version,
system_id: oldstate.server.system_id,
wal_seg_size: oldstate.server.wal_seg_size,
};
return Ok(SafeKeeperState {
tenant_id: oldstate.server.tenant_id,
timeline_id: oldstate.server.ztli,
acceptor_state: oldstate.acceptor_state,
server,
proposer_uuid: oldstate.proposer_uuid,
commit_lsn: oldstate.commit_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
});
// migrate to moving ztenantid/ztli to the top and adding some lsns
} else if version == 3 {
info!("reading safekeeper control file version {}", version);
let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?;
let server = ServerInfo {
pg_version: oldstate.server.pg_version,
system_id: oldstate.server.system_id,
wal_seg_size: oldstate.server.wal_seg_size,
};
return Ok(SafeKeeperState {
tenant_id: oldstate.server.tenant_id,
timeline_id: oldstate.server.timeline_id,
acceptor_state: oldstate.acceptor_state,
server,
proposer_uuid: oldstate.proposer_uuid,
commit_lsn: oldstate.commit_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
truncate_lsn: oldstate.truncate_lsn,
wal_start_lsn: oldstate.wal_start_lsn,
});
}
bail!("unsupported safekeeper control file version {}", version)

View File

@@ -13,7 +13,6 @@ use postgres_ffi::xlog_utils::PG_TLI;
use regex::Regex;
use std::str::FromStr;
use std::sync::Arc;
use tracing::info;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend;
use zenith_utils::postgres_backend::PostgresBackend;
@@ -21,6 +20,7 @@ use zenith_utils::pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID
use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
use crate::callmemaybe::CallmeEvent;
use crate::control_file::CreateControlFile;
use tokio::sync::mpsc::UnboundedSender;
/// Safekeeper handler of postgres commands
@@ -101,19 +101,29 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> {
let cmd = parse_cmd(query_string)?;
info!("got query {:?}", query_string);
let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. })
|| matches!(cmd, SafekeeperPostgresCommand::IdentifySystem));
let tenantid = self.ztenantid.context("tenantid is required")?;
let timelineid = self.ztimelineid.context("timelineid is required")?;
if self.timeline.is_none() {
self.timeline.set(
&self.conf,
ZTenantTimelineId::new(tenantid, timelineid),
create,
)?;
// Is this command ztimeline-scoped?
match cmd {
SafekeeperPostgresCommand::StartWalPush { .. }
| SafekeeperPostgresCommand::StartReplication { .. }
| SafekeeperPostgresCommand::IdentifySystem
| SafekeeperPostgresCommand::JSONCtrl { .. } => {
let tenantid = self.ztenantid.context("tenantid is required")?;
let timelineid = self.ztimelineid.context("timelineid is required")?;
if self.timeline.is_none() {
// START_WAL_PUSH is the only command that initializes the timeline in production.
// There is also the JSON_CTRL command, which is used to initialize the timeline in tests.
let create_control_file = match cmd {
SafekeeperPostgresCommand::StartWalPush { .. }
| SafekeeperPostgresCommand::JSONCtrl { .. } => CreateControlFile::True,
_ => CreateControlFile::False,
};
self.timeline.set(
&self.conf,
ZTenantTimelineId::new(tenantid, timelineid),
create_control_file,
)?;
}
}
}
match cmd {

View File

@@ -1,3 +1,2 @@
pub mod models;
pub mod routes;
pub use routes::make_router;

View File

@@ -1,9 +0,0 @@
use serde::{Deserialize, Serialize};
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub tenant_id: ZTenantId,
pub timeline_id: ZTimelineId,
pub peer_ids: Vec<ZNodeId>,
}

View File

@@ -1,15 +1,14 @@
use hyper::{Body, Request, Response, StatusCode};
use serde::Serialize;
use serde::Serializer;
use std::fmt::Display;
use std::sync::Arc;
use zenith_utils::http::json::json_request;
use zenith_utils::http::{RequestExt, RouterBuilder};
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZNodeId;
use zenith_utils::zid::ZTenantTimelineId;
use crate::control_file::CreateControlFile;
use crate::safekeeper::Term;
use crate::safekeeper::TermHistory;
use crate::timeline::GlobalTimelines;
@@ -20,8 +19,6 @@ use zenith_utils::http::json::json_response;
use zenith_utils::http::request::parse_request_param;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use super::models::TimelineCreateRequest;
#[derive(Debug, Serialize)]
struct SafekeeperStatus {
id: ZNodeId,
@@ -69,11 +66,7 @@ struct TimelineStatus {
#[serde(serialize_with = "display_serialize")]
commit_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
s3_wal_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
peer_horizon_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
remote_consistent_lsn: Lsn,
truncate_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
flush_lsn: Lsn,
}
@@ -85,7 +78,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
parse_request_param(&request, "timeline_id")?,
);
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
let tli = GlobalTimelines::get(get_conf(&request), zttid, CreateControlFile::False)
.map_err(ApiError::from_err)?;
let sk_state = tli.get_info();
let flush_lsn = tli.get_end_of_wal();
@@ -100,27 +94,12 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
timeline_id: zttid.timeline_id,
acceptor_state: acc_state,
commit_lsn: sk_state.commit_lsn,
s3_wal_lsn: sk_state.s3_wal_lsn,
peer_horizon_lsn: sk_state.peer_horizon_lsn,
remote_consistent_lsn: sk_state.remote_consistent_lsn,
truncate_lsn: sk_state.truncate_lsn,
flush_lsn,
};
Ok(json_response(StatusCode::OK, status)?)
}
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
let zttid = ZTenantTimelineId {
tenant_id: request_data.tenant_id,
timeline_id: request_data.timeline_id,
};
GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids)
.map_err(ApiError::from_err)?;
Ok(json_response(StatusCode::CREATED, ())?)
}
/// Safekeeper http router.
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
let router = endpoint::make_router();
@@ -131,5 +110,4 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
"/v1/timeline/:tenant_id/:timeline_id",
timeline_status_handler,
)
.post("/v1/timeline", timeline_create_handler)
}

View File

@@ -10,8 +10,6 @@ use std::cmp::min;
use std::fmt;
use std::io::Read;
use tracing::*;
use zenith_utils::zid::ZNodeId;
use zenith_utils::zid::ZTenantTimelineId;
use lazy_static::lazy_static;
@@ -27,13 +25,12 @@ use zenith_utils::pq_proto::ZenithFeedback;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 4;
pub const SK_FORMAT_VERSION: u32 = 3;
const SK_PROTOCOL_VERSION: u32 = 1;
const UNKNOWN_SERVER_VERSION: u32 = 0;
/// Consensus logical timestamp.
pub type Term = u64;
const INVALID_TERM: Term = 0;
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TermSwitchEntry {
@@ -131,47 +128,18 @@ pub struct ServerInfo {
/// Postgres server version
pub pg_version: u32,
pub system_id: SystemId,
pub wal_seg_size: u32,
}
/// Data published by safekeeper to the peers
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerInfo {
/// LSN up to which safekeeper offloaded WAL to s3.
s3_wal_lsn: Lsn,
/// Term of the last entry.
term: Term,
/// LSN of the last record.
flush_lsn: Lsn,
/// Up to which LSN safekeeper regards its WAL as committed.
commit_lsn: Lsn,
}
impl PeerInfo {
fn new() -> Self {
Self {
s3_wal_lsn: Lsn(0),
term: INVALID_TERM,
flush_lsn: Lsn(0),
commit_lsn: Lsn(0),
}
}
}
// vector-based node id -> peer state map with very limited functionality we
// need/
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>);
/// Persistent information stored on safekeeper node
/// On disk data is prefixed by magic and format version and followed by checksum.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SafeKeeperState {
#[serde(with = "hex")]
pub tenant_id: ZTenantId,
/// Zenith timelineid
#[serde(with = "hex")]
pub timeline_id: ZTimelineId,
pub wal_seg_size: u32,
}
/// Persistent information stored on safekeeper node
/// On disk data is prefixed by magic and format version and followed by checksum.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SafeKeeperState {
/// persistent acceptor state
pub acceptor_state: AcceptorState,
/// information about server
@@ -180,33 +148,19 @@ pub struct SafeKeeperState {
/// for correctness, exists for monitoring purposes.
#[serde(with = "hex")]
pub proposer_uuid: PgUuid,
/// Part of WAL acknowledged by quorum and available locally. Always points
/// to record boundary.
/// part of WAL acknowledged by quorum and available locally
pub commit_lsn: Lsn,
/// First LSN not yet offloaded to s3. Useful to persist to avoid finding
/// out offloading progress on boot.
pub s3_wal_lsn: Lsn,
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
/// of last record streamed to everyone). Persisting it helps skipping
/// recovery in walproposer, generally we compute it from peers. In
/// walproposer proto called 'truncate_lsn'.
pub peer_horizon_lsn: Lsn,
/// LSN of the oldest known checkpoint made by pageserver and successfully
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
/// informational purposes, we receive it from pageserver.
pub remote_consistent_lsn: Lsn,
// Peers and their state as we remember it. Knowing peers themselves is
// fundamental, but state is saved here only for informational purposes and
// obviously can be stale. (Currently not saved at all, but let's reserve the
// space so we need fewer file format upgrades later.)
pub peers: Peers,
/// minimal LSN which may be needed for recovery of some safekeeper (end_lsn
/// of last record streamed to everyone)
pub truncate_lsn: Lsn,
// Safekeeper starts receiving WAL from this LSN, zeros before it ought to
// be skipped during decoding.
pub wal_start_lsn: Lsn,
}
impl SafeKeeperState {
pub fn new(zttid: &ZTenantTimelineId, peers: Vec<ZNodeId>) -> SafeKeeperState {
pub fn new() -> SafeKeeperState {
SafeKeeperState {
tenant_id: zttid.tenant_id,
timeline_id: zttid.timeline_id,
acceptor_state: AcceptorState {
term: 0,
term_history: TermHistory::empty(),
@@ -214,20 +168,21 @@ impl SafeKeeperState {
server: ServerInfo {
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
system_id: 0, /* Postgres system identifier */
tenant_id: ZTenantId::from([0u8; 16]),
timeline_id: ZTimelineId::from([0u8; 16]),
wal_seg_size: 0,
},
proposer_uuid: [0; 16],
commit_lsn: Lsn(0),
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: Lsn(0),
remote_consistent_lsn: Lsn(0),
peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()),
commit_lsn: Lsn(0), /* part of WAL acknowledged by quorum */
truncate_lsn: Lsn(0), /* minimal LSN which may be needed for recovery of some safekeeper */
wal_start_lsn: Lsn(0),
}
}
}
#[cfg(test)]
pub fn empty() -> Self {
SafeKeeperState::new(&ZTenantTimelineId::empty(), vec![])
impl Default for SafeKeeperState {
fn default() -> Self {
Self::new()
}
}
@@ -466,7 +421,6 @@ lazy_static! {
struct SafeKeeperMetrics {
commit_lsn: Gauge,
// WAL-related metrics are in WalStorageMetrics
}
impl SafeKeeperMetrics {
@@ -489,7 +443,7 @@ pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
/// not-yet-flushed pairs of same named fields in s.*
pub commit_lsn: Lsn,
pub peer_horizon_lsn: Lsn,
pub truncate_lsn: Lsn,
pub s: SafeKeeperState, // persistent part
pub control_store: CTRL,
@@ -508,14 +462,16 @@ where
wal_store: WAL,
state: SafeKeeperState,
) -> SafeKeeper<CTRL, WAL> {
if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id {
panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id);
if state.server.timeline_id != ZTimelineId::from([0u8; 16])
&& ztli != state.server.timeline_id
{
panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.server.timeline_id);
}
SafeKeeper {
metrics: SafeKeeperMetrics::new(state.tenant_id, ztli, state.commit_lsn),
metrics: SafeKeeperMetrics::new(state.server.tenant_id, ztli, state.commit_lsn),
commit_lsn: state.commit_lsn,
peer_horizon_lsn: state.peer_horizon_lsn,
truncate_lsn: state.truncate_lsn,
s: state,
control_store,
wal_store,
@@ -576,24 +532,12 @@ where
msg.pg_version, self.s.server.pg_version
);
}
if msg.tenant_id != self.s.tenant_id {
bail!(
"invalid tenant ID, got {}, expected {}",
msg.tenant_id,
self.s.tenant_id
);
}
if msg.ztli != self.s.timeline_id {
bail!(
"invalid timeline ID, got {}, expected {}",
msg.ztli,
self.s.timeline_id
);
}
// set basic info about server, if not yet
// TODO: verify that it doesn't change afterwards
self.s.server.system_id = msg.system_id;
self.s.server.tenant_id = msg.tenant_id;
self.s.server.timeline_id = msg.ztli;
self.s.server.wal_seg_size = msg.wal_seg_size;
self.control_store
.persist(&self.s)
@@ -624,7 +568,7 @@ where
term: self.s.acceptor_state.term,
vote_given: false as u64,
flush_lsn: self.wal_store.flush_lsn(),
truncate_lsn: self.s.peer_horizon_lsn,
truncate_lsn: self.s.truncate_lsn,
term_history: self.get_term_history(),
};
if self.s.acceptor_state.term < msg.term {
@@ -649,16 +593,14 @@ where
/// Form AppendResponse from current state.
fn append_response(&self) -> AppendResponse {
let ar = AppendResponse {
AppendResponse {
term: self.s.acceptor_state.term,
flush_lsn: self.wal_store.flush_lsn(),
commit_lsn: self.s.commit_lsn,
// will be filled by the upper code to avoid bothering safekeeper
hs_feedback: HotStandbyFeedback::empty(),
zenith_feedback: ZenithFeedback::empty(),
};
trace!("formed AppendResponse {:?}", ar);
ar
}
}
fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> {
@@ -713,11 +655,10 @@ where
if !msg.wal_data.is_empty() {
self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?;
// If this was the first record we ever received, initialize
// commit_lsn to help find_end_of_wal skip the hole in the
// beginning.
if self.s.commit_lsn == Lsn(0) {
self.s.commit_lsn = msg.h.begin_lsn;
// If this was the first record we ever received, remember LSN to help
// find_end_of_wal skip the hole in the beginning.
if self.s.wal_start_lsn == Lsn(0) {
self.s.wal_start_lsn = msg.h.begin_lsn;
sync_control_file = true;
require_flush = true;
}
@@ -744,36 +685,35 @@ where
.set(u64::from(self.commit_lsn) as f64);
}
self.peer_horizon_lsn = msg.h.truncate_lsn;
self.truncate_lsn = msg.h.truncate_lsn;
// Update truncate and commit LSN in control file.
// To avoid negative impact on performance of extra fsync, do it only
// when truncate_lsn delta exceeds WAL segment size.
sync_control_file |=
self.s.peer_horizon_lsn + (self.s.server.wal_seg_size as u64) < self.peer_horizon_lsn;
self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn;
if sync_control_file {
self.s.commit_lsn = self.commit_lsn;
self.s.peer_horizon_lsn = self.peer_horizon_lsn;
self.s.truncate_lsn = self.truncate_lsn;
}
if sync_control_file {
self.control_store.persist(&self.s)?;
}
trace!(
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
msg.wal_data.len(),
msg.h.end_lsn,
msg.h.commit_lsn,
msg.h.truncate_lsn,
require_flush,
);
// If flush_lsn hasn't updated, AppendResponse is not very useful.
if !require_flush {
return Ok(None);
}
let resp = self.append_response();
trace!(
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, resp {:?}",
msg.wal_data.len(),
msg.h.end_lsn,
msg.h.commit_lsn,
msg.h.truncate_lsn,
&resp,
);
Ok(Some(AcceptorProposerMessage::AppendResponse(resp)))
}
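To make the batching concrete: the control file is rewritten only once truncate_lsn (peer_horizon_lsn on the other side of this rename) has advanced by more than one WAL segment past the persisted value. A small worked example of the trigger condition, with illustrative numbers and a 16 MiB segment:

let wal_seg_size: u64 = 16 * 1024 * 1024; // 16 MiB
let persisted_truncate_lsn = Lsn(0x0100_0000); // value currently stored in the control file
let in_memory_truncate_lsn = Lsn(0x0100_0000 + wal_seg_size + 1);
// sync_control_file flips to true only after the delta exceeds a whole segment:
assert!(persisted_truncate_lsn + wal_seg_size < in_memory_truncate_lsn);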
@@ -834,11 +774,11 @@ mod tests {
#[test]
fn test_voting() {
let storage = InMemoryState {
persisted_state: SafeKeeperState::empty(),
persisted_state: SafeKeeperState::new(),
};
let wal_store = DummyWalStore { lsn: Lsn(0) };
let ztli = ZTimelineId::from([0u8; 16]);
let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty());
let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new());
// check voting for 1 is ok
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
@@ -866,11 +806,11 @@ mod tests {
#[test]
fn test_epoch_switch() {
let storage = InMemoryState {
persisted_state: SafeKeeperState::empty(),
persisted_state: SafeKeeperState::new(),
};
let wal_store = DummyWalStore { lsn: Lsn(0) };
let ztli = ZTimelineId::from([0u8; 16]);
let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty());
let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new());
let mut ar_hdr = AppendRequestHeader {
term: 1,

View File

@@ -1,7 +1,7 @@
//! This module contains timeline id -> safekeeper state map with file-backed
//! persistence and support for interaction between sending and receiving wal.
use anyhow::{bail, Context, Result};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
@@ -9,24 +9,22 @@ use std::cmp::{max, min};
use std::collections::HashMap;
use std::fs::{self};
use std::sync::{Arc, Condvar, Mutex, MutexGuard};
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;
use tokio::sync::mpsc::UnboundedSender;
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZNodeId, ZTenantTimelineId};
use zenith_utils::zid::ZTenantTimelineId;
use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey};
use crate::control_file::{self, CreateControlFile};
use crate::control_file;
use crate::control_file::Storage as cf_storage;
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
};
use crate::send_wal::HotStandbyFeedback;
use crate::wal_storage;
use crate::wal_storage::Storage as wal_storage_iface;
use crate::wal_storage::{self, Storage};
use crate::SafeKeeperConf;
use zenith_utils::pq_proto::ZenithFeedback;
@@ -89,39 +87,21 @@ struct SharedState {
}
impl SharedState {
/// Initialize timeline state, creating control file
fn create(
/// Restore SharedState from control file.
/// If create=false and file doesn't exist, bails out.
fn create_restore(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
peer_ids: Vec<ZNodeId>,
create: CreateControlFile,
) -> Result<Self> {
let state = SafeKeeperState::new(zttid, peer_ids);
let control_store = control_file::FileStorage::new(zttid, conf);
let wal_store = wal_storage::PhysicalStorage::new(zttid, conf);
let mut sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state);
sk.control_store.persist(&sk.s)?;
Ok(Self {
notified_commit_lsn: Lsn(0),
sk,
replicas: Vec::new(),
active: false,
num_computes: 0,
pageserver_connstr: None,
})
}
/// Restore SharedState from control file.
/// If file doesn't exist, bails out.
fn restore(conf: &SafeKeeperConf, zttid: &ZTenantTimelineId) -> Result<Self> {
let state = control_file::FileStorage::load_control_file_conf(conf, zttid)
let state = control_file::FileStorage::load_control_file_conf(conf, zttid, create)
.context("failed to load from control file")?;
let control_store = control_file::FileStorage::new(zttid, conf);
let wal_store = wal_storage::PhysicalStorage::new(zttid, conf);
info!("timeline {} restored", zttid.timeline_id);
info!("timeline {} created or restored", zttid.timeline_id);
Ok(Self {
notified_commit_lsn: Lsn(0),
@@ -438,13 +418,26 @@ impl Timeline {
// Utilities needed by various Connection-like objects
pub trait TimelineTools {
fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>;
fn set(
&mut self,
conf: &SafeKeeperConf,
zttid: ZTenantTimelineId,
create: CreateControlFile,
) -> Result<()>;
fn get(&self) -> &Arc<Timeline>;
}
impl TimelineTools for Option<Arc<Timeline>> {
fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()> {
fn set(
&mut self,
conf: &SafeKeeperConf,
zttid: ZTenantTimelineId,
create: CreateControlFile,
) -> Result<()> {
// We will only set the timeline once. If it were to ever change,
// anyone who cloned the Arc would be out of date.
assert!(self.is_none());
*self = Some(GlobalTimelines::get(conf, zttid, create)?);
Ok(())
}
@@ -463,73 +456,30 @@ lazy_static! {
pub struct GlobalTimelines;
impl GlobalTimelines {
fn create_internal(
mut timelines: MutexGuard<HashMap<ZTenantTimelineId, Arc<Timeline>>>,
conf: &SafeKeeperConf,
zttid: ZTenantTimelineId,
peer_ids: Vec<ZNodeId>,
) -> Result<Arc<Timeline>> {
match timelines.get(&zttid) {
Some(_) => bail!("timeline {} already exists", zttid),
None => {
// TODO: check directory existence
let dir = conf.timeline_dir(&zttid);
fs::create_dir_all(dir)?;
let shared_state = SharedState::create(conf, &zttid, peer_ids)
.context("failed to create shared state")?;
let new_tli = Arc::new(Timeline::new(zttid, shared_state));
timelines.insert(zttid, Arc::clone(&new_tli));
Ok(new_tli)
}
}
}
pub fn create(
conf: &SafeKeeperConf,
zttid: ZTenantTimelineId,
peer_ids: Vec<ZNodeId>,
) -> Result<Arc<Timeline>> {
let timelines = TIMELINES.lock().unwrap();
GlobalTimelines::create_internal(timelines, conf, zttid, peer_ids)
}
/// Get a timeline with control file loaded from the global TIMELINES map.
/// If control file doesn't exist, bails out.
/// If control file doesn't exist and create=false, bails out.
pub fn get(
conf: &SafeKeeperConf,
zttid: ZTenantTimelineId,
create: bool,
create: CreateControlFile,
) -> Result<Arc<Timeline>> {
let mut timelines = TIMELINES.lock().unwrap();
match timelines.get(&zttid) {
Some(result) => Ok(Arc::clone(result)),
None => {
let shared_state =
SharedState::restore(conf, &zttid).context("failed to restore shared state");
if let CreateControlFile::True = create {
let dir = conf.timeline_dir(&zttid);
info!(
"creating timeline dir {}, create is {:?}",
dir.display(),
create
);
fs::create_dir_all(dir)?;
}
let shared_state = match shared_state {
Ok(shared_state) => shared_state,
Err(error) => {
// TODO: always create timeline explicitly
if error
.root_cause()
.to_string()
.contains("No such file or directory")
&& create
{
return GlobalTimelines::create_internal(
timelines,
conf,
zttid,
vec![],
);
} else {
return Err(error);
}
}
};
let shared_state = SharedState::create_restore(conf, &zttid, create)
.context("failed to restore shared state")?;
let new_tli = Arc::new(Timeline::new(zttid, shared_state));
timelines.insert(zttid, Arc::clone(&new_tli));
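The CreateControlFile argument therefore gives GlobalTimelines::get two modes; illustrative call sites (the first mirrors START_WAL_PUSH/JSON_CTRL, the second the HTTP status handler and replication):

// WAL push / JSON_CTRL: create the timeline dir and control file if missing.
let tli = GlobalTimelines::get(&conf, zttid, CreateControlFile::True)?;
// Status / replication: only attach to an already existing timeline, otherwise fail.
let tli = GlobalTimelines::get(&conf, zttid, CreateControlFile::False)?;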

View File

@@ -301,8 +301,7 @@ impl Storage for PhysicalStorage {
/// allows postponing its initialization.
fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> {
if state.server.wal_seg_size == 0 {
// wal_seg_size is still unknown. This is dead path normally, should
// be used only in tests.
// wal_seg_size is still unknown
return Ok(());
}
@@ -316,13 +315,9 @@ impl Storage for PhysicalStorage {
let wal_seg_size = state.server.wal_seg_size as usize;
self.wal_seg_size = Some(wal_seg_size);
// Find out where stored WAL ends, starting at commit_lsn which is a
// known recent record boundary (unless we don't have WAL at all).
self.write_lsn = if state.commit_lsn == Lsn(0) {
Lsn(0)
} else {
Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.commit_lsn)?.0)
};
// we need to read WAL from disk to know which LSNs are stored on disk
self.write_lsn =
Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.wal_start_lsn)?.0);
self.write_record_lsn = self.write_lsn;
@@ -331,13 +326,11 @@ impl Storage for PhysicalStorage {
self.update_flush_lsn();
info!(
"initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}",
self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.peer_horizon_lsn,
"initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, truncate_lsn={}",
self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.truncate_lsn,
);
if self.flush_record_lsn < state.commit_lsn
|| self.flush_record_lsn < state.peer_horizon_lsn
{
warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", self.zttid.timeline_id);
if self.flush_record_lsn < state.commit_lsn || self.flush_record_lsn < state.truncate_lsn {
warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or truncate_lsn from control file", self.zttid.timeline_id);
}
Ok(())

View File

@@ -1,4 +1,4 @@
use anyhow::{anyhow, bail, Context, Result};
use anyhow::{bail, Context, Result};
use clap::{App, AppSettings, Arg, ArgMatches};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env;
@@ -9,7 +9,7 @@ use pageserver::config::defaults::{
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
};
use std::collections::{BTreeSet, HashMap};
use std::collections::HashMap;
use std::process::exit;
use std::str::FromStr;
use walkeeper::defaults::{
@@ -17,17 +17,15 @@ use walkeeper::defaults::{
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
use zenith_utils::auth::{Claims, Scope};
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId};
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
use zenith_utils::GIT_VERSION;
use pageserver::timelines::TimelineInfo;
use pageserver::branches::BranchInfo;
// Default id of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1);
const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
fn default_conf() -> String {
format!(
@@ -55,15 +53,13 @@ http_port = {safekeeper_http_port}
}
///
/// Timelines tree element used as a value in the HashMap.
/// Branches tree element used as a value in the HashMap.
///
struct TimelineTreeEl {
/// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call.
pub info: TimelineInfo,
/// Name, recovered from zenith config mappings
pub name: Option<String>,
/// Holds all direct children of this timeline referenced using `timeline_id`.
pub children: BTreeSet<ZTimelineId>,
struct BranchTreeEl {
/// `BranchInfo` received from the `pageserver` via the `branch_list` libpq API call.
pub info: BranchInfo,
/// Holds all direct children of this branch referenced using `timeline_id`.
pub children: Vec<String>,
}
// Main entry point for the 'zenith' CLI utility
@@ -74,28 +70,29 @@ struct TimelineTreeEl {
// * Providing CLI api to the pageserver
// * TODO: export/import to/from usual postgres
fn main() -> Result<()> {
let branch_name_arg = Arg::new("branch-name")
.long("branch-name")
.takes_value(true)
.help("Name of the branch to be created or used as an alias for other services")
#[rustfmt::skip] // rustfmt squashes these into a single line otherwise
let pg_node_arg = Arg::new("node")
.index(1)
.help("Node name")
.required(true);
#[rustfmt::skip]
let safekeeper_id_arg = Arg::new("id")
.index(1)
.help("safekeeper id")
.required(false);
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
let timeline_arg = Arg::new("timeline")
.index(2)
.help("Branch name or a point-in time specification")
.required(false);
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
let tenant_id_arg = Arg::new("tenant-id")
.long("tenant-id")
let tenantid_arg = Arg::new("tenantid")
.long("tenantid")
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
.takes_value(true)
.required(false);
let timeline_id_arg = Arg::new("timeline-id")
.long("timeline-id")
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
.takes_value(true)
.required(false);
let port_arg = Arg::new("port")
.long("port")
.required(false)
@@ -117,12 +114,6 @@ fn main() -> Result<()> {
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
.takes_value(true)
.required(false);
let matches = App::new("Zenith CLI")
.setting(AppSettings::ArgRequiredElseHelp)
.version(GIT_VERSION)
@@ -130,7 +121,6 @@ fn main() -> Result<()> {
App::new("init")
.about("Initialize a new Zenith repository")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
@@ -139,32 +129,17 @@ fn main() -> Result<()> {
)
)
.subcommand(
App::new("timeline")
.about("Manage timelines")
.subcommand(App::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
.subcommand(App::new("branch")
.about("Create a new timeline, using another timeline as a base, copying its data")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true)
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true)
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
.subcommand(App::new("create")
.about("Create a new blank timeline")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone()))
App::new("branch")
.about("Create a new branch")
.arg(Arg::new("branchname").required(false).index(1))
.arg(Arg::new("start-point").required(false).index(2))
.arg(tenantid_arg.clone()),
).subcommand(
App::new("tenant")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage tenants")
.subcommand(App::new("list"))
.subcommand(App::new("create")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
)
.subcommand(App::new("create").arg(Arg::new("tenantid").required(false).index(1)))
)
.subcommand(
App::new("pageserver")
@@ -199,13 +174,12 @@ fn main() -> Result<()> {
App::new("pg")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage postgres instances")
.subcommand(App::new("list").arg(tenant_id_arg.clone()))
.subcommand(App::new("list").arg(tenantid_arg.clone()))
.subcommand(App::new("create")
.about("Create a postgres compute node")
.arg(pg_node_arg.clone())
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(lsn_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(port_arg.clone())
.arg(
Arg::new("config-only")
@@ -216,21 +190,20 @@ fn main() -> Result<()> {
.subcommand(App::new("start")
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(timeline_id_arg.clone())
.arg(lsn_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(port_arg.clone()))
.subcommand(
App::new("stop")
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
.long("destroy")
.required(false)
)
.arg(pg_node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
.long("destroy")
.required(false)
)
)
)
@@ -252,89 +225,75 @@ fn main() -> Result<()> {
};
// Check for 'zenith init' command first.
let subcommand_result = if sub_name == "init" {
handle_init(sub_args).map(Some)
let subcmd_result = if sub_name == "init" {
handle_init(sub_args)
} else {
// all other commands need an existing config
let mut env = LocalEnv::load_config().context("Error loading config")?;
let original_env = env.clone();
let env = match LocalEnv::load_config() {
Ok(conf) => conf,
Err(e) => {
eprintln!("Error loading config: {}", e);
exit(1);
}
};
let subcommand_result = match sub_name {
"tenant" => handle_tenant(sub_args, &mut env),
"timeline" => handle_timeline(sub_args, &mut env),
match sub_name {
"tenant" => handle_tenant(sub_args, &env),
"branch" => handle_branch(sub_args, &env),
"start" => handle_start_all(sub_args, &env),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => handle_pageserver(sub_args, &env),
"pg" => handle_pg(sub_args, &env),
"safekeeper" => handle_safekeeper(sub_args, &env),
_ => bail!("unexpected subcommand {}", sub_name),
};
if original_env != env {
subcommand_result.map(|()| Some(env))
} else {
subcommand_result.map(|()| None)
}
};
match subcommand_result {
Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
Ok(None) => (),
Err(e) => {
eprintln!("command failed: {:?}", e);
exit(1);
}
if let Err(e) = subcmd_result {
eprintln!("command failed: {:#}", e);
exit(1);
}
Ok(())
}
///
/// Prints timelines list as a tree-like structure.
/// Prints branches list as a tree-like structure.
///
fn print_timelines_tree(
timelines: Vec<TimelineInfo>,
mut timeline_name_mappings: HashMap<ZTenantTimelineId, String>,
) -> Result<()> {
let mut timelines_hash = timelines
.iter()
.map(|t| {
(
t.timeline_id(),
TimelineTreeEl {
info: t.clone(),
children: BTreeSet::new(),
name: timeline_name_mappings
.remove(&ZTenantTimelineId::new(t.tenant_id(), t.timeline_id())),
},
)
})
.collect::<HashMap<_, _>>();
fn print_branches_tree(branches: Vec<BranchInfo>) -> Result<()> {
let mut branches_hash: HashMap<String, BranchTreeEl> = HashMap::new();
// Memorize all direct children of each timeline.
for timeline in &timelines {
if let TimelineInfo::Local {
ancestor_timeline_id: Some(tid),
..
} = timeline
{
timelines_hash
// Form a hash table of branch timeline_id -> BranchTreeEl.
for branch in &branches {
branches_hash.insert(
branch.timeline_id.to_string(),
BranchTreeEl {
info: branch.clone(),
children: Vec::new(),
},
);
}
// Memorize all direct children of each branch.
for branch in &branches {
if let Some(tid) = &branch.ancestor_id {
branches_hash
.get_mut(tid)
.context("missing timeline info in the HashMap")?
.context("missing branch info in the HashMap")?
.children
.insert(timeline.timeline_id());
.push(branch.timeline_id.to_string());
}
}
for timeline in timelines_hash.values() {
// Start with root local timelines (no ancestors) first.
if let TimelineInfo::Local {
ancestor_timeline_id,
..
} = &timeline.info
{
if ancestor_timeline_id.is_none() {
print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?;
}
// Sort children by tid to bring some minimal order.
for branch in &mut branches_hash.values_mut() {
branch.children.sort();
}
for branch in branches_hash.values() {
// Start with root branches (no ancestors) first.
// Now there is only the 'main' branch, but things may change.
if branch.info.ancestor_id.is_none() {
print_branch(0, &Vec::from([true]), branch, &branches_hash)?;
}
}
@@ -342,32 +301,27 @@ fn print_timelines_tree(
}
///
/// Recursively prints timeline info with all its children.
/// Recursively prints branch info with all its children.
///
fn print_timeline(
fn print_branch(
nesting_level: usize,
is_last: &[bool],
timeline: &TimelineTreeEl,
timelines: &HashMap<ZTimelineId, TimelineTreeEl>,
branch: &BranchTreeEl,
branches: &HashMap<String, BranchTreeEl>,
) -> Result<()> {
let local_or_remote = match timeline.info {
TimelineInfo::Local { .. } => "(L)",
TimelineInfo::Remote { .. } => "(R)",
};
// Draw main padding
print!("{} ", local_or_remote);
print!(" ");
if nesting_level > 0 {
let lsn_string = match &timeline.info {
TimelineInfo::Local { ancestor_lsn, .. } => ancestor_lsn
.map(|lsn| lsn.to_string())
.unwrap_or_else(|| "Unknown local Lsn".to_string()),
TimelineInfo::Remote { .. } => "unknown Lsn (remote)".to_string(),
};
let lsn = branch
.info
.ancestor_lsn
.as_ref()
.context("missing branch info in the HashMap")?;
let mut br_sym = "┣━";
// Draw each nesting padding with proper style
// depending on whether its timeline ended or not.
// depending on whether its branch ended or not.
if nesting_level > 1 {
for l in &is_last[1..is_last.len() - 1] {
if *l {
@@ -378,92 +332,73 @@ fn print_timeline(
}
}
// We are the last in this sub-timeline
// We are the last in this sub-branch
if *is_last.last().unwrap() {
br_sym = "┗━";
}
print!("{} @{}: ", br_sym, lsn_string);
print!("{} @{}: ", br_sym, lsn);
}
// Finally print a timeline id and name with new line
println!(
"{} [{}]",
timeline.name.as_deref().unwrap_or("_no_name_"),
timeline.info.timeline_id()
);
// Finally print a branch name with new line
println!("{}", branch.info.name);
let len = timeline.children.len();
let len = branch.children.len();
let mut i: usize = 0;
let mut is_last_new = Vec::from(is_last);
is_last_new.push(false);
for child in &timeline.children {
for child in &branch.children {
i += 1;
// Mark that the last padding is the end of the timeline
// Mark that the last padding is the end of the branch
if i == len {
if let Some(last) = is_last_new.last_mut() {
*last = true;
}
}
print_timeline(
print_branch(
nesting_level + 1,
&is_last_new,
timelines
branches
.get(child)
.context("missing timeline info in the HashMap")?,
timelines,
.context("missing branch info in the HashMap")?,
branches,
)?;
}
Ok(())
}
/// Returns a map of timeline IDs to timeline_id@lsn strings.
/// Returns a map of timeline IDs to branch_name@lsn strings.
/// Connects to the pageserver to query this information.
fn get_timeline_infos(
fn get_branch_infos(
env: &local_env::LocalEnv,
tenant_id: &ZTenantId,
) -> Result<HashMap<ZTimelineId, TimelineInfo>> {
Ok(PageServerNode::from_env(env)
.timeline_list(tenant_id)?
tenantid: &ZTenantId,
) -> Result<HashMap<ZTimelineId, BranchInfo>> {
let page_server = PageServerNode::from_env(env);
let branch_infos: Vec<BranchInfo> = page_server.branch_list(tenantid)?;
let branch_infos: HashMap<ZTimelineId, BranchInfo> = branch_infos
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id(), timeline_info))
.collect())
.map(|branch_info| (branch_info.timeline_id, branch_info))
.collect();
Ok(branch_infos)
}
// Helper function to parse --tenant_id option, or get the default from config file
fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<ZTenantId> {
if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() {
tenant_id_from_arguments
} else if let Some(tenantid_conf) = env.default_tenant_id {
// Helper function to parse --tenantid option, or get the default from config file
fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<ZTenantId> {
if let Some(tenantid_cmd) = sub_match.value_of("tenantid") {
Ok(ZTenantId::from_str(tenantid_cmd)?)
} else if let Some(tenantid_conf) = env.default_tenantid {
Ok(ZTenantId::from(tenantid_conf))
} else {
bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
bail!("No tenantid. Use --tenantid, or set 'default_tenantid' in the config file");
}
}
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<ZTenantId>> {
sub_match
.value_of("tenant-id")
.map(ZTenantId::from_str)
.transpose()
.context("Failed to parse tenant id from the argument string")
}
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<ZTimelineId>> {
sub_match
.value_of("timeline-id")
.map(ZTimelineId::from_str)
.transpose()
.context("Failed to parse timeline id from the argument string")
}
fn handle_init(init_match: &ArgMatches) -> Result<LocalEnv> {
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
fn handle_init(init_match: &ArgMatches) -> Result<()> {
// Create config file
let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
// load and parse the file
@@ -479,29 +414,18 @@ fn handle_init(init_match: &ArgMatches) -> Result<LocalEnv> {
env.init()
.context("Failed to initialize zenith repository")?;
// default_tenantid was generated by the `env.init()` call above
let initial_tenant_id = ZTenantId::from(env.default_tenant_id.unwrap());
// Call 'pageserver init'.
let pageserver = PageServerNode::from_env(&env);
let initial_timeline_id = pageserver
.init(
Some(initial_tenant_id),
initial_timeline_id_arg,
&pageserver_config_overrides(init_match),
)
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {}", e);
exit(1);
});
if let Err(e) = pageserver.init(
// default_tenantid was generated by the `env.init()` call above
Some(&ZTenantId::from(env.default_tenantid.unwrap()).to_string()),
&pageserver_config_overrides(init_match),
) {
eprintln!("pageserver init failed: {}", e);
exit(1);
}
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_owned(),
initial_tenant_id,
initial_timeline_id,
)?;
Ok(env)
Ok(())
}
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
@@ -512,7 +436,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
.collect()
}
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
match tenant_match.subcommand() {
Some(("list", _)) => {
@@ -521,16 +445,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re
}
}
Some(("create", create_match)) => {
let initial_tenant_id = parse_tenant_id(create_match)?;
let new_tenant_id = pageserver
.tenant_create(initial_tenant_id)?
.ok_or_else(|| {
anyhow!("Tenant with id {:?} was already created", initial_tenant_id)
})?;
println!(
"tenant {} successfully created on the pageserver",
new_tenant_id
);
let tenantid = match create_match.value_of("tenantid") {
Some(tenantid) => ZTenantId::from_str(tenantid)?,
None => ZTenantId::generate(),
};
println!("using tenant id {}", tenantid);
pageserver.tenant_create(tenantid)?;
println!("tenant successfully created on the pageserver");
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
@@ -538,94 +459,24 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re
Ok(())
}
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
match timeline_match.subcommand() {
Some(("list", list_match)) => {
let tenant_id = get_tenant_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_id)?;
print_timelines_tree(timelines, env.timeline_name_mappings())?;
}
Some(("create", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
let new_branch_name = create_match
.value_of("branch-name")
.ok_or(anyhow!("No branch name provided"))?;
let timeline = pageserver
.timeline_create(tenant_id, None, None, None)?
.ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?;
let new_timeline_id = timeline.timeline_id();
let tenantid = get_tenantid(branch_match, env)?;
let last_record_lsn = match timeline {
TimelineInfo::Local {
last_record_lsn, ..
} => last_record_lsn,
TimelineInfo::Remote { .. } => {
bail!(
"Timeline {} was created as remote, not local",
new_timeline_id
)
}
};
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
println!(
"Created timeline '{}' at Lsn {} for tenant: {}",
timeline.timeline_id(),
last_record_lsn,
tenant_id,
);
}
Some(("branch", branch_match)) => {
let tenant_id = get_tenant_id(branch_match, env)?;
let new_branch_name = branch_match
.value_of("branch-name")
.ok_or(anyhow!("No branch name provided"))?;
let ancestor_branch_name = branch_match
.value_of("ancestor-branch-name")
.unwrap_or(DEFAULT_BRANCH_NAME);
let ancestor_timeline_id = env
.get_branch_timeline_id(ancestor_branch_name, tenant_id)
.ok_or_else(|| {
anyhow!(
"Found no timeline id for branch name '{}'",
ancestor_branch_name
)
})?;
let start_lsn = branch_match
.value_of("ancestor-start-lsn")
.map(Lsn::from_str)
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline = pageserver
.timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))?
.ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?;
let new_timeline_id = timeline.timeline_id();
let last_record_lsn = match timeline {
TimelineInfo::Local {
last_record_lsn, ..
} => last_record_lsn,
TimelineInfo::Remote { .. } => bail!(
"Timeline {} was created as remote, not local",
new_timeline_id
),
};
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
println!(
"Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'",
timeline.timeline_id(),
last_record_lsn,
tenant_id,
ancestor_branch_name,
);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
if let Some(branchname) = branch_match.value_of("branchname") {
let startpoint_str = branch_match
.value_of("start-point")
.context("Missing start-point")?;
let branch = pageserver.branch_create(branchname, startpoint_str, &tenantid)?;
println!(
"Created branch '{}' at {:?} for tenant: {}",
branch.name, branch.latest_valid_lsn, tenantid,
);
} else {
// No arguments, list branches for tenant
let branches = pageserver.branch_list(&tenantid)?;
print_branches_tree(branches)?;
}
Ok(())
@@ -639,90 +490,63 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let mut cplane = ComputeControlPlane::load(env.clone())?;
// All subcommands take an optional --tenant-id option
let tenant_id = get_tenant_id(sub_args, env)?;
// All subcommands take an optional --tenantid option
let tenantid = get_tenantid(sub_args, env)?;
match sub_name {
"list" => {
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| {
eprintln!("Failed to load branch info: {}", e);
HashMap::new()
});
let timeline_name_mappings = env.timeline_name_mappings();
println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS");
println!("NODE\tADDRESS\t\tBRANCH\tLSN\t\tSTATUS");
for ((_, node_name), node) in cplane
.nodes
.iter()
.filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id)
.filter(|((node_tenantid, _), _)| node_tenantid == &tenantid)
{
// FIXME: This shows the LSN at the end of the timeline. It's not the
// right thing to do for read-only nodes that might be anchored at an
// older point in time, or following but lagging behind the primary.
let lsn_str = timeline_infos
.get(&node.timeline_id)
.map(|bi| match bi {
TimelineInfo::Local {
last_record_lsn, ..
} => last_record_lsn.to_string(),
TimelineInfo::Remote { .. } => "? (remote)".to_string(),
})
.unwrap_or_else(|| '?'.to_string());
let branch_name = timeline_name_mappings
.get(&ZTenantTimelineId::new(tenant_id, node.timeline_id))
.map(|name| name.as_str())
.unwrap_or("?");
let lsn_str = branch_infos
.get(&node.timelineid)
.map(|bi| bi.latest_valid_lsn.to_string())
.unwrap_or_else(|| "?".to_string());
println!(
"{}\t{}\t{}\t{}\t{}\t{}",
"{}\t{}\t{}\t{}\t{}",
node_name,
node.address,
node.timeline_id,
branch_name,
node.timelineid, // FIXME: resolve human-friendly branch name
lsn_str,
node.status(),
);
}
}
"create" => {
let branch_name = sub_args
.value_of("branch-name")
.unwrap_or(DEFAULT_BRANCH_NAME);
let node_name = sub_args
.value_of("node")
.map(ToString::to_string)
.unwrap_or_else(|| format!("{}_node", branch_name));
let lsn = sub_args
.value_of("lsn")
.map(Lsn::from_str)
.transpose()
.context("Failed to parse Lsn from the request")?;
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?;
let node_name = sub_args.value_of("node").unwrap_or("main");
let timeline_name = sub_args.value_of("timeline").unwrap_or(node_name);
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?;
cplane.new_node(tenantid, node_name, timeline_name, port)?;
}
"start" => {
let node_name = sub_args.value_of("node").unwrap_or("main");
let timeline_name = sub_args.value_of("timeline");
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
let node_name = sub_args
.value_of("node")
.ok_or_else(|| anyhow!("No node name was provided to start"))?;
let node = cplane.nodes.get(&(tenant_id, node_name.to_owned()));
let node = cplane.nodes.get(&(tenantid, node_name.to_owned()));
let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) {
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
let claims = Claims::new(Some(tenantid), Scope::Tenant);
Some(env.generate_auth_token(&claims)?)
} else {
@@ -730,49 +554,40 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
};
if let Some(node) = node {
if timeline_name.is_some() {
println!("timeline name ignored because node exists already");
}
println!("Starting existing postgres {}...", node_name);
node.start(&auth_token)?;
} else {
let branch_name = sub_args
.value_of("branch-name")
.unwrap_or(DEFAULT_BRANCH_NAME);
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| {
anyhow!("Found no timeline id for branch name '{}'", branch_name)
})?;
let lsn = sub_args
.value_of("lsn")
.map(Lsn::from_str)
.transpose()
.context("Failed to parse Lsn from the request")?;
// when used with custom port this results in non obvious behaviour
// port is remembered from first start command, i e
// start --port X
// stop
// start <-- will also use port X even without explicit port argument
let timeline_name = timeline_name.unwrap_or(node_name);
println!(
"Starting new postgres {} on timeline {} ...",
node_name, timeline_id
"Starting new postgres {} on {}...",
node_name, timeline_name
);
let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?;
let node = cplane.new_node(tenantid, node_name, timeline_name, port)?;
node.start(&auth_token)?;
}
}
"stop" => {
let node_name = sub_args
.value_of("node")
.ok_or_else(|| anyhow!("No node name was provided to stop"))?;
let node_name = sub_args.value_of("node").unwrap_or("main");
let destroy = sub_args.is_present("destroy");
let node = cplane
.nodes
.get(&(tenant_id, node_name.to_owned()))
.get(&(tenantid, node_name.to_owned()))
.with_context(|| format!("postgres {} is not found", node_name))?;
node.stop(destroy)?;
}
_ => bail!("Unexpected pg subcommand '{}'", sub_name),
_ => {
bail!("Unexpected pg subcommand '{}'", sub_name)
}
}
Ok(())

View File

@@ -334,10 +334,6 @@ impl ZTenantTimelineId {
pub fn generate() -> Self {
Self::new(ZTenantId::generate(), ZTimelineId::generate())
}
pub fn empty() -> Self {
Self::new(ZTenantId::from([0u8; 16]), ZTimelineId::from([0u8; 16]))
}
}
impl fmt::Display for ZTenantTimelineId {