Merge with main

Konstantin Knizhnik
2021-08-20 10:55:34 +03:00
41 changed files with 4298 additions and 245 deletions


@@ -7,7 +7,7 @@ executors:
zenith-build-executor:
resource_class: xlarge
docker:
- image: cimg/rust:1.51.0
- image: cimg/rust:1.52.1
jobs:
@@ -237,6 +237,23 @@ jobs:
- store_test_results:
path: /tmp/test_output
# Build zenithdb/zenith:latest image and push it to Docker Hub
docker-image:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push Docker image
command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
workflows:
build_and_test:
jobs:
@@ -265,3 +282,14 @@ workflows:
test_selection: batch_others
requires:
- build-zenith-<< matrix.build_type >>
- docker-image:
# Context gives the ability to log in
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- main
requires:
- pg_regress tests release
- other tests release

Cargo.lock (generated)

@@ -82,6 +82,30 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "aversion"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41992ab8cfcc3026ef9abceffe0c2b0479c043183fc23825e30d22baab6df334"
dependencies = [
"aversion-macros",
"byteorder",
"serde",
"serde_cbor",
"thiserror",
]
[[package]]
name = "aversion-macros"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ba5785f953985aa0caca927ba4005880f3b4f53de87f134e810ae3549f744d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "aws-creds"
version = "0.26.0"
@@ -166,6 +190,18 @@ dependencies = [
"generic-array",
]
[[package]]
name = "bookfile"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efa3e2086414e1bbecbc10730f265e5b079ab4ea0b830e7219a70dab6471e753"
dependencies = [
"aversion",
"byteorder",
"serde",
"thiserror",
]
[[package]]
name = "boxfnonce"
version = "0.1.1"
@@ -646,6 +682,12 @@ dependencies = [
"tracing",
]
[[package]]
name = "half"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
[[package]]
name = "hashbrown"
version = "0.9.1"
@@ -1139,6 +1181,7 @@ name = "pageserver"
version = "0.1.0"
dependencies = [
"anyhow",
"bookfile",
"byteorder",
"bytes",
"chrono",
@@ -1735,6 +1778,16 @@ dependencies = [
"xml-rs",
]
[[package]]
name = "serde_cbor"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622"
dependencies = [
"half",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.126"


@@ -1,94 +1,77 @@
#
# Docker image for console integration testing.
#
# We may also reuse it in CI to unify the installation process and as a general
# binary-building tool for production servers.
#
# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls
# bindgen with the "dynamic" feature flag. This also prevents usage of the dockerhub alpine-rust
# images, which are statically linked and have guards against any dlopen. I would
# prefer all-static binaries, so we may change the way librocksdb-sys builds or wait until
# we have our own storage and drop the rocksdb dependency.
#
# Cargo-chef is used to separate dependency building from main binary building. This
# way `docker build` will download and install dependencies only if there are changes to
# our Cargo.toml files.
#
#
# build postgres separately -- this layer will be rebuilt only if one of
# mentioned paths will get any changes
# Build Postgres separately --- this layer will be rebuilt only if one of
# the mentioned paths changes.
#
FROM alpine:3.13 as pg-build
RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev
WORKDIR zenith
FROM zenithdb/build:buster AS pg-build
WORKDIR /zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
# Build using clang and lld
RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
#
# Calculate cargo dependencies.
# This will always run, but only generate recipe.json with list of dependencies without
# installing them.
#
FROM alpine:20210212 as cargo-deps-inspect
RUN apk add --update rust cargo
RUN cargo install cargo-chef
WORKDIR zenith
FROM zenithdb/build:buster AS cargo-deps-inspect
WORKDIR /zenith
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
RUN cargo chef prepare --recipe-path /zenith/recipe.json
#
# Build cargo dependencies.
# This temp container would be built only if recipe.json was changed.
# This temp container should be rebuilt only if recipe.json was changed.
#
FROM alpine:20210212 as deps-build
RUN apk add --update rust cargo openssl-dev clang build-base
# rust-rocksdb can be built against system-wide rocksdb -- that saves about
# 10 minutes during build. Rocksdb apk package is in testing now, but use it
# anyway. In case of any troubles we can download and build rocksdb here manually
# (to cache it as a docker layer).
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
FROM zenithdb/build:buster AS deps-build
WORKDIR /zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
COPY --from=cargo-deps-inspect /usr/local/cargo/bin/cargo-chef /usr/local/cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
#
# Build zenith binaries
#
FROM alpine:20210212 as build
RUN apk add --update rust cargo openssl-dev clang build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
FROM zenithdb/build:buster AS build
WORKDIR /zenith
COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /root/.cargo /root/.cargo
COPY --from=deps-build /usr/local/cargo/ /usr/local/cargo/
RUN cargo build --release
#
# Copy binaries to resulting image.
# build-base is here to provide libstdc++ (it will also bring gcc, but leave it this way until we figure
# out how to statically link rocksdb or avoid it altogether).
#
FROM alpine:3.13
RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
FROM debian:buster-slim
WORKDIR /data
RUN apt-get update && apt-get -yq install librocksdb-dev libseccomp-dev openssl && \
mkdir zenith_install
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY --from=pg-build /zenith/tmp_install postgres_install
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN addgroup zenith && adduser -h /data -D -G zenith zenith
# Remove build artifacts (~ 500 MB)
RUN rm -rf postgres_install/build && \
# 'Install' Postgres binaries locally
cp -r postgres_install/* /usr/local/ && \
# Prepare an archive of Postgres binaries (should be around 11 MB)
# and keep it inside the container for ease of the deploy pipeline.
cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \
rm -rf postgres_install
RUN useradd -m -d /data zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]

Dockerfile.alpine

@@ -0,0 +1,95 @@
#
# Docker image for console integration testing.
#
# We may also reuse it in CI to unify the installation process and as a general
# binary-building tool for production servers.
#
# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls
# bindgen with the "dynamic" feature flag. This also prevents usage of the dockerhub alpine-rust
# images, which are statically linked and have guards against any dlopen. I would
# prefer all-static binaries, so we may change the way librocksdb-sys builds or wait until
# we have our own storage and drop the rocksdb dependency.
#
# Cargo-chef is used to separate dependency building from main binary building. This
# way `docker build` will download and install dependencies only if there are changes to
# our Cargo.toml files.
#
#
# build postgres separately -- this layer will be rebuilt only if one of
# the mentioned paths changes
#
FROM alpine:3.13 as pg-build
RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev
WORKDIR zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
# Build using clang and lld
RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4
#
# Calculate cargo dependencies.
# This will always run, but only generate recipe.json with list of dependencies without
# installing them.
#
FROM alpine:20210212 as cargo-deps-inspect
RUN apk add --update rust cargo
RUN cargo install cargo-chef
WORKDIR zenith
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
#
# Build cargo dependencies.
# This temp container would be built only if recipe.json was changed.
#
FROM alpine:20210212 as deps-build
RUN apk add --update rust cargo openssl-dev clang build-base
# rust-rocksdb can be built against system-wide rocksdb -- that saves about
# 10 minutes during build. Rocksdb apk package is in testing now, but use it
# anyway. In case of any troubles we can download and build rocksdb here manually
# (to cache it as a docker layer).
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
#
# Build zenith binaries
#
FROM alpine:20210212 as build
RUN apk add --update rust cargo openssl-dev clang build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /root/.cargo /root/.cargo
RUN cargo build --release
#
# Copy binaries to resulting image.
# build-base is here to provide libstdc++ (it will also bring gcc, but leave it this way until we figure
# out how to statically link rocksdb or avoid it altogether).
#
FROM alpine:3.13
RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN addgroup zenith && adduser -h /data -D -G zenith zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["pageserver"]

Dockerfile.build

@@ -0,0 +1,15 @@
#
# Image with all the required dependencies to build https://github.com/zenithdb/zenith
# and Postgres from https://github.com/zenithdb/postgres
# Also includes some rust development and build tools.
#
FROM rust:slim-buster
WORKDIR /zenith
# Install postgres and zenith build dependencies
# clang is for rocksdb
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libseccomp-dev pkg-config libssl-dev librocksdb-dev clang
# Install rust tools
RUN rustup component add clippy && cargo install cargo-chef cargo-audit


@@ -12,15 +12,12 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec
libssl-dev clang
```
[Rust] 1.48 or later is also required.
[Rust] 1.52 or later is also required.
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
To run the integration tests (not required to use the code), install
Python (3.6 or higher), and install python3 packages with `pip` (called `pip3` on some systems):
```
pip install pytest psycopg2
```
Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
2. Build zenith and patched postgres
```sh
@@ -106,10 +103,9 @@ pytest
## Documentation
Now we use README files to cover design ideas and overall architecture for each module.
And rustdoc style documentation comments.
Now we use README files to cover design ideas and overall architecture for each module, and `rustdoc`-style documentation comments. See also [/docs/](/docs/) for a top-level overview of all available markdown documentation.
To view your documentation in a browser, try running `cargo doc --no-deps --open`
To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
## Source tree layout


@@ -7,7 +7,7 @@ use std::sync::Arc;
use std::time::Duration;
use std::{collections::BTreeMap, path::PathBuf};
use std::{
fs::{self, OpenOptions},
fs::{self, File, OpenOptions},
io::Read,
};
@@ -85,48 +85,36 @@ impl ComputeControlPlane {
}
}
/// Connect to a page server, get base backup, and untar it to initialize a
/// new data directory
pub fn new_from_page_server(
&mut self,
is_test: bool,
timelineid: ZTimelineId,
name: &str,
tenantid: ZTenantId,
) -> Result<Arc<PostgresNode>> {
let node = Arc::new(PostgresNode {
name: name.to_owned(),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test,
timelineid,
tenantid,
});
node.init_from_page_server(self.env.auth_type)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
Ok(node)
}
pub fn new_node(
&mut self,
tenantid: ZTenantId,
branch_name: &str,
config_only: bool,
) -> Result<Arc<PostgresNode>> {
let timeline_id = self
.pageserver
.branch_get_by_name(&tenantid, branch_name)?
.timeline_id;
let node = self.new_from_page_server(false, timeline_id, branch_name, tenantid)?;
let node = Arc::new(PostgresNode {
name: branch_name.to_owned(),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test: false,
timelineid: timeline_id,
tenantid,
});
node.init_from_page_server(self.env.auth_type, config_only)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
// Configure the node to stream WAL directly to the pageserver
node.append_conf(
"postgresql.conf",
format!(
concat!(
"shared_preload_libraries = zenith\n",
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
),
@@ -246,39 +234,15 @@ impl PostgresNode {
})
}
// Connect to a page server, get base backup, and untar it to initialize a
// new data directory
pub fn init_from_page_server(&self, auth_type: AuthType) -> Result<()> {
pub fn do_basebackup(&self) -> Result<()> {
let pgdata = self.pgdata();
println!(
"Extracting base backup to create postgres instance: path={} port={}",
pgdata.display(),
self.address.port()
);
// initialize data directory
if self.is_test {
fs::remove_dir_all(&pgdata).ok();
}
let sql = format!("basebackup {} {}", self.tenantid, self.timelineid);
let mut client = self
.pageserver
.page_server_psql_client()
.with_context(|| "connecting to page server failed")?;
fs::create_dir_all(&pgdata)
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|| {
format!(
"could not set permissions in data directory {}",
pgdata.display()
)
},
)?;
let mut copyreader = client
.copy_out(sql.as_str())
.with_context(|| "page server 'basebackup' command failed")?;
@@ -294,6 +258,45 @@ impl PostgresNode {
ar.unpack(&pgdata)
.with_context(|| "extracting page backup failed")?;
Ok(())
}
// Connect to a page server, get base backup, and untar it to initialize a
// new data directory
pub fn init_from_page_server(&self, auth_type: AuthType, config_only: bool) -> Result<()> {
let pgdata = self.pgdata();
println!(
"Extracting base backup to create postgres instance: path={} port={}",
pgdata.display(),
self.address.port()
);
// initialize data directory
if self.is_test {
fs::remove_dir_all(&pgdata).ok();
}
fs::create_dir_all(&pgdata)
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|| {
format!(
"could not set permissions in data directory {}",
pgdata.display()
)
},
)?;
if config_only {
// Just create an empty config file
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
} else {
self.do_basebackup()?;
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
}
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
self.append_conf(
@@ -321,8 +324,6 @@ impl PostgresNode {
// page server yet. (gh issue #349)
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;
// Connect it to the page server.
// set up authentication
let password = if let AuthType::ZenithJWT = auth_type {
"$ZENITH_AUTH_TOKEN"
@@ -348,8 +349,6 @@ impl PostgresNode {
.as_str(),
)?;
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
Ok(())
}
@@ -410,6 +409,46 @@ impl PostgresNode {
}
pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
// Bail if the node is already running.
if self.status() == "running" {
anyhow::bail!("The node is already running");
}
// 1. We always start the compute node from scratch, so
// if an old dir exists, preserve the config files and drop the directory
// XXX Now we only use 'postgresql.conf'.
// If we ever need 'pg_hba.conf', support it here too
let postgresql_conf_path = self.pgdata().join("postgresql.conf");
let postgresql_conf = fs::read(postgresql_conf_path.clone()).with_context(|| {
format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)
})?;
println!(
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
fs::remove_dir_all(&self.pgdata())?;
// 2. Create new node
self.init_from_page_server(self.env.auth_type, false)?;
// 3. Bring back config files
if let Ok(mut file) = OpenOptions::new()
.append(false)
.write(true)
.open(&postgresql_conf_path)
{
file.write_all(&postgresql_conf)?;
file.sync_all()?;
}
// 4. Finally start the compute node postgres
println!("Starting postgres node at '{}'", self.connstr());
self.pg_ctl(&["start"], auth_token)
}


@@ -42,6 +42,9 @@ pub struct LocalEnv {
#[serde(with = "hex")]
pub tenantid: ZTenantId,
// Repository format, 'rocksdb' or 'layered' or None for default
pub repository_format: Option<String>,
// jwt auth token used for communication with pageserver
pub auth_token: String,
@@ -101,6 +104,7 @@ pub fn init(
remote_pageserver: Option<&str>,
tenantid: ZTenantId,
auth_type: AuthType,
repository_format: Option<&str>,
) -> Result<()> {
// check if config already exists
let base_path = base_path();
@@ -176,6 +180,7 @@ pub fn init(
base_data_dir: base_path,
remotes: BTreeMap::default(),
tenantid,
repository_format: repository_format.map(|x| x.into()),
auth_token,
auth_type,
private_key_path,
@@ -194,6 +199,7 @@ pub fn init(
base_data_dir: base_path,
remotes: BTreeMap::default(),
tenantid,
repository_format: repository_format.map(|x| x.into()),
auth_token,
auth_type,
private_key_path,


@@ -50,7 +50,12 @@ impl PageServerNode {
.unwrap()
}
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> Result<()> {
pub fn init(
&self,
create_tenant: Option<&str>,
enable_auth: bool,
repository_format: Option<&str>,
) -> Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let mut args = vec![
"--init",
@@ -65,6 +70,10 @@ impl PageServerNode {
args.extend(&["--auth-type", "ZenithJWT"]);
}
if let Some(repo_format) = repository_format {
args.extend(&["--repository-format", repo_format]);
}
create_tenant.map(|tenantid| args.extend(&["--create-tenant", tenantid]));
let status = cmd
.args(args)


@@ -1,6 +1,6 @@
#!/bin/sh
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/timelines" ]; then
if [ ! -d "/data/tenants" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data --postgres-distrib /usr/local
fi

docs/README.md

@@ -0,0 +1,11 @@
# Zenith documentation
## Table of contents
- [authentication.md](authentication.md) — pageserver JWT authentication.
- [docker.md](docker.md) — Docker images and building pipeline.
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [pageserver/README](/pageserver/README) — pageserver overview.
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.

docs/docker.md

@@ -0,0 +1,38 @@
# Docker images of Zenith
## Images
Currently we build two main images:
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build).
- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.
## Building pipeline
1. Image `zenithdb/compute-tools` is re-built automatically.
2. Image `zenithdb/build` is built manually. If you want to introduce any new compile-time dependencies to Zenith or the compute node, you have to update this image as well, build it, and push it to Docker Hub.
Build:
```sh
docker build -t zenithdb/build:buster -f Dockerfile.build .
```
Login:
```sh
docker login
```
Push to Docker Hub:
```sh
docker push zenithdb/build:buster
```
3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
4. Image `zenithdb/zenith` is built in this repo after a successful `release` test run and is pushed to Docker Hub automatically.


@@ -7,6 +7,7 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
bookfile = "^0.3"
chrono = "0.4.19"
rand = "0.8.3"
regex = "1.4.5"


@@ -179,7 +179,9 @@ impl<'a> Basebackup<'a> {
// Extract twophase state files
//
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
if let Ok(img) = self.timeline.get_page_at_lsn_nowait(RelishTag::TwoPhase { xid }, 0, self.lsn)
if let Ok(img) =
self.timeline
.get_page_at_lsn_nowait(RelishTag::TwoPhase { xid }, 0, self.lsn)
{
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);


@@ -20,14 +20,14 @@ use anyhow::{ensure, Result};
use clap::{App, Arg, ArgMatches};
use daemonize::Daemonize;
use pageserver::{branches, logger, page_cache, page_service, PageServerConf};
use pageserver::{branches, logger, page_cache, page_service, PageServerConf, RepositoryFormat};
use zenith_utils::http_endpoint;
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
const DEFAULT_HTTP_ENDPOINT_ADDR: &str = "127.0.0.1:9898";
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
const DEFAULT_SUPERUSER: &str = "zenith_admin";
@@ -41,6 +41,7 @@ struct CfgFileParams {
pg_distrib_dir: Option<String>,
auth_validation_public_key_path: Option<String>,
auth_type: Option<String>,
repository_format: Option<String>,
}
impl CfgFileParams {
@@ -58,6 +59,7 @@ impl CfgFileParams {
pg_distrib_dir: get_arg("postgres-distrib"),
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
auth_type: get_arg("auth-type"),
repository_format: get_arg("repository-format"),
}
}
@@ -74,6 +76,7 @@ impl CfgFileParams {
.auth_validation_public_key_path
.or(other.auth_validation_public_key_path),
auth_type: self.auth_type.or(other.auth_type),
repository_format: self.repository_format.or(other.repository_format),
}
}
@@ -133,6 +136,16 @@ impl CfgFileParams {
);
}
let repository_format = match self.repository_format.as_ref() {
Some(repo_format_str) if repo_format_str == "rocksdb" => RepositoryFormat::RocksDb,
Some(repo_format_str) if repo_format_str == "layered" => RepositoryFormat::Layered,
Some(repo_format_str) => anyhow::bail!(
"invalid --repository-format '{}', must be 'rocksdb' or 'layered'",
repo_format_str
),
None => RepositoryFormat::Layered, // default
};
Ok(PageServerConf {
daemonize: false,
@@ -148,8 +161,9 @@ impl CfgFileParams {
pg_distrib_dir,
auth_validation_public_key_path,
auth_type,
repository_format,
})
}
}
@@ -221,6 +235,12 @@ fn main() -> Result<()> {
.takes_value(true)
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
)
.arg(
Arg::with_name("repository-format")
.long("repository-format")
.takes_value(true)
.help("Which repository implementation to use, 'rocksdb' or 'layered'"),
)
.get_matches();
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));


@@ -24,7 +24,7 @@ use crate::object_repository::ObjectRepository;
use crate::page_cache;
use crate::restore_local_repo;
use crate::walredo::WalRedoManager;
use crate::{repository::Repository, PageServerConf};
use crate::{repository::Repository, PageServerConf, RepositoryFormat};
#[derive(Serialize, Deserialize, Clone)]
pub struct BranchInfo {
@@ -65,8 +65,8 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
pub fn create_repo(
conf: &'static PageServerConf,
tenantid: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager>,
) -> Result<ObjectRepository> {
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Arc<dyn Repository>> {
let repo_dir = conf.tenant_path(&tenantid);
if repo_dir.exists() {
bail!("repo for {} already exists", tenantid)
@@ -96,19 +96,27 @@ pub fn create_repo(
// and we failed to run initdb again in the same directory. This has been solved for the
// rapid init+start case now, but the general race condition remains if you restart the
// server quickly.
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
let repo: Arc<dyn Repository + Sync + Send> =
match conf.repository_format {
RepositoryFormat::Layered => Arc::new(
crate::layered_repository::LayeredRepository::new(conf, wal_redo_manager, tenantid),
),
RepositoryFormat::RocksDb => {
let obj_store = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
let repo = crate::object_repository::ObjectRepository::new(
conf,
std::sync::Arc::new(storage),
wal_redo_manager,
tenantid,
);
Arc::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
wal_redo_manager,
tenantid,
))
}
};
// Load data into pageserver
// TODO To implement zenith import we need to
// move data loading out of create_repo()
bootstrap_timeline(conf, tenantid, tli, &repo)?;
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
Ok(repo)
}

File diff suppressed because it is too large.


@@ -0,0 +1,298 @@
# Overview
The on-disk format is based on immutable files. The page server
receives a stream of incoming WAL, parses the WAL records to determine
which pages they apply to, and accumulates the incoming changes in
memory. Every now and then, the accumulated changes are written out to
new files.
The files are called "snapshot files". Each snapshot file corresponds
to one 10 MB slice of a PostgreSQL relation fork. The snapshot files
for each timeline are stored in the timeline's subdirectory under
.zenith/tenants/<tenantid>/timelines.
The files are named like this:
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
For example:
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
Some non-relation files are also stored in the repository. For example,
a CLOG segment would be named like this:
pg_xact_0000_0_00000000198B06B0_00000000198C2550
There is no difference in how the relation and non-relation files are
managed, except that the first part of file names is different.
Internally, the relations and non-relation files that are managed in
the versioned store are together called "relishes".
Each snapshot file contains a full snapshot, that is, a full copy of all
pages in the relation, as of the "start LSN". It also contains all WAL
records applicable to the relation between the start and end
LSNs. With this information, the page server can reconstruct any page
version of the relation in the LSN range.
If a file has been dropped, the last snapshot file for it is created
with the _DROPPED suffix, e.g.
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
In addition to the relations, with "rel_*" prefix, we use the same
format for storing various smaller files from the PostgreSQL data
directory. They use different prefixes, and the naming scheme
up to the LSN range varies. The Zenith source code uses the term
"relish" to mean "a relation, or other file that's treated like a
relation in the storage".
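To make the naming scheme concrete, here is a minimal Rust sketch (not part of the page server source; the struct and function names are hypothetical) that splits a relation snapshot file name into its components:

```rust
// Illustrative sketch only: the components of a relation snapshot file name.
struct SnapshotFileName {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
    segno: u32,
    start_lsn: u64, // LSNs are zero-padded hexadecimal in the file name
    end_lsn: u64,
    dropped: bool,
}

fn parse_rel_snapshot_name(name: &str) -> Option<SnapshotFileName> {
    // e.g. "rel_1663_13990_2609_0_10_000000000169C348_0000000001702000"
    let rest = name.strip_prefix("rel_")?;
    let dropped = rest.ends_with("_DROPPED");
    let rest = rest.strip_suffix("_DROPPED").unwrap_or(rest);
    let parts: Vec<&str> = rest.split('_').collect();
    if parts.len() != 7 {
        return None;
    }
    Some(SnapshotFileName {
        spcnode: parts[0].parse().ok()?,
        dbnode: parts[1].parse().ok()?,
        relnode: parts[2].parse().ok()?,
        forknum: parts[3].parse().ok()?,
        segno: parts[4].parse().ok()?,
        start_lsn: u64::from_str_radix(parts[5], 16).ok()?,
        end_lsn: u64::from_str_radix(parts[6], 16).ok()?,
        dropped,
    })
}
```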
## Notation used in this document
The full path of a snapshot file looks like this:
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
For simplicity, the examples below use a simplified notation for the
paths. The tenant ID is left out, the timeline ID is replaced with
the human-readable branch name, and spcnode+dbnode+relnode+forkum+segno
with a human-readable table name. The LSNs are also shorter. For
example, a snapshot file for 'orders' table on 'main' branch, with LSN
range 100-200 would be:
main/orders_100_200
# Creating snapshot files
Let's start with a simple example with a system that contains one
branch called 'main' and two tables, 'orders' and 'customers'. The end
of WAL is currently at LSN 250. In this starting situation, you would
have two files on disk:
main/orders_100_200
main/customers_100_200
In addition to those files, the recent changes between LSN 200 and the
end of WAL at 250 are kept in memory. If the page server crashes, the
latest records between 200-250 need to be re-read from the WAL.
Whenever enough WAL has been accumulated in memory, the page server
writes out the changes in memory into new snapshot files. This process
is called "checkpointing" (not to be confused with the PostgreSQL
checkpoints, that's a different thing). The page server only creates
snapshot files for relations that have been modified since the last
checkpoint. For example, if the current end of WAL is at LSN 450, and
the last checkpoint happened at LSN 400 but there haven't been any
recent changes to the 'customers' table, you would have these files on
disk:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
If the customers table is modified later, a new file is created for it
at the next checkpoint. The new file will cover the "gap" from the
last snapshot file, so the LSN ranges are always contiguous:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
main/customers_200_500
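As a rough sketch of the rule above (assumed helper names, not the actual checkpoint code): every relation with in-memory changes gets a new snapshot file that starts where its previous file ended, which is why the LSN ranges stay contiguous.

```rust
use std::collections::HashMap;

// Sketch only: which snapshot files a checkpoint at `checkpoint_lsn` would create.
// `last_snapshot_end` maps a relation name to the end LSN of its latest snapshot
// file; relations without in-memory changes are simply skipped.
fn files_to_write(
    modified_since_last_checkpoint: &[&str],
    last_snapshot_end: &HashMap<String, u64>,
    checkpoint_lsn: u64,
) -> Vec<String> {
    modified_since_last_checkpoint
        .iter()
        .map(|rel| {
            // The new file covers the gap from the previous snapshot to the checkpoint LSN.
            let start = last_snapshot_end.get(*rel).copied().unwrap_or(0);
            format!("main/{}_{}_{}", rel, start, checkpoint_lsn)
        })
        .collect()
}
```

In the example above, a checkpoint at LSN 500 with only 'customers' modified would produce `main/customers_200_500`, matching the listing.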
## Reading page versions
Whenever a GetPage@LSN request comes in from the compute node, the
page server needs to reconstruct the requested page, as it was at the
requested LSN. To do that, the page server first checks the recent
in-memory layer; if the requested page version is found there, it can
be returned immediately without looking at the files on
disk. Otherwise the page server needs to locate the snapshot file that
contains the requested page version.
For example, if a request comes in for table 'orders' at LSN 250, the
page server would load the 'main/orders_200_300' file into memory, and
reconstruct and return the requested page from it, as it was at
LSN 250. Because the snapshot file consists of a full image of the
relation at the start LSN and the WAL, reconstructing the page
involves replaying any WAL records applicable to the page between LSNs
200-250, starting from the base image at LSN 200.
A request at a file boundary can be satisfied using either file. For
example, if there are two files on disk:
main/orders_100_200
main/orders_200_300
And a request comes with LSN 200, either file can be used for it. It
is better to use the later file, however, because it contains an
already materialized version of all the pages at LSN 200. Using the
first file, you would need to apply any WAL records between 100 and
200 to reconstruct the requested page.
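The same lookup rule as a small sketch, using hypothetical types (the real page server tracks layers in a layer map rather than scanning file names):

```rust
// One snapshot file for a single relation, identified by its LSN range.
#[derive(Clone, Copy)]
struct SnapshotRange {
    start_lsn: u64,
    end_lsn: u64,
}

// Pick the file to serve a request at `lsn`. At a boundary LSN the later file
// wins, because it already holds materialized page images at its start LSN.
fn pick_snapshot(files: &[SnapshotRange], lsn: u64) -> Option<SnapshotRange> {
    files
        .iter()
        .copied()
        .filter(|f| f.start_lsn <= lsn && lsn <= f.end_lsn)
        .max_by_key(|f| f.start_lsn)
}
```

With `orders_100_200` and `orders_200_300` on disk, a request at LSN 200 picks the latter, so no WAL replay is needed.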
# Multiple branches
Imagine that a child branch is created at LSN 250:
          @250
----main--+-------------------------->
           \
            +---child-------------->
Then, the 'orders' table is updated differently on the 'main' and
'child' branches. You now have this situation on disk:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
child/orders_250_300
child/orders_300_400
Because the 'customers' table hasn't been modified on the child
branch, there is no file for it there. If you request a page for it on
the 'child' branch, the page server will not find any snapshot file
for it in the 'child' directory, so it will recurse to look into the
parent 'main' branch instead.
From the 'child' branch's point of view, the history for each relation
is linear, and the request's LSN identifies unambiguously which file
you need to look at. For example, the history for the 'orders' table
on the 'main' branch consists of these files:
main/orders_100_200
main/orders_200_300
main/orders_300_400
And from the 'child' branch's point of view, it consists of these
files:
main/orders_100_200
main/orders_200_300
child/orders_250_300
child/orders_300_400
The branch metadata includes the point where the child branch was
created, LSN 250. If a page request comes with LSN 275, we read the
page version from the 'child/orders_250_300' file. If the request LSN
is 225, we read it from the 'main/orders_200_300' file instead. The
page versions between 250-300 in the 'main/orders_200_300' file are
ignored when operating on the child branch.
Note: It doesn't make any difference if the child branch is created
when the end of the main branch was at LSN 250, or later when the tip of
the main branch had already moved on. The latter case, creating a
branch at a historic LSN, is how we support PITR in Zenith.
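A sketch of that resolution order, again with hypothetical types: look for a snapshot file on the requested branch first; if there is none, move to the parent branch, never looking past the fork point.

```rust
// Hypothetical branch description: the timeline directory to search, the LSN at
// which the branch was forked off its parent (None for the root branch), and the parent.
struct Branch<'a> {
    timeline_dir: &'a str,
    fork_lsn: Option<u64>,
    parent: Option<&'a Branch<'a>>,
}

// Resolve which timeline directory (and effective LSN) to read `rel` from.
// `has_file` stands in for "does this timeline have a snapshot file covering this LSN?".
fn resolve<'a>(
    branch: &'a Branch<'a>,
    rel: &str,
    mut lsn: u64,
    has_file: &dyn Fn(&str, &str, u64) -> bool,
) -> Option<(&'a str, u64)> {
    let mut cur = Some(branch);
    while let Some(b) = cur {
        if has_file(b.timeline_dir, rel, lsn) {
            return Some((b.timeline_dir, lsn));
        }
        // Not found here: continue on the parent, ignoring anything past the fork point.
        if let Some(fork_lsn) = b.fork_lsn {
            lsn = lsn.min(fork_lsn);
        }
        cur = b.parent;
    }
    None
}
```

For 'orders' at LSN 275 on 'child' this stays on the child timeline; for LSN 225 it falls through to 'main', and the child never sees 'main' page versions newer than the fork point.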
# Garbage collection
In this scheme, we keep creating new snapshot files over time. We also
need a mechanism to remove old files that are no longer needed,
because disk space isn't infinite.
What files are still needed? Currently, the page server supports PITR
and branching from any branch at any LSN that is "recent enough" from
the tip of the branch. "Recent enough" is defined as an LSN horizon,
which by default is 64 MB. (See DEFAULT_GC_HORIZON). For this
example, let's assume that the LSN horizon is 150 units.
Let's look at the single branch scenario again. Imagine that the end
of the branch is LSN 525, so that the GC horizon is currently at
525-150 = 375
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/orders_400_500
main/customers_100_200
We can remove files 'main/orders_100_200' and 'main/orders_200_300',
because the end LSNs of those files are older than GC horizon 375, and
there are more recent snapshot files for the table. 'main/orders_300_400'
and 'main/orders_400_500' are still within the horizon, so they must be
retained. 'main/customers_100_200' is old enough, but it cannot be
removed because there is no newer snapshot file for the table.
Things get slightly more complicated with multiple branches. All of
the above still holds, but in addition to recent files we must also
retain older snapshot files that are still needed by child branches.
For example, if child branch is created at LSN 150, and the 'customers'
table is updated on the branch, you would have these files:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
In this situation, the 'main/orders_100_200' file cannot be removed,
even though it is older than the GC horizon, because it is still
needed by the child branch. 'main/orders_200_300' can still be
removed. So after garbage collection, these files would remain:
main/orders_100_200
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
If 'orders' is modified later on the 'child' branch, we will create a
snapshot file for it on the child:
main/orders_100_200
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
child/orders_150_400
After this, the 'main/orders_100_200' file can be removed. It is no
longer needed by the child branch, because there is a newer snapshot
file there. TODO: This optimization hasn't been implemented! The GC
algorithm will currently keep the file on the 'main' branch anyway, for
as long as the child branch exists.
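The retention rule can be summarized as a predicate (a sketch with assumed inputs, not the actual GC code): a snapshot file may be removed only if it is entirely older than the GC horizon, a newer snapshot file for the same relation exists on the same branch, and no child branch still depends on it.

```rust
// Sketch of the GC retention rule with assumed inputs.
fn can_remove(
    file_end_lsn: u64,
    gc_horizon_lsn: u64,
    newer_file_exists_on_branch: bool,
    needed_by_child_branch: bool,
) -> bool {
    file_end_lsn < gc_horizon_lsn
        && newer_file_exists_on_branch
        && !needed_by_child_branch
}
```

In the example above, `main/orders_200_300` passes all three checks (end LSN 300 is below the horizon at 375, `main/orders_300_400` exists, and the child forked at 150 does not need it), while `main/customers_100_200` fails the second check and `main/orders_100_200` fails the third.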
# TODO: On LSN ranges
In principle, each relation can be checkpointed separately, i.e. the
LSN ranges of the files don't need to line up. So this would be legal:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_150_250
main/customers_250_500
However, the code currently always checkpoints all relations together.
So that situation doesn't arise in practice.
It would also be OK to have overlapping LSN ranges for the same relation:
main/orders_100_200
main/orders_200_300
main/orders_250_350
main/orders_300_400
The code that reads the snapshot files should cope with this, but this
situation doesn't arise either, because the checkpointing code never
does that. It could be useful, however, as a transient state when
garbage collecting around branch points, or explicit recovery
points. For example, if we start with this:
main/orders_100_200
main/orders_200_300
main/orders_300_400
And there is a branch or explicit recovery point at LSN 150, we could
replace 'main/orders_100_200' with 'main/orders_150_150' to keep a
snapshot only at that exact point that's still needed, removing the
other page versions around it. But such compaction has not been
implemented yet.


@@ -0,0 +1,491 @@
//!
//! An in-memory layer stores recently received page versions in memory. The page versions
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
//!
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageVersion, SegmentTag, RELISH_SEG_SIZE,
};
use crate::layered_repository::LayeredTimeline;
use crate::layered_repository::SnapshotLayer;
use crate::repository::WALRecord;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use bytes::Bytes;
use log::*;
use std::collections::BTreeMap;
use std::ops::Bound::Included;
use std::sync::{Arc, Mutex};
use zenith_utils::lsn::Lsn;
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId,
seg: SegmentTag,
///
/// This layer contains all the changes from 'start_lsn'. The
/// start is inclusive. There is no end LSN; we only use an in-memory
/// layer at the end of a timeline.
///
start_lsn: Lsn,
/// The above fields never change. The parts that do change are in 'inner',
/// and protected by mutex.
inner: Mutex<InMemoryLayerInner>,
}
pub struct InMemoryLayerInner {
/// If this relation was dropped, remember when that happened.
drop_lsn: Option<Lsn>,
///
/// All versions of all pages in the layer are kept here.
/// Indexed by block number and LSN.
///
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
///
/// `segsizes` tracks the size of the segment at different points in time.
///
segsizes: BTreeMap<Lsn, u32>,
}
impl Layer for InMemoryLayer {
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
}
fn get_end_lsn(&self) -> Lsn {
let inner = self.inner.lock().unwrap();
if let Some(drop_lsn) = inner.drop_lsn {
drop_lsn
} else {
Lsn(u64::MAX)
}
}
fn is_dropped(&self) -> bool {
let inner = self.inner.lock().unwrap();
inner.drop_lsn.is_some()
}
/// Look up given page in the cache.
fn get_page_reconstruct_data(
&self,
blknum: u32,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<Option<Lsn>> {
// Scan the BTreeMap backwards, starting from the given lsn.
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
assert!(self.seg.blknum_in_seg(blknum));
{
let inner = self.inner.lock().unwrap();
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let mut iter = inner
.page_versions
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
if let Some(img) = &entry.page_image {
reconstruct_data.page_img = Some(img.clone());
need_base_image_lsn = None;
break;
} else if let Some(rec) = &entry.record {
reconstruct_data.records.push(rec.clone());
if rec.will_init {
// This WAL record initializes the page, so no need to go further back
need_base_image_lsn = None;
break;
} else {
need_base_image_lsn = Some(*entry_lsn);
}
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
}
// release lock on 'page_versions'
}
Ok(need_base_image_lsn)
}
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
// Scan the BTreeMap backwards, starting from the given entry.
let inner = self.inner.lock().unwrap();
let mut iter = inner.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
if let Some((_entry_lsn, entry)) = iter.next_back() {
let result = *entry;
drop(inner);
trace!("get_seg_size: {} at {} -> {}", self.seg, lsn, result);
Ok(result)
} else {
bail!("No size found for {} at {} in memory", self.seg, lsn);
}
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
let inner = self.inner.lock().unwrap();
// Is the requested LSN after the segment was dropped?
if let Some(drop_lsn) = inner.drop_lsn {
if lsn >= drop_lsn {
return Ok(false);
}
}
// Otherwise, it exists
Ok(true)
}
}
impl InMemoryLayer {
///
/// Create a new, empty, in-memory layer
///
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
seg,
timelineid,
start_lsn
);
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: BTreeMap::new(),
segsizes: BTreeMap::new(),
}),
})
}
// Write operations
/// Remember new page version, as a WAL record over previous version
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> Result<()> {
self.put_page_version(
blknum,
rec.lsn,
PageVersion {
page_image: None,
record: Some(rec),
},
)
}
/// Remember new page version, as a full page image
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> {
self.put_page_version(
blknum,
lsn,
PageVersion {
page_image: Some(img),
record: None,
},
)
}
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
assert!(self.seg.blknum_in_seg(blknum));
trace!(
"put_page_version blk {} of {} at {}/{}",
blknum,
self.seg.rel,
self.timelineid,
lsn
);
let mut inner = self.inner.lock().unwrap();
let old = inner.page_versions.insert((blknum, lsn), pv);
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!(
"Page version of rel {} blk {} at {} already exists",
self.seg.rel, blknum, lsn
);
}
// Also update the relation size, if this extended the relation.
if self.seg.rel.is_blocky() {
let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1;
let mut iter = inner.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
let oldsize;
if let Some((_entry_lsn, entry)) = iter.next_back() {
oldsize = *entry;
} else {
oldsize = 0;
//bail!("No old size found for {} at {}", self.tag, lsn);
}
if newsize > oldsize {
trace!(
"enlarging segment {} from {} to {} blocks at {}",
self.seg,
oldsize,
newsize,
lsn
);
inner.segsizes.insert(lsn, newsize);
}
}
Ok(())
}
/// Remember that the relation was truncated at given LSN
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
let old = inner.segsizes.insert(lsn, segsize);
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Inserting truncation, but had an entry for the LSN already");
}
Ok(())
}
/// Remember that the segment was dropped at given LSN
pub fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
assert!(inner.drop_lsn.is_none());
inner.drop_lsn = Some(lsn);
info!("dropped segment {} at {}", self.seg, lsn);
Ok(())
}
///
/// Initialize a new InMemoryLayer by copying the state at the given
/// point in time from a given existing layer.
///
pub fn copy_snapshot(
conf: &'static PageServerConf,
timeline: &LayeredTimeline,
src: &dyn Layer,
timelineid: ZTimelineId,
tenantid: ZTenantId,
lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
src.get_seg_tag(),
timelineid,
lsn
);
let mut page_versions = BTreeMap::new();
let mut segsizes = BTreeMap::new();
let seg = src.get_seg_tag();
let startblk;
let size;
if seg.rel.is_blocky() {
size = src.get_seg_size(lsn)?;
segsizes.insert(lsn, size);
startblk = seg.segno * RELISH_SEG_SIZE;
} else {
size = 1;
startblk = 0;
}
for blknum in startblk..(startblk + size) {
let img = timeline.materialize_page(seg, blknum, lsn, src)?;
let pv = PageVersion {
page_image: Some(img),
record: None,
};
page_versions.insert((blknum, lsn), pv);
}
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg: src.get_seg_tag(),
start_lsn: lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: page_versions,
segsizes: segsizes,
}),
})
}
///
/// Write this in-memory layer to disk, as a snapshot layer.
///
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
///
/// Returns new layers that replace this one. Always returns a
/// SnapshotLayer containing the page versions that were written to disk,
/// but if there were page versions newer than 'end_lsn', also return a new
/// in-memory layer containing those page versions. The caller replaces
/// this layer with the returned layers in the layer map.
///
pub fn freeze(
&self,
cutoff_lsn: Lsn,
// This is needed just to call materialize_page()
timeline: &LayeredTimeline,
) -> Result<(Option<Arc<SnapshotLayer>>, Option<Arc<InMemoryLayer>>)> {
info!(
"freezing in memory layer for {} on timeline {} at {}",
self.seg, self.timelineid, cutoff_lsn
);
let inner = self.inner.lock().unwrap();
// Normally, use the cutoff LSN as the end of the frozen layer.
// But if the relation was dropped, we know that there are no
// more changes coming in for it, and in particular we know that
// there are no changes "in flight" for the LSN anymore, so we use
// the drop LSN instead. The drop-LSN could be ahead of the
// caller-specified LSN!
let dropped = inner.drop_lsn.is_some();
let end_lsn = if dropped {
inner.drop_lsn.unwrap()
} else {
cutoff_lsn
};
// Divide all the page versions into old and new at the 'end_lsn' cutoff point.
let mut before_page_versions;
let mut before_segsizes;
let mut after_page_versions;
let mut after_segsizes;
if !dropped {
before_segsizes = BTreeMap::new();
after_segsizes = BTreeMap::new();
for (lsn, size) in inner.segsizes.iter() {
if *lsn > end_lsn {
after_segsizes.insert(*lsn, *size);
} else {
before_segsizes.insert(*lsn, *size);
}
}
before_page_versions = BTreeMap::new();
after_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn > end_lsn {
after_page_versions.insert((*blknum, *lsn), pv.clone());
} else {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
}
} else {
before_page_versions = inner.page_versions.clone();
before_segsizes = inner.segsizes.clone();
after_segsizes = BTreeMap::new();
after_page_versions = BTreeMap::new();
}
// we can release the lock now.
drop(inner);
// Write the page versions before the cutoff to disk.
let snapfile = SnapshotLayer::create(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
self.start_lsn,
end_lsn,
dropped,
before_page_versions,
before_segsizes,
)?;
// If there were any "new" page versions, initialize a new in-memory layer to hold
// them
let new_open = if !after_segsizes.is_empty() || !after_page_versions.is_empty() {
info!("created new in-mem layer for {} {}-", self.seg, end_lsn);
let new_open = Self::copy_snapshot(
self.conf,
timeline,
&snapfile,
self.timelineid,
self.tenantid,
end_lsn,
)?;
let mut new_inner = new_open.inner.lock().unwrap();
new_inner.page_versions.append(&mut after_page_versions);
new_inner.segsizes.append(&mut after_segsizes);
drop(new_inner);
Some(Arc::new(new_open))
} else {
None
};
let new_historic = Some(Arc::new(snapfile));
Ok((new_historic, new_open))
}
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- inmemory layer for {} {}-> ----\n",
self.seg, self.start_lsn
);
let inner = self.inner.lock().unwrap();
for (k, v) in inner.segsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
for (k, v) in inner.page_versions.iter() {
result += &format!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
result
}
}


@@ -0,0 +1,281 @@
//!
//! The layer map tracks what layers exist for all the relations in a timeline.
//!
//! When the timeline is first accessed, the server lists all snapshot files
//! in the timelines/<timelineid> directory, and populates this map with
//! SnapshotLayers corresponding to each file. When new WAL is received,
//! we create InMemoryLayers to hold the incoming records. Now and then,
//! in the checkpoint() function, the in-memory layers are frozen, forming
//! new snapshot layers and corresponding files are written to disk.
//!
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
use crate::layered_repository::{InMemoryLayer, SnapshotLayer};
use crate::relish::*;
use anyhow::Result;
use log::*;
use std::collections::HashSet;
use std::collections::{BTreeMap, HashMap};
use std::ops::Bound::Included;
use std::sync::Arc;
use zenith_utils::lsn::Lsn;
///
/// LayerMap tracks what layers exist for a timeline. The last layer that is
/// open for writes is always an InMemoryLayer, and is tracked separately
/// because there can be only one for each segment. The older layers,
/// stored on disk, are kept in a BTreeMap keyed by the layer's start LSN.
///
pub struct LayerMap {
segs: HashMap<SegmentTag, SegEntry>,
}
struct SegEntry {
pub open: Option<Arc<InMemoryLayer>>,
pub historic: BTreeMap<Lsn, Arc<SnapshotLayer>>,
}
impl LayerMap {
///
/// Look up using the given segment tag and LSN. This differs from a plain
/// key-value lookup in that if there is any layer that covers the
/// given LSN, or precedes the given LSN, it is returned. In other words,
/// you don't need to know the exact start LSN of the layer.
///
pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
let segentry = self.segs.get(tag)?;
if let Some(open) = &segentry.open {
if open.get_start_lsn() <= lsn {
let x: Arc<dyn Layer> = Arc::clone(&open) as _;
return Some(x);
}
}
if let Some((_k, v)) = segentry
.historic
.range((Included(Lsn(0)), Included(lsn)))
.next_back()
{
let x: Arc<dyn Layer> = Arc::clone(&v) as _;
Some(x)
} else {
None
}
}
///
/// Get the open layer for given segment for writing. Or None if no open
/// layer exists.
///
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
let segentry = self.segs.get(tag)?;
if let Some(open) = &segentry.open {
Some(Arc::clone(open))
} else {
None
}
}
///
/// Insert an open in-memory layer
///
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
let tag = layer.get_seg_tag();
if let Some(segentry) = self.segs.get_mut(&tag) {
if let Some(_old) = &segentry.open {
// FIXME: shouldn't exist, but check
}
segentry.open = Some(layer);
} else {
let segentry = SegEntry {
open: Some(layer),
historic: BTreeMap::new(),
};
self.segs.insert(tag, segentry);
}
}
///
/// Insert an on-disk layer
///
pub fn insert_historic(&mut self, layer: Arc<SnapshotLayer>) {
let tag = layer.get_seg_tag();
let start_lsn = layer.get_start_lsn();
if let Some(segentry) = self.segs.get_mut(&tag) {
segentry.historic.insert(start_lsn, layer);
} else {
let mut historic = BTreeMap::new();
historic.insert(start_lsn, layer);
let segentry = SegEntry {
open: None,
historic,
};
self.segs.insert(tag, segentry);
}
}
///
/// Remove an on-disk layer from the map.
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer: &SnapshotLayer) {
let tag = layer.get_seg_tag();
let start_lsn = layer.get_start_lsn();
if let Some(segentry) = self.segs.get_mut(&tag) {
segentry.historic.remove(&start_lsn);
}
}
pub fn list_rels(&self, spcnode: u32, dbnode: u32) -> Result<HashSet<RelTag>> {
let mut rels: HashSet<RelTag> = HashSet::new();
for (seg, _entry) in self.segs.iter() {
if let RelishTag::Relation(reltag) = seg.rel {
// FIXME: skip if it was dropped before the requested LSN. But there is no
// LSN argument
if (spcnode == 0 || reltag.spcnode == spcnode)
&& (dbnode == 0 || reltag.dbnode == dbnode)
{
rels.insert(reltag);
}
}
}
Ok(rels)
}
pub fn list_nonrels(&self, _lsn: Lsn) -> Result<HashSet<RelishTag>> {
let mut rels: HashSet<RelishTag> = HashSet::new();
// Scan the timeline directory to get all rels in this timeline.
for (seg, _entry) in self.segs.iter() {
// FIXME: skip if it was dropped before the requested LSN.
if let RelishTag::Relation(_) = seg.rel {
} else {
rels.insert(seg.rel);
}
}
Ok(rels)
}
/// Is there a newer layer for given segment?
pub fn newer_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
if let Some(segentry) = self.segs.get(&seg) {
if let Some(_open) = &segentry.open {
return true;
}
for (newer_lsn, layer) in segentry
.historic
.range((Included(lsn), Included(Lsn(u64::MAX))))
{
if layer.get_end_lsn() > lsn {
trace!(
"found later layer for {}, {} {}-{}",
seg,
lsn,
newer_lsn,
layer.get_end_lsn()
);
return true;
} else {
trace!("found singleton layer for {}, {} {}", seg, lsn, newer_lsn);
continue;
}
}
}
trace!("no later layer found for {}, {}", seg, lsn);
false
}
pub fn iter_open_layers(&mut self) -> OpenLayerIter {
OpenLayerIter {
last: None,
segiter: self.segs.iter_mut(),
}
}
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
HistoricLayerIter {
segiter: self.segs.iter(),
iter: None,
}
}
}
impl Default for LayerMap {
fn default() -> Self {
LayerMap {
segs: HashMap::new(),
}
}
}
pub struct OpenLayerIter<'a> {
last: Option<&'a mut SegEntry>,
segiter: std::collections::hash_map::IterMut<'a, SegmentTag, SegEntry>,
}
impl<'a> OpenLayerIter<'a> {
pub fn replace(&mut self, replacement: Option<Arc<InMemoryLayer>>) {
let segentry = self.last.as_mut().unwrap();
segentry.open = replacement;
}
pub fn insert_historic(&mut self, new_layer: Arc<SnapshotLayer>) {
let start_lsn = new_layer.get_start_lsn();
let segentry = self.last.as_mut().unwrap();
segentry.historic.insert(start_lsn, new_layer);
}
}
impl<'a> Iterator for OpenLayerIter<'a> {
type Item = Arc<InMemoryLayer>;
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
while let Some((_seg, entry)) = self.segiter.next() {
if let Some(open) = &entry.open {
let op = Arc::clone(&open);
self.last = Some(entry);
return Some(op);
}
}
self.last = None;
None
}
}
pub struct HistoricLayerIter<'a> {
segiter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
iter: Option<std::collections::btree_map::Iter<'a, Lsn, Arc<SnapshotLayer>>>,
}
impl<'a> Iterator for HistoricLayerIter<'a> {
type Item = Arc<SnapshotLayer>;
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
loop {
if let Some(x) = &mut self.iter {
if let Some(x) = x.next() {
return Some(Arc::clone(&*x.1));
}
}
if let Some(seg) = self.segiter.next() {
self.iter = Some(seg.1.historic.iter());
continue;
} else {
return None;
}
}
}
}

View File

@@ -0,0 +1,547 @@
//!
//! A SnapshotLayer represents one snapshot file on disk. One file holds all page
//! version and size information of one relation, in a range of LSN.
//! The name "snapshot file" is a bit of a misnomer because a snapshot file doesn't
//! contain a snapshot at a specific LSN, but rather all the page versions in a range
//! of LSNs.
//!
//! Currently, a snapshot file contains full information needed to reconstruct any
//! page version in the LSN range, without consulting any other snapshot files. When
//! a new snapshot file is created for writing, the full contents of the relation are
//! materialized as they are at the beginning of the LSN range. That can be very expensive;
//! we should find a way to store differential files. But this keeps the read side
//! of things simple: you can find the correct snapshot file based on RelishTag and
//! timeline+LSN, and once you've located it, you have all the data you need in that
//! file.
//!
//! When a snapshot file needs to be accessed, we slurp the whole file into memory, into
//! the SnapshotLayer struct. See load() and unload() functions.
//!
//! On disk, the snapshot files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each snapshot file is named like this:
//!
//! rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
//!
//! For example:
//!
//! rel_1663_13990_2609_0_0_000000000169C348_000000000169C349
//!
//! Non-relation relishes use other prefixes instead of "rel_", such as "pg_xact_" or
//! "pg_control_"; see SnapshotFileName below. If a relish is dropped, we add a '_DROPPED'
//! suffix to the filename to indicate that. So the above example would become:
//!
//! rel_1663_13990_2609_0_0_000000000169C348_000000000169C349_DROPPED
//!
//! In that case, the end LSN indicates when the relish was dropped; the drop is not
//! recorded in the file contents in any way.
//!
//! A snapshot file is constructed using the 'bookfile' crate. Each file consists of two
//! parts: the page versions and the relation sizes. They are stored as separate chapters.
//!
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageVersion, SegmentTag,
};
use crate::relish::*;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use log::*;
use std::collections::BTreeMap;
use std::fmt;
use std::fs;
use std::fs::File;
use std::io::Write;
use std::ops::Bound::Included;
use std::path::PathBuf;
use std::sync::{Arc, Mutex, MutexGuard};
use bookfile::{Book, BookWriter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith snapshot file
static SNAPSHOT_FILE_MAGIC: u32 = 0x5A616E01;
static PAGE_VERSIONS_CHAPTER: u64 = 1;
static REL_SIZES_CHAPTER: u64 = 2;
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
struct SnapshotFileName {
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
}
impl SnapshotFileName {
fn from_str(fname: &str) -> Option<Self> {
// Split the filename into parts
//
// rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
//
// or if it was dropped:
//
// rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>_DROPPED
//
// Non-relation relishes use a different prefix (pg_xact_, pg_multixact_members_, etc.)
// but the same <segno>_<start LSN>_<end LSN> tail.
//
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
parts = rest.split('_');
rel = RelishTag::Relation(RelTag {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
relnode: parts.next()?.parse::<u32>().ok()?,
forknum: parts.next()?.parse::<u8>().ok()?,
});
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::Clog,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
parts = rest.split('_');
rel = RelishTag::FileNodeMap {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
parts = rest.split('_');
rel = RelishTag::TwoPhase {
xid: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
parts = rest.split('_');
rel = RelishTag::Checkpoint;
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
parts = rest.split('_');
rel = RelishTag::ControlFile;
} else {
return None;
}
let segno = parts.next()?.parse::<u32>().ok()?;
let seg = SegmentTag { rel, segno };
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
let mut dropped = false;
if let Some(suffix) = parts.next() {
if suffix == "DROPPED" {
dropped = true;
} else {
warn!("unrecognized filename in timeline dir: {}", fname);
return None;
}
}
if parts.next().is_some() {
warn!("unrecognized filename in timeline dir: {}", fname);
return None;
}
Some(SnapshotFileName {
seg,
start_lsn,
end_lsn,
dropped,
})
}
fn to_string(&self) -> String {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
),
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
} => format!("pg_xact_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno,
} => format!("pg_multixact_members_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno,
} => format!("pg_multixact_offsets_{:04X}", segno),
RelishTag::FileNodeMap { spcnode, dbnode } => {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
RelishTag::ControlFile => format!("pg_control"),
};
format!(
"{}_{}_{:016X}_{:016X}{}",
basename,
self.seg.segno,
u64::from(self.start_lsn),
u64::from(self.end_lsn),
if self.dropped { "_DROPPED" } else { "" }
)
}
}
impl fmt::Display for SnapshotFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.to_string())
}
}
///
/// SnapshotLayer is the in-memory data structure associated with an
/// on-disk snapshot file. We keep a SnapshotLayer in memory for each
/// file, in the LayerMap. If a layer is in "loaded" state, we have a
/// copy of the file in memory, in 'inner'. Otherwise the struct is
/// just a placeholder for a file that exists on disk, and it needs to
/// be loaded before using it in queries.
///
pub struct SnapshotLayer {
conf: &'static PageServerConf,
pub tenantid: ZTenantId,
pub timelineid: ZTimelineId,
pub seg: SegmentTag,
//
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
// start is inclusive, and end is exclusive.
pub start_lsn: Lsn,
pub end_lsn: Lsn,
dropped: bool,
inner: Mutex<SnapshotLayerInner>,
}
pub struct SnapshotLayerInner {
/// If false, the 'page_versions' and 'relsizes' have not been
/// loaded into memory yet.
loaded: bool,
/// All versions of all pages in the file are kept here.
/// Indexed by block number and LSN.
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
/// `relsizes` tracks the size of the relation at different points in time.
relsizes: BTreeMap<Lsn, u32>,
}
impl Layer for SnapshotLayer {
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
}
fn is_dropped(&self) -> bool {
return self.dropped;
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
}
fn get_end_lsn(&self) -> Lsn {
return self.end_lsn;
}
/// Look up given page in the cache.
fn get_page_reconstruct_data(
&self,
blknum: u32,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<Option<Lsn>> {
// Scan the BTreeMap backwards, starting from the given entry.
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
{
let inner = self.load()?;
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let mut iter = inner
.page_versions
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
if let Some(img) = &entry.page_image {
reconstruct_data.page_img = Some(img.clone());
need_base_image_lsn = None;
break;
} else if let Some(rec) = &entry.record {
reconstruct_data.records.push(rec.clone());
if rec.will_init {
// This WAL record initializes the page, so no need to go further back
need_base_image_lsn = None;
break;
} else {
need_base_image_lsn = Some(*entry_lsn);
}
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
}
// release lock on 'inner'
}
Ok(need_base_image_lsn)
}
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
// Scan the BTreeMap backwards, starting from the given entry.
let inner = self.load()?;
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
if let Some((_entry_lsn, entry)) = iter.next_back() {
let result = *entry;
drop(inner);
trace!("get_seg_size: {} at {} -> {}", self.seg, lsn, result);
Ok(result)
} else {
error!(
"No size found for {} at {} in snapshot layer {} {}-{}",
self.seg, lsn, self.seg, self.start_lsn, self.end_lsn
);
bail!(
"No size found for {} at {} in snapshot layer",
self.seg,
lsn
);
}
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
// Is the requested LSN after the rel was dropped?
if self.dropped && lsn >= self.end_lsn {
return Ok(false);
}
// Otherwise, it exists.
Ok(true)
}
}
impl SnapshotLayer {
fn path(&self) -> PathBuf {
Self::path_for(
self.conf,
self.timelineid,
self.tenantid,
&SnapshotFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
)
}
fn path_for(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
fname: &SnapshotFileName,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(fname.to_string())
}
/// Create a new snapshot file, using the given btreemaps containing the page versions and
/// relsizes.
///
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
/// expedient.
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
relsizes: BTreeMap<Lsn, u32>,
) -> Result<SnapshotLayer> {
let snapfile = SnapshotLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
end_lsn,
dropped,
inner: Mutex::new(SnapshotLayerInner {
loaded: true,
page_versions,
relsizes,
}),
};
let inner = snapfile.inner.lock().unwrap();
// Write the in-memory btreemaps into a file
let path = snapfile.path();
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let file = File::create(&path)?;
let book = BookWriter::new(file, SNAPSHOT_FILE_MAGIC)?;
// Write out page versions
let mut chapter = book.new_chapter(PAGE_VERSIONS_CHAPTER);
let buf = BTreeMap::ser(&inner.page_versions)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
// and relsizes to separate chapter
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
let buf = BTreeMap::ser(&inner.relsizes)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
book.close()?;
trace!("saved {}", &path.display());
drop(inner);
Ok(snapfile)
}
///
/// Load the contents of the file into memory
///
fn load(&self) -> Result<MutexGuard<SnapshotLayerInner>> {
// quick exit if already loaded
let mut inner = self.inner.lock().unwrap();
if inner.loaded {
return Ok(inner);
}
let path = Self::path_for(
self.conf,
self.timelineid,
self.tenantid,
&SnapshotFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
);
let file = File::open(&path)?;
let book = Book::new(file)?;
let chapter = book.read_chapter(PAGE_VERSIONS_CHAPTER)?;
let page_versions = BTreeMap::des(&chapter)?;
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
let relsizes = BTreeMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
*inner = SnapshotLayerInner {
loaded: true,
page_versions,
relsizes,
};
Ok(inner)
}
/// Create SnapshotLayers representing all files on disk
///
// TODO: returning an Iterator would be more idiomatic
pub fn list_snapshot_files(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> Result<Vec<Arc<SnapshotLayer>>> {
let path = conf.timeline_path(&timelineid, &tenantid);
let mut snapfiles: Vec<Arc<SnapshotLayer>> = Vec::new();
for direntry in fs::read_dir(path)? {
let fname = direntry?.file_name();
let fname = fname.to_str().unwrap();
if let Some(snapfilename) = SnapshotFileName::from_str(fname) {
let snapfile = SnapshotLayer {
conf,
timelineid,
tenantid,
seg: snapfilename.seg,
start_lsn: snapfilename.start_lsn,
end_lsn: snapfilename.end_lsn,
dropped: snapfilename.dropped,
inner: Mutex::new(SnapshotLayerInner {
loaded: false,
page_versions: BTreeMap::new(),
relsizes: BTreeMap::new(),
}),
};
snapfiles.push(Arc::new(snapfile));
}
}
return Ok(snapfiles);
}
pub fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
///
/// Release most of the memory used by this layer. If it's accessed again later,
/// it will need to be loaded back.
///
pub fn unload(&self) -> Result<()> {
let mut inner = self.inner.lock().unwrap();
inner.page_versions = BTreeMap::new();
inner.relsizes = BTreeMap::new();
inner.loaded = false;
Ok(())
}
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- snapshot layer for {} {}-{} ----\n",
self.seg, self.start_lsn, self.end_lsn
);
let inner = self.inner.lock().unwrap();
for (k, v) in inner.relsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
//for (k, v) in inner.page_versions.iter() {
// result += &format!("blk {} at {}: {}/{}\n", k.0, k.1, v.page_image.is_some(), v.record.is_some());
//}
result
}
}
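// A small illustrative test module, not part of the original commit: it exercises the
// SnapshotFileName parsing and formatting defined above, using the example filename from
// the module comment. It relies only on items declared in this file.
#[cfg(test)]
mod snapshot_file_name_examples {
    use super::*;

    #[test]
    fn filename_round_trip() {
        let name = "rel_1663_13990_2609_0_0_000000000169C348_000000000169C349";
        let parsed = SnapshotFileName::from_str(name).expect("filename should parse");
        assert_eq!(parsed.seg.segno, 0);
        assert_eq!(parsed.start_lsn, Lsn(0x0169C348));
        assert_eq!(parsed.end_lsn, Lsn(0x0169C349));
        assert!(!parsed.dropped);
        // Formatting the parsed name must reproduce the original string.
        assert_eq!(parsed.to_string(), name);

        // The '_DROPPED' suffix only sets the 'dropped' flag.
        let dropped = SnapshotFileName::from_str(&format!("{}_DROPPED", name)).unwrap();
        assert!(dropped.dropped);
    }
}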

View File

@@ -0,0 +1,128 @@
//!
//! Common traits and structs for layers
//!
use crate::relish::RelishTag;
use crate::repository::WALRecord;
use crate::ZTimelineId;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use zenith_utils::lsn::Lsn;
// Size of one segment in pages (10 MB)
pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
///
/// Each relish stored in the repository is divided into fixed-sized "segments",
/// with 10 MB of key-space, or 1280 8k pages each.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
pub struct SegmentTag {
pub rel: RelishTag,
pub segno: u32,
}
impl fmt::Display for SegmentTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}.{}", self.rel, self.segno)
}
}
impl SegmentTag {
pub const fn from_blknum(rel: RelishTag, blknum: u32) -> SegmentTag {
SegmentTag {
rel,
segno: blknum / RELISH_SEG_SIZE,
}
}
pub fn blknum_in_seg(&self, blknum: u32) -> bool {
blknum / RELISH_SEG_SIZE == self.segno
}
}
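// Illustrative sketch, not part of the original commit: how block numbers map to segments
// when each segment covers RELISH_SEG_SIZE = 1280 pages (10 MB).
#[cfg(test)]
mod segment_tag_examples {
    use super::*;

    #[test]
    fn blknum_to_segment_mapping() {
        // Blocks 0..=1279 belong to segment 0; block 1280 starts segment 1.
        let seg0 = SegmentTag::from_blknum(RelishTag::Checkpoint, 0);
        assert_eq!(seg0.segno, 0);
        assert!(seg0.blknum_in_seg(RELISH_SEG_SIZE - 1));
        assert!(!seg0.blknum_in_seg(RELISH_SEG_SIZE));

        let seg1 = SegmentTag::from_blknum(RelishTag::Checkpoint, RELISH_SEG_SIZE);
        assert_eq!(seg1.segno, 1);
    }
}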
///
/// Represents a version of a page at a specific LSN. The LSN is the key of the
/// entry in the 'page_versions' hash, it is not duplicated here.
///
/// A page version can be stored as a full page image, or as a WAL record that needs
/// to be applied over the previous page version to reconstruct this version.
///
/// It's also possible to have both a WAL record and a page image in the same
/// PageVersion. That happens if a page version is originally stored as a WAL record
/// but is later reconstructed by a GetPage@LSN request by performing WAL
/// redo. The get_page_at_lsn() code will store the reconstructed page image next to
/// the WAL record in that case. TODO: That's pretty accidental, not the result
/// of any grand design. If we want to keep reconstructed page versions around, we
/// probably should have a separate buffer cache so that we could control the
/// replacement policy globally. Or if we keep a reconstructed page image, we
/// could throw away the WAL record.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageVersion {
/// an 8kb page image
pub page_image: Option<Bytes>,
/// WAL record to get from previous page version to this one.
pub record: Option<WALRecord>,
}
///
/// Data needed to reconstruct a page version
///
/// 'page_img' is the old base image of the page to start the WAL replay with.
/// It can be None, if the first WAL record initializes the page (will_init)
/// 'records' contains the records to apply over the base image.
///
pub struct PageReconstructData {
pub records: Vec<WALRecord>,
pub page_img: Option<Bytes>,
}
///
/// A Layer holds all page versions for one segment of a relish, in a range of LSNs.
/// There are two kinds of layers, in-memory and snapshot layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
/// to the recent page versions. Snapshot layers are stored on disk, and
/// are immutable. This trait presents the common functionality of
/// in-memory and snapshot layers.
///
/// Each layer contains a full snapshot of the segment at the start
/// LSN. In addition to that, it contains WAL (or more page images)
/// needed to reconstruct any page version up to the end LSN.
///
pub trait Layer: Send + Sync {
// These functions identify the relish segment and the LSN range
// that this Layer holds.
fn get_timeline_id(&self) -> ZTimelineId;
fn get_seg_tag(&self) -> SegmentTag;
fn get_start_lsn(&self) -> Lsn;
fn get_end_lsn(&self) -> Lsn;
fn is_dropped(&self) -> bool;
///
/// Return data needed to reconstruct given page at LSN.
///
/// It is up to the caller to collect more data from previous layers and
/// perform WAL redo, if necessary.
///
/// If this returns Some, the returned data is not complete. The caller needs
/// to continue with the returned 'lsn'.
///
/// Note that the 'blknum' is the offset of the page from the beginning
/// of the *relish*, not the beginning of the segment. The requested
/// 'blknum' must be covered by this segment.
fn get_page_reconstruct_data(
&self,
blknum: u32,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<Option<Lsn>>;
// Functions that correspond to the Timeline trait functions.
fn get_seg_size(&self, lsn: Lsn) -> Result<u32>;
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
}
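// Hypothetical sketch, not part of the original commit: how a caller might interpret the
// result of a single get_page_reconstruct_data() call. Multi-layer traversal and the actual
// WAL redo are the repository's job and are deliberately omitted here.
#[allow(dead_code)]
fn describe_reconstruct_step(layer: &dyn Layer, blknum: u32, lsn: Lsn) -> Result<String> {
    let mut data = PageReconstructData {
        records: Vec::new(),
        page_img: None,
    };
    match layer.get_page_reconstruct_data(blknum, lsn, &mut data)? {
        // Some(cont_lsn): this layer alone is not enough; the caller must fetch an older
        // page version at 'cont_lsn' before replaying 'data.records' on top of it.
        Some(cont_lsn) => Ok(format!(
            "collected {} records, still need an older page version at {}",
            data.records.len(),
            cont_lsn
        )),
        // None: 'data' is complete; apply the records on top of 'data.page_img' (if any).
        None => Ok(format!(
            "complete: {} records, base image present: {}",
            data.records.len(),
            data.page_img.is_some()
        )),
    }
}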

View File

@@ -9,6 +9,7 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
pub mod basebackup;
pub mod branches;
pub mod layered_repository;
pub mod logger;
pub mod object_key;
pub mod object_repository;
@@ -54,6 +55,14 @@ pub struct PageServerConf {
pub auth_type: AuthType,
pub auth_validation_public_key_path: Option<PathBuf>,
pub repository_format: RepositoryFormat,
}
#[derive(Debug, Clone, PartialEq)]
pub enum RepositoryFormat {
Layered,
RocksDb,
}
impl PageServerConf {

View File

@@ -299,15 +299,13 @@ impl Timeline for ObjectTimeline {
// move this check out of the function.
//
match rel {
RelishTag::Slru { .. } |
RelishTag::TwoPhase{ .. } =>
{
RelishTag::Slru { .. } | RelishTag::TwoPhase { .. } => {
if !self.get_rel_exists(rel, req_lsn).unwrap_or(false) {
trace!("{:?} at {} doesn't exist", rel, req_lsn);
return Err(anyhow!("non-rel relish doesn't exist"));
}
},
_ => ()
}
_ => (),
};
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];

View File

@@ -2,11 +2,12 @@
//! page server.
use crate::branches;
use crate::layered_repository::LayeredRepository;
use crate::object_repository::ObjectRepository;
use crate::repository::Repository;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use anyhow::{anyhow, bail, Result};
use lazy_static::lazy_static;
use log::info;
@@ -27,16 +28,35 @@ pub fn init(conf: &'static PageServerConf) {
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
let tenantid =
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
// Set up an object repository, for actual data storage.
let repo =
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
let repo: Arc<dyn Repository + Sync + Send> = match conf.repository_format {
RepositoryFormat::Layered => {
let repo = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
));
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
repo
}
RepositoryFormat::RocksDb => {
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
Arc::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
Arc::new(walredo_mgr),
tenantid,
))
}
};
info!("initialized storage for tenant: {}", &tenantid);
m.insert(tenantid, Arc::new(repo));
m.insert(tenantid, repo);
}
}
@@ -53,7 +73,7 @@ pub fn create_repository_for_tenant(
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
m.insert(tenantid, Arc::new(repo));
m.insert(tenantid, repo);
Ok(())
}

View File

@@ -703,6 +703,18 @@ impl postgres_backend::Handler for PageServerHandler {
RowDescriptor::int8_col(b"control_deleted"),
RowDescriptor::int8_col(b"filenodemap_deleted"),
RowDescriptor::int8_col(b"dropped"),
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
@@ -715,6 +727,43 @@ impl postgres_backend::Handler for PageServerHandler {
Some(&result.control_deleted.to_string().as_bytes()),
Some(&result.filenodemap_deleted.to_string().as_bytes()),
Some(&result.dropped.to_string().as_bytes()),
Some(&result.snapshot_relfiles_total.to_string().as_bytes()),
Some(
&result
.snapshot_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(&result.snapshot_relfiles_not_updated.to_string().as_bytes()),
Some(&result.snapshot_relfiles_removed.to_string().as_bytes()),
Some(&result.snapshot_relfiles_dropped.to_string().as_bytes()),
Some(&result.snapshot_nonrelfiles_total.to_string().as_bytes()),
Some(
&result
.snapshot_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_nonrelfiles_not_updated
.to_string()
.as_bytes(),
),
Some(&result.snapshot_nonrelfiles_removed.to_string().as_bytes()),
Some(&result.snapshot_nonrelfiles_dropped.to_string().as_bytes()),
Some(&result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;

View File

@@ -119,8 +119,16 @@ impl RelishTag {
| RelishTag::TwoPhase { .. } => true,
// and these don't
| RelishTag::ControlFile
| RelishTag::Checkpoint => false,
RelishTag::ControlFile | RelishTag::Checkpoint => false,
}
}
// convenience function to check if this relish is a normal relation.
pub const fn is_relation(&self) -> bool {
if let RelishTag::Relation(_) = self {
true
} else {
false
}
}
}

View File

@@ -5,6 +5,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::iter::Iterator;
use std::ops::AddAssign;
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::Lsn;
@@ -56,6 +57,8 @@ pub trait Repository: Send + Sync {
///
#[derive(Default)]
pub struct GcResult {
// FIXME: These counters make sense for the ObjectRepository. They are not used
// by the LayeredRepository.
pub n_relations: u64,
pub inspected: u64,
pub truncated: u64,
@@ -66,9 +69,51 @@ pub struct GcResult {
pub control_deleted: u64, // RelishTag::ControlFile
pub filenodemap_deleted: u64, // RelishTag::FileNodeMap
pub dropped: u64,
// These are used for the LayeredRepository instead
pub snapshot_relfiles_total: u64,
pub snapshot_relfiles_needed_by_cutoff: u64,
pub snapshot_relfiles_needed_by_branches: u64,
pub snapshot_relfiles_not_updated: u64,
pub snapshot_relfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
pub snapshot_relfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
pub snapshot_nonrelfiles_total: u64,
pub snapshot_nonrelfiles_needed_by_cutoff: u64,
pub snapshot_nonrelfiles_needed_by_branches: u64,
pub snapshot_nonrelfiles_not_updated: u64,
pub snapshot_nonrelfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
pub snapshot_nonrelfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
pub elapsed: Duration,
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.n_relations += other.n_relations;
self.truncated += other.truncated;
self.deleted += other.deleted;
self.dropped += other.dropped;
self.snapshot_relfiles_total += other.snapshot_relfiles_total;
self.snapshot_relfiles_needed_by_cutoff += other.snapshot_relfiles_needed_by_cutoff;
self.snapshot_relfiles_needed_by_branches += other.snapshot_relfiles_needed_by_branches;
self.snapshot_relfiles_not_updated += other.snapshot_relfiles_not_updated;
self.snapshot_relfiles_removed += other.snapshot_relfiles_removed;
self.snapshot_relfiles_dropped += other.snapshot_relfiles_dropped;
self.snapshot_nonrelfiles_total += other.snapshot_nonrelfiles_total;
self.snapshot_nonrelfiles_needed_by_cutoff += other.snapshot_nonrelfiles_needed_by_cutoff;
self.snapshot_nonrelfiles_needed_by_branches +=
other.snapshot_nonrelfiles_needed_by_branches;
self.snapshot_nonrelfiles_not_updated += other.snapshot_nonrelfiles_not_updated;
self.snapshot_nonrelfiles_removed += other.snapshot_nonrelfiles_removed;
self.snapshot_nonrelfiles_dropped += other.snapshot_nonrelfiles_dropped;
self.elapsed += other.elapsed;
}
}
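// Illustrative test, not part of the original commit: per-timeline GC runs can be
// accumulated into one total with the AddAssign impl above. Only fields that the
// impl actually sums are checked here.
#[cfg(test)]
mod gc_result_examples {
    use super::*;

    #[test]
    fn accumulate_gc_results() {
        let mut one = GcResult::default();
        one.snapshot_relfiles_total = 3;
        one.snapshot_relfiles_removed = 1;
        one.elapsed = Duration::from_millis(5);

        let mut two = GcResult::default();
        two.snapshot_relfiles_total = 2;
        two.snapshot_relfiles_removed = 2;
        two.elapsed = Duration::from_millis(7);

        let mut total = GcResult::default();
        total += one;
        total += two;
        assert_eq!(total.snapshot_relfiles_total, 5);
        assert_eq!(total.snapshot_relfiles_removed, 3);
        assert_eq!(total.elapsed, Duration::from_millis(12));
    }
}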
pub trait Timeline: Send + Sync {
//------------------------------------------------------------------------------
// Public GET functions
@@ -239,11 +284,12 @@ impl WALRecord {
#[cfg(test)]
mod tests {
use super::*;
use crate::layered_repository::LayeredRepository;
use crate::object_repository::ObjectRepository;
use crate::object_repository::{ObjectValue, PageEntry, RelationSizeEntry};
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use postgres_ffi::pg_constants;
use std::fs;
use std::path::PathBuf;
@@ -277,10 +323,16 @@ mod tests {
buf.freeze()
}
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
fn get_test_repo(
test_name: &str,
repository_format: RepositoryFormat,
) -> Result<Box<dyn Repository>> {
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir).unwrap();
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;
let conf = PageServerConf {
daemonize: false,
@@ -293,6 +345,7 @@ mod tests {
pg_distrib_dir: "".into(),
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
repository_format,
};
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
@@ -300,24 +353,47 @@ mod tests {
let tenantid = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
let walredo_mgr = TestRedoManager {};
let repo =
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
let repo: Box<dyn Repository + Sync + Send> = match conf.repository_format {
RepositoryFormat::Layered => Box::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
)),
RepositoryFormat::RocksDb => {
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
Ok(Box::new(repo))
Box::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
Arc::new(walredo_mgr),
tenantid,
))
}
};
Ok(repo)
}
/// Test get_relsize() and truncation.
#[test]
fn test_relsize() -> Result<()> {
fn test_relsize_rocksdb() -> Result<()> {
let repo = get_test_repo("test_relsize_rocksdb", RepositoryFormat::RocksDb)?;
test_relsize(&*repo)
}
#[test]
fn test_relsize_layered() -> Result<()> {
let repo = get_test_repo("test_relsize_layered", RepositoryFormat::Layered)?;
test_relsize(&*repo)
}
fn test_relsize(repo: &dyn Repository) -> Result<()> {
// get_timeline() with non-existent timeline id should fail
//repo.get_timeline("11223344556677881122334455667788");
// Create timeline to work on
let repo = get_test_repo("test_relsize")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
@@ -402,14 +478,24 @@ mod tests {
/// This isn't very interesting with the RocksDb implementation, as we don't pay
/// any attention to Postgres segment boundaries there.
#[test]
fn test_large_rel() -> Result<()> {
let repo = get_test_repo("test_large_rel")?;
fn test_large_rel_rocksdb() -> Result<()> {
let repo = get_test_repo("test_large_rel_rocksdb", RepositoryFormat::RocksDb)?;
test_large_rel(&*repo)
}
#[test]
fn test_large_rel_layered() -> Result<()> {
let repo = get_test_repo("test_large_rel_layered", RepositoryFormat::Layered)?;
test_large_rel(&*repo)
}
fn test_large_rel(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
tline.init_valid_lsn(Lsn(1));
let mut lsn = 0;
let mut lsn = 1;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
lsn += 1;
@@ -440,6 +526,21 @@ mod tests {
pg_constants::RELSEG_SIZE - 1
);
// Truncate the relation to 3000 blocks, and then truncate it all the way down to 0, one block at a time
// This tests the behavior at segment boundaries
let mut size: i32 = 3000;
while size >= 0 {
lsn += 1;
tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
tline.advance_last_valid_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
size as u32
);
size -= 1;
}
Ok(())
}
@@ -455,15 +556,29 @@ mod tests {
}))
}
#[test]
fn test_branch_rocksdb() -> Result<()> {
let repo = get_test_repo("test_branch_rocksdb", RepositoryFormat::RocksDb)?;
test_branch(&*repo)
}
#[test]
fn test_branch_layered() -> Result<()> {
let repo = get_test_repo("test_branch_layered", RepositoryFormat::Layered)?;
test_branch(&*repo)
}
///
/// Test branch creation
///
#[test]
fn test_branch() -> Result<()> {
let repo = get_test_repo("test_branch")?;
fn test_branch(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(1), ZERO_PAGE.clone(), false)?;
// Create a relation on the timeline
tline.init_valid_lsn(Lsn(1));
tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
@@ -505,8 +620,19 @@ mod tests {
}
#[test]
fn test_history() -> Result<()> {
let repo = get_test_repo("test_snapshot")?;
fn test_history_rocksdb() -> Result<()> {
let repo = get_test_repo("test_history_rocksdb", RepositoryFormat::RocksDb)?;
test_history(&*repo)
}
#[test]
// TODO: This doesn't work with the layered storage, the functions needed for push/pull
// functionality haven't been implemented yet.
#[ignore]
fn test_history_layered() -> Result<()> {
let repo = get_test_repo("test_history_layered", RepositoryFormat::Layered)?;
test_history(&*repo)
}
fn test_history(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;

View File

@@ -132,6 +132,7 @@ pub fn import_timeline_from_postgres_datadir(
}
// TODO: Scan pg_tblspc
timeline.advance_last_valid_lsn(lsn);
timeline.checkpoint()?;
Ok(())
@@ -424,13 +425,13 @@ pub fn save_decoded_record(
let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
info!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here",
decoded.xl_xid, parsed_xact.xid
trace!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid, parsed_xact.xid, lsn
);
timeline.put_unlink(
RelishTag::TwoPhase {
xid: decoded.xl_xid,
xid: parsed_xact.xid,
},
lsn,
)?;
@@ -795,7 +796,13 @@ fn save_clog_truncate_record(
// Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
// TODO This implementation is very inefficient -
// it scans all non-rels only to find Clog
for obj in timeline.list_nonrels(lsn)? {
//
// We cannot pass 'lsn' to the Timeline.list_nonrels(), or it
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
for obj in timeline.list_nonrels(req_lsn)? {
match obj {
RelishTag::Slru { slru, segno } => {
if slru == SlruKind::Clog {

View File

@@ -8,7 +8,7 @@ use crate::page_cache;
use crate::relish::*;
use crate::restore_local_repo;
use crate::waldecoder::*;
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use anyhow::{Error, Result};
use lazy_static::lazy_static;
use log::*;
@@ -264,7 +264,11 @@ fn walreceiver_main(
)?;
if newest_segno - oldest_segno >= 10 {
timeline.checkpoint()?;
// FIXME: The layered repository performs checkpointing in a separate thread, so this
// isn't needed anymore. Remove 'checkpoint' from the Timeline trait altogether?
if conf.repository_format == RepositoryFormat::RocksDb {
timeline.checkpoint()?;
}
// TODO: This is where we could remove WAL older than last_rec_lsn.
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;

View File

@@ -20,6 +20,7 @@
//!
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use serde::{Deserialize, Serialize};
use std::cell::RefCell;
@@ -36,6 +37,7 @@ use tokio::io::AsyncBufReadExt;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::{ChildStdin, ChildStdout, Command};
use tokio::time::timeout;
use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
@@ -103,6 +105,27 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
static TIMEOUT: Duration = Duration::from_secs(20);
// Metrics collected on WAL redo operations
//
// We collect the time spent in actual WAL redo ('redo'), and time waiting
// for access to the postgres process ('wait') since there is only one for
// each tenant.
lazy_static! {
static ref WAL_REDO_TIME: Histogram =
register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo")
.expect("failed to define a metric");
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
"pageserver_wal_redo_wait_time",
"Time spent waiting for access to the WAL redo process"
)
.expect("failed to define a metric");
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
"pageserver_wal_records_replayed",
"Number of WAL records replayed"
)
.unwrap();
}
///
/// This is the real implementation that uses a Postgres process to
/// perform WAL replay. Only one thread can use the process at a time,
@@ -156,6 +179,9 @@ impl WalRedoManager for PostgresRedoManager {
base_img: Option<Bytes>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
let start_time;
let lock_time;
let end_time;
let request = WalRedoRequest {
rel,
@@ -165,16 +191,29 @@ impl WalRedoManager for PostgresRedoManager {
records,
};
// launch the WAL redo process on first use
let mut process_guard = self.process.lock().unwrap();
if process_guard.is_none() {
let p = self.runtime
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = (*process_guard).as_ref().unwrap();
start_time = Instant::now();
let result = {
let mut process_guard = self.process.lock().unwrap();
lock_time = Instant::now();
self.runtime.block_on(self.handle_apply_request(&process, &request))
// launch the WAL redo process on first use
if process_guard.is_none() {
let p = self
.runtime
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = (*process_guard).as_ref().unwrap();
self.runtime
.block_on(self.handle_apply_request(&process, &request))
};
end_time = Instant::now();
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
result
}
}
@@ -197,12 +236,10 @@ fn mx_offset_to_member_offset(xid: MultiXactId) -> usize {
}
impl PostgresRedoManager {
///
/// Create a new PostgresRedoManager.
///
pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
// We block on waiting for requests on the walredo request channel, but
// use async I/O to communicate with the child process. Initialize the
// runtime for the async part.
@@ -244,6 +281,8 @@ impl PostgresRedoManager {
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
} else {
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
@@ -257,6 +296,8 @@ impl PostgresRedoManager {
for record in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
@@ -378,7 +419,7 @@ impl PostgresRedoManager {
panic!();
}
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
// Ralation map file has size 512 bytes
// Relation map file has size 512 bytes
page.clear();
page.extend_from_slice(&buf[12..]); // skip xl_relmap_update
assert!(page.len() == 512); // size of pg_filenode.map
@@ -557,6 +598,8 @@ impl PostgresRedoProcess {
for rec in records.iter() {
let r = rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
stdin
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;

View File

@@ -14,7 +14,8 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
@pytest.mark.skip(reason=""""
Current GC test is flaky and overly strict. Since we are migrating to the layered repo format
with different GC implementation let's just silence this test for now.
with different GC implementation let's just silence this test for now. This test only
works with the RocksDB implementation.
""")
def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_gc", "empty"])

View File

@@ -0,0 +1,66 @@
import pytest
import random
import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
pytest_plugins = ("fixtures.zenith_fixtures")
# Test that the pageserver can be stopped and restarted, and that a compute node
# can still read its data through the pageserver after the restart.
def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
# One safekeeper is enough for this test.
wa_factory.start_n_new(1)
zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
pg = postgres.create_start('test_pageserver_restart',
wal_acceptors=wa_factory.get_connstrs())
pg_conn = pg.connect()
cur = pg_conn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
# of this test.
cur.execute('CREATE TABLE foo (t text)')
cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g
''')
# Verify that the table is larger than shared_buffers
cur.execute('''
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
from pg_settings where name = 'shared_buffers'
''')
row = cur.fetchone()
print("shared_buffers is {}, table size {}", row[0], row[1]);
assert int(row[0]) < int(row[1])
# Stop and restart pageserver. This is a more or less graceful shutdown, although
# the page server doesn't currently have a shutdown routine so there's no difference
# between stopping and crashing.
pageserver.stop()
pageserver.start()
# Stopping the pageserver breaks the connection from the postgres backend to
# the page server, and causes the next query on the connection to fail. Start a new
# postgres connection too, to avoid that error. (Ideally, the compute node would
# handle that and retry internally, without propagating the error to the user, but
# currently it doesn't...)
pg_conn = pg.connect()
cur = pg_conn.cursor()
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000, )
# Stop the page server by force, and restart it
pageserver.stop()
pageserver.start()

View File

@@ -0,0 +1,124 @@
from contextlib import closing
import psycopg2.extras
import time
pytest_plugins = ("fixtures.zenith_fixtures")
def print_gc_result(row):
print("GC duration {elapsed} ms".format_map(row));
print(" REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}".format_map(row))
print(" NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}".format_map(row))
#
# Test Garbage Collection of old snapshot files
#
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_snapfiles_gc", "empty"])
pg = postgres.create_start('test_snapfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create a test table
cur.execute("CREATE TABLE foo(x integer)")
cur.execute("INSERT INTO foo VALUES (1)")
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
row = cur.fetchone();
print("relfilenode is {}", row[0]);
# Run GC, to clear out any garbage left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
# before running the actual tests below, otherwise the counts won't match
# what we expect.
#
# Also run vacuum first to make it less likely that autovacuum or pruning
# kicks in and confuses our numbers.
cur.execute("VACUUM")
# delete the row, to update the Visibility Map. We don't want the VM
# update to confuse our numbers either.
cur.execute("DELETE FROM foo")
print("Running GC before test")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
# remember the number of files
snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed']
assert snapshot_relfiles_remain > 0
# Insert a row.
print("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
# Insert two more rows and run GC.
# This should create a new snapshot file with the new contents, and
# remove the old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
# Do it again. Should again create a new snapshot file and remove old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
# Run GC again, with no changes in the database. Should not remove anything.
print("Run GC again, with nothing to do")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
assert row['snapshot_relfiles_removed'] == 0
assert row['snapshot_relfiles_dropped'] == 0
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
#
print("Drop table and run GC again");
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
# Each relation fork is counted separately, hence 3.
assert row['snapshot_relfiles_dropped'] == 3
# The catalog updates also create new snapshot files of the catalogs, which
# are counted as 'removed'
assert row['snapshot_relfiles_removed'] > 0
# TODO: perhaps we should count catalog and user relations separately,
# to make this kind of testing more robust

View File

@@ -1,3 +1,5 @@
import os
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
@@ -28,24 +30,59 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
cur.execute("INSERT INTO foo VALUES ('two')")
cur.execute("PREPARE TRANSACTION 'insert_two'")
# Prepare a transaction that will insert a row
cur.execute('BEGIN')
cur.execute("INSERT INTO foo VALUES ('three')")
cur.execute("PREPARE TRANSACTION 'insert_three'")
# Prepare another transaction that will insert a row
cur.execute('BEGIN')
cur.execute("INSERT INTO foo VALUES ('four')")
cur.execute("PREPARE TRANSACTION 'insert_four'")
# On checkpoint state data copied to files in
# pg_twophase directory and fsynced
cur.execute('CHECKPOINT')
twophase_files = os.listdir(pg.pg_twophase_dir_path())
print(twophase_files)
assert len(twophase_files) == 4
cur.execute("COMMIT PREPARED 'insert_three'")
cur.execute("ROLLBACK PREPARED 'insert_four'")
cur.execute('CHECKPOINT')
twophase_files = os.listdir(pg.pg_twophase_dir_path())
print(twophase_files)
assert len(twophase_files) == 2
# Create a branch with the transaction in prepared state
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
pg2 = postgres.create_start(
# Create compute node, but don't start.
# We want to observe pgdata before postgres starts
pg2 = postgres.create(
'test_twophase_prepared',
config_lines=['max_prepared_transactions=5'],
)
# Check that we restored only needed twophase files
twophase_files2 = os.listdir(pg2.pg_twophase_dir_path())
print(twophase_files2)
assert sorted(twophase_files2) == sorted(twophase_files)
pg2 = pg2.start()
conn2 = pg2.connect()
cur2 = conn2.cursor()
# On the new branch, commit one of the prepared transactions, abort the other one.
# On the new branch, commit one of the prepared transactions,
# abort the other one.
cur2.execute("COMMIT PREPARED 'insert_one'")
cur2.execute("ROLLBACK PREPARED 'insert_two'")
cur2.execute('SELECT * FROM foo')
assert cur2.fetchall() == [('one', )]
assert cur2.fetchall() == [('one',), ('three',)]
# Neither insert is visible on the original branch, the transactions are still
# in prepared state there.
# Only one committed insert is visible on the original branch
cur.execute('SELECT * FROM foo')
assert cur.fetchall() == []
assert cur.fetchall() == [('three',)]

View File

@@ -4,7 +4,7 @@ import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -61,7 +61,7 @@ def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: Post
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
# times, with fault_probability chance of getting a wal acceptor down or up
# along the way. 2 of 3 are always alive, so the work keeps going.
def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory):
def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
fault_probability = 0.01
n_inserts = 1000
n_acceptors = 3

View File

@@ -1,4 +1,3 @@
import getpass
import os
import pathlib
import uuid
@@ -7,9 +6,11 @@ import pytest
import shutil
import signal
import subprocess
import time
from contextlib import closing
from pathlib import Path
from dataclasses import dataclass
# Type-related stuff
from psycopg2.extensions import connection as PgConnection
@@ -266,6 +267,7 @@ class Postgres(PgProtocol):
branch: str,
wal_acceptors: Optional[str] = None,
config_lines: Optional[List[str]] = None,
config_only: bool = False,
) -> 'Postgres':
"""
Create the pg data directory.
@@ -277,7 +279,10 @@ class Postgres(PgProtocol):
if not config_lines:
config_lines = []
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
if config_only:
self.zenith_cli.run(['pg', 'create', '--config-only', branch, f'--tenantid={self.tenant_id}'])
else:
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
self.branch = branch
if wal_acceptors is not None:
self.adjust_for_wal_acceptors(wal_acceptors)
@@ -304,6 +309,13 @@ class Postgres(PgProtocol):
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch / 'pg_xact'
return os.path.join(self.repo_dir, path)
def pg_twophase_dir_path(self) -> str:
""" Path to pg_twophase dir """
print(self.tenant_id)
print(self.branch)
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch / 'pg_twophase'
return os.path.join(self.repo_dir, path)
def config_file_path(self) -> str:
""" Path to postgresql.conf """
filename = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch / 'postgresql.conf'
@@ -376,7 +388,8 @@ class Postgres(PgProtocol):
config_lines: Optional[List[str]] = None,
) -> 'Postgres':
"""
Create a Postgres instance, then start it.
Create a Postgres instance, apply config
and then start it.
Returns self.
"""
@@ -384,6 +397,7 @@ class Postgres(PgProtocol):
branch=branch,
wal_acceptors=wal_acceptors,
config_lines=config_lines,
config_only=True,
).start()
return self
@@ -429,6 +443,54 @@ class PostgresFactory:
config_lines=config_lines,
)
def create(
self,
branch: str = "main",
tenant_id: Optional[str] = None,
wal_acceptors: Optional[str] = None,
config_lines: Optional[List[str]] = None
) -> Postgres:
pg = Postgres(
zenith_cli=self.zenith_cli,
repo_dir=self.repo_dir,
tenant_id=tenant_id or self.initial_tenant,
port=self.base_port + self.num_instances + 1,
)
self.num_instances += 1
self.instances.append(pg)
return pg.create(
branch=branch,
wal_acceptors=wal_acceptors,
config_lines=config_lines,
)
def config(
self,
branch: str = "main",
tenant_id: Optional[str] = None,
wal_acceptors: Optional[str] = None,
config_lines: Optional[List[str]] = None
) -> Postgres:
pg = Postgres(
zenith_cli=self.zenith_cli,
repo_dir=self.repo_dir,
tenant_id=tenant_id or self.initial_tenant,
port=self.base_port + self.num_instances + 1,
)
self.num_instances += 1
self.instances.append(pg)
return pg.config(
branch=branch,
wal_acceptors=wal_acceptors,
config_lines=config_lines,
)
def stop_all(self) -> 'PostgresFactory':
for pg in self.instances:
pg.stop()
@@ -511,26 +573,27 @@ def pg_bin(test_output_dir: str, pg_distrib_dir: str) -> PgBin:
return PgBin(test_output_dir, pg_distrib_dir)
def read_pid(path):
def read_pid(path: Path):
""" Read content of file into number """
return int(Path(path).read_text())
return int(path.read_text())
@dataclass
class WalAcceptor:
""" An object representing a running wal acceptor daemon. """
def __init__(self, wa_binpath, data_dir, port, num, auth_token: Optional[str] = None):
self.wa_binpath = wa_binpath
self.data_dir = data_dir
self.port = port
self.num = num # identifier for logging
self.auth_token = auth_token
bin_path: Path
data_dir: Path
port: int
num: int # identifier for logging
auth_token: Optional[str] = None
def start(self) -> 'WalAcceptor':
# create data directory if not exists
Path(self.data_dir).mkdir(parents=True, exist_ok=True)
self.data_dir.mkdir(parents=True, exist_ok=True)
self.pidfile.unlink(missing_ok=True)
cmd = [self.wa_binpath]
cmd.extend(["-D", self.data_dir])
cmd = [str(self.bin_path)]
cmd.extend(["-D", str(self.data_dir)])
cmd.extend(["-l", "localhost:{}".format(self.port)])
cmd.append("--daemonize")
cmd.append("--no-sync")
@@ -541,38 +604,51 @@ class WalAcceptor:
env = {'PAGESERVER_AUTH_TOKEN': self.auth_token} if self.auth_token else None
subprocess.run(cmd, check=True, env=env)
return self
# wait for the wal acceptor to start by checking that the pid is readable
for _ in range(3):
pid = self.get_pid()
if pid is not None:
return self
time.sleep(0.5)
raise RuntimeError("cannot get wal acceptor pid")
@property
def pidfile(self) -> Path:
return self.data_dir / "wal_acceptor.pid"
def get_pid(self) -> Optional[int]:
if not self.pidfile.exists():
return None
try:
pid = read_pid(self.pidfile)
except ValueError:
return None
return pid
def stop(self) -> 'WalAcceptor':
print('Stopping wal acceptor {}'.format(self.num))
pidfile_path = os.path.join(self.data_dir, "wal_acceptor.pid")
try:
pid = read_pid(pidfile_path)
try:
os.kill(pid, signal.SIGTERM)
except Exception:
pass # pidfile might be obsolete
# TODO: cleanup pid file on exit in wal acceptor
return self
# for _ in range(5):
# print('waiting wal acceptor {} (pid {}) to stop...', self.num, pid)
# try:
# read_pid(pidfile_path)
# except FileNotFoundError:
# return # done
# time.sleep(1)
# raise Exception('Failed to wait for wal acceptor {} shutdown'.format(self.num))
except FileNotFoundError:
pid = self.get_pid()
if pid is None:
print("Wal acceptor {} is not running".format(self.num))
return self
try:
os.kill(pid, signal.SIGTERM)
except Exception:
# TODO: cleanup pid file on exit in wal acceptor
pass # pidfile might be obsolete
return self
class WalAcceptorFactory:
""" An object representing multiple running wal acceptors. """
def __init__(self, zenith_binpath, data_dir):
self.wa_binpath = os.path.join(zenith_binpath, 'wal_acceptor')
def __init__(self, zenith_binpath: Path, data_dir: Path):
self.wa_binpath = zenith_binpath / 'wal_acceptor'
self.data_dir = data_dir
self.instances = []
self.instances: List[WalAcceptor] = []
self.initial_port = 54321
def start_new(self, auth_token: Optional[str] = None) -> WalAcceptor:
@@ -583,7 +659,7 @@ class WalAcceptorFactory:
wa_num = len(self.instances)
wa = WalAcceptor(
self.wa_binpath,
os.path.join(self.data_dir, "wal_acceptor_{}".format(wa_num)),
self.data_dir / "wal_acceptor_{}".format(wa_num),
self.initial_port + wa_num,
wa_num,
auth_token,
@@ -613,7 +689,7 @@ class WalAcceptorFactory:
@zenfixture
def wa_factory(zenith_binpath: str, repo_dir: str) -> Iterator[WalAcceptorFactory]:
""" Gives WalAcceptorFactory providing wal acceptors. """
wafactory = WalAcceptorFactory(zenith_binpath, os.path.join(repo_dir, "wal_acceptors"))
wafactory = WalAcceptorFactory(Path(zenith_binpath), Path(repo_dir) / "wal_acceptors")
yield wafactory
# After the yield comes any cleanup code we need.
print('Starting wal acceptors cleanup')

View File

@@ -2,7 +2,7 @@
//!
//! FIXME: better description needed here
use anyhow::{bail, Result};
use anyhow::{bail, Context, Result};
use bincode::config::Options;
use bytes::{Buf, Bytes};
use log::*;
@@ -236,7 +236,9 @@ impl<'pg> ReceiveWalConn<'pg> {
.write_message(&BeMessage::CopyBothResponse)?;
// Receive information about server
let server_info = self.read_msg::<ServerInfo>()?;
let server_info = self
.read_msg::<ServerInfo>()
.context("Failed to receive server info")?;
info!(
"Start handshake with wal_proposer {} sysid {} timeline {} tenant {}",
self.peer_addr, server_info.system_id, server_info.timeline_id, server_info.tenant_id,
@@ -284,7 +286,9 @@ impl<'pg> ReceiveWalConn<'pg> {
self.write_msg(&my_info)?;
/* Wait for vote request */
let prop = self.read_msg::<RequestVote>()?;
let prop = self
.read_msg::<RequestVote>()
.context("Failed to read vote request")?;
/* This is Paxos check which should ensure that only one master can perform commits */
if prop.node_id < my_info.server.node_id {
/* Send my node-id to inform proposer that it's candidate was rejected */
@@ -331,7 +335,8 @@ impl<'pg> ReceiveWalConn<'pg> {
let msg_bytes = self.read_msg_bytes()?;
let mut msg_reader = msg_bytes.reader();
let req = SafeKeeperRequest::des_from(&mut msg_reader)?;
let req = SafeKeeperRequest::des_from(&mut msg_reader)
.context("Failed to get WAL message header")?;
if req.sender_id != my_info.server.node_id {
bail!("Sender NodeId is changed");
}

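The hunks above wrap the safekeeper's message reads in anyhow's `Context` so that a decoding failure reports which message was being read. Below is a minimal, self-contained sketch of that pattern; `parse_len` and its byte layout are invented for illustration and are not part of this codebase.

```rust
use std::convert::TryInto;

use anyhow::{Context, Result};

// Hypothetical header parser: Context converts an Option into a Result and
// annotates an Err, just like the read_msg()/des_from() calls above.
fn parse_len(buf: &[u8]) -> Result<u32> {
    let bytes: [u8; 4] = buf
        .get(..4)
        .context("message too short to contain a length header")?
        .try_into()
        .context("Failed to decode length header")?;
    Ok(u32::from_be_bytes(bytes))
}

fn main() {
    // Printing with {:#} shows the added context along with the underlying cause.
    if let Err(e) = parse_len(&[0x01]) {
        eprintln!("{:#}", e);
    }
}
```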
View File

@@ -61,6 +61,13 @@ fn main() -> Result<()> {
.long("enable-auth")
.takes_value(false)
.help("Enable authentication using ZenithJWT")
)
.arg(
Arg::with_name("repository-format")
.long("repository-format")
.takes_value(true)
.value_name("repository-format")
.help("Choose repository format, 'layered' or 'rocksdb'")
),
)
.subcommand(
@@ -85,8 +92,18 @@ fn main() -> Result<()> {
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage postgres instances")
.subcommand(SubCommand::with_name("list").arg(tenantid_arg.clone()))
.subcommand(SubCommand::with_name("create").arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
.subcommand(SubCommand::with_name("start").arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
.subcommand(SubCommand::with_name("create")
.about("Create a postgres compute node")
.arg(timeline_arg.clone()).arg(tenantid_arg.clone())
.arg(
Arg::with_name("config-only")
.help("Don't do basebackup, create compute node with only config files")
.long("config-only")
.required(false)
))
.subcommand(SubCommand::with_name("start")
.about("Start a postrges compute node.\n This command actually creates new node from scrath, but preserves existing config files")
.arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
.subcommand(
SubCommand::with_name("stop")
.arg(timeline_arg.clone())
@@ -131,8 +148,8 @@ fn main() -> Result<()> {
} else {
AuthType::Trust
};
local_env::init(pageserver_uri, tenantid, auth_type)
let repository_format = init_match.value_of("repository-format");
local_env::init(pageserver_uri, tenantid, auth_type, repository_format)
.with_context(|| "Failed to create config file")?;
}
@@ -151,6 +168,7 @@ fn main() -> Result<()> {
if let Err(e) = pageserver.init(
Some(&env.tenantid.to_string()),
init_match.is_present("enable-auth"),
init_match.value_of("repository-format"),
) {
eprintln!("pageserver init failed: {}", e);
exit(1);
@@ -451,10 +469,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let timeline_name = create_match.value_of("timeline").unwrap_or("main");
// check is that timeline doesnt already exist
// this check here is because it
let config_only = create_match.is_present("config-only");
cplane.new_node(tenantid, timeline_name)?;
cplane.new_node(tenantid, timeline_name, config_only)?;
}
("start", Some(start_match)) => {
let tenantid: ZTenantId = start_match
@@ -475,7 +492,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
if let Some(node) = node {
node.start(&auth_token)?;
} else {
let node = cplane.new_node(tenantid, timeline_name)?;
let node = cplane.new_node(tenantid, timeline_name, false)?;
node.start(&auth_token)?;
}
}

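The `--config-only` flag and `--repository-format` option added above use the two basic clap 2.x argument shapes: a bare flag queried with `is_present()`, and a valued option read with `value_of()` (hence `takes_value(true)` on the latter). A self-contained sketch with invented program and argument names:

```rust
use clap::{App, Arg};

fn main() {
    // Hypothetical CLI mirroring the two argument shapes used above.
    let matches = App::new("example")
        .arg(
            Arg::with_name("config-only")
                .long("config-only")
                .takes_value(false), // boolean flag: only its presence matters
        )
        .arg(
            Arg::with_name("repository-format")
                .long("repository-format")
                .takes_value(true), // valued option: --repository-format layered
        )
        .get_matches_from(vec![
            "example",
            "--config-only",
            "--repository-format",
            "layered",
        ]);

    let config_only = matches.is_present("config-only"); // true
    let repo_format = matches.value_of("repository-format"); // Some("layered")
    println!("config_only={}, repository_format={:?}", config_only, repo_format);
}
```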
View File

@@ -301,8 +301,9 @@ impl PostgresBackend {
FeMessage::Query(m) => {
trace!("got query {:?}", m.body);
// xxx distinguish fatal and recoverable errors?
if let Err(e) = handler.process_query(self, m.body) {
if let Err(e) = handler.process_query(self, m.body.clone()) {
let errmsg = format!("{}", e);
warn!("query handler for {:?} failed: {}", m.body, errmsg);
self.write_message_noflush(&BeMessage::ErrorResponse(errmsg))?;
}
self.write_message(&BeMessage::ReadyForQuery)?;

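The `.clone()` added above keeps the query body around for the error log after `process_query` has taken its copy; assuming `m.body` is a `bytes::Bytes` buffer, as elsewhere in this codebase, the clone is a cheap reference-count bump rather than a copy of the query text. A tiny sketch of that property:

```rust
use bytes::Bytes;

fn main() {
    let body = Bytes::from_static(b"SELECT 1");
    // Cloning Bytes shares the underlying buffer; no bytes are copied.
    let for_handler = body.clone();
    drop(for_handler); // pretend the query handler consumed its copy
    // The original is still available for logging the failed query.
    println!("query handler for {:?} failed", body);
}
```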
View File

@@ -126,7 +126,7 @@ macro_rules! zid_newtype {
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A zenith timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
pub struct ZTimelineId(ZId);
zid_newtype!(ZTimelineId);
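Adding `Ord`/`PartialOrd` to the derive list lets callers sort timeline IDs and use them as keys in ordered collections such as `BTreeMap`. A small sketch of the idea on a stand-in newtype (`ExampleId` is invented; `ZTimelineId` itself wraps the 128-bit ID described above):

```rust
use std::collections::BTreeMap;

// Stand-in for a 128-bit ID newtype; the derives match the ones added above.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
struct ExampleId(u128);

fn main() {
    let mut timelines: BTreeMap<ExampleId, &str> = BTreeMap::new();
    timelines.insert(ExampleId(0xbeef), "branch");
    timelines.insert(ExampleId(0x1), "main");

    // Iteration order follows the derived Ord implementation.
    for (id, name) in &timelines {
        println!("{:032x} -> {}", id.0, name);
    }
}
```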