mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 06:30:43 +00:00
Compare commits
58 Commits
bin_ser/lo
...
try-to-fai
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
19e26f53fb | ||
|
|
18ade98955 | ||
|
|
849bbfa777 | ||
|
|
f7f377cfc1 | ||
|
|
fdb19fdb92 | ||
|
|
53b4dc944d | ||
|
|
a03e1b3895 | ||
|
|
15f1bcc9c2 | ||
|
|
24580f2493 | ||
|
|
e3945d94fd | ||
|
|
d806c3a47e | ||
|
|
05fe39088b | ||
|
|
530d3eaf09 | ||
|
|
7e190d72a5 | ||
|
|
9c936034b6 | ||
|
|
5719f13cb2 | ||
|
|
d134a9856e | ||
|
|
664b99b5ac | ||
|
|
4256231eb7 | ||
|
|
ae27490281 | ||
|
|
fbd8ca2ff4 | ||
|
|
ec673a5d67 | ||
|
|
7fab38c51e | ||
|
|
84f7dcd052 | ||
|
|
7095a5d551 | ||
|
|
538c2a2a3e | ||
|
|
62f83869f1 | ||
|
|
69670b61c4 | ||
|
|
0a8aaa2c24 | ||
|
|
e474790400 | ||
|
|
2c99e2461a | ||
|
|
cf8e27a554 | ||
|
|
287ea2e5e3 | ||
|
|
86e14f2f1a | ||
|
|
adbae62281 | ||
|
|
3127a4a13b | ||
|
|
6d993410c9 | ||
|
|
fb05e4cb0b | ||
|
|
b0a7234759 | ||
|
|
ddf4b15ebc | ||
|
|
3065532f15 | ||
|
|
d6fc74a412 | ||
|
|
7a370394a7 | ||
|
|
0f3cf8ac94 | ||
|
|
014be8b230 | ||
|
|
08978458be | ||
|
|
2252d9faa8 | ||
|
|
22e15844ae | ||
|
|
ca9af37478 | ||
|
|
aae41e8661 | ||
|
|
8331ce865c | ||
|
|
3bac4d485d | ||
|
|
f84eaf4f05 | ||
|
|
70b08923ed | ||
|
|
c846a824de | ||
|
|
b71e3a40e2 | ||
|
|
41dfc117e7 | ||
|
|
a72707b8cb |
@@ -24,6 +24,12 @@ jobs:
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: zenith-build-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
enum: ["debug", "release"]
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
# Checkout the git repo (circleci doesn't have a flag to enable submodules here)
|
||||
- checkout
|
||||
@@ -39,7 +45,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
# FIXME We could cache our own docker container, instead of installing packages every time.
|
||||
- run:
|
||||
@@ -59,12 +65,12 @@ jobs:
|
||||
if [ ! -e tmp_install/bin/postgres ]; then
|
||||
# "depth 1" saves some time by not cloning the whole repo
|
||||
git submodule update --init --depth 1
|
||||
make postgres
|
||||
make postgres -j8
|
||||
fi
|
||||
|
||||
- save_cache:
|
||||
name: Save postgres cache
|
||||
key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
@@ -96,7 +102,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
@@ -254,7 +260,7 @@ jobs:
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" -delete
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
@@ -328,14 +334,18 @@ workflows:
|
||||
build_and_test:
|
||||
jobs:
|
||||
- check-codestyle
|
||||
- build-postgres
|
||||
- build-postgres:
|
||||
name: build-postgres-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
requires:
|
||||
- build-postgres
|
||||
- build-postgres-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: pg_regress-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
|
||||
@@ -2,12 +2,17 @@
|
||||
**/__pycache__
|
||||
**/.pytest_cache
|
||||
|
||||
/target
|
||||
/tmp_check
|
||||
/tmp_install
|
||||
/tmp_check_cli
|
||||
/test_output
|
||||
/.vscode
|
||||
/.zenith
|
||||
/integration_tests/.zenith
|
||||
/Dockerfile
|
||||
.git
|
||||
target
|
||||
tmp_check
|
||||
tmp_install
|
||||
tmp_check_cli
|
||||
test_output
|
||||
.vscode
|
||||
.zenith
|
||||
integration_tests/.zenith
|
||||
.mypy_cache
|
||||
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
|
||||
|
||||
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,4 +1,4 @@
|
||||
[submodule "vendor/postgres"]
|
||||
path = vendor/postgres
|
||||
url = https://github.com/zenithdb/postgres
|
||||
branch = main
|
||||
branch = walproposer_more_logs
|
||||
|
||||
25
Cargo.lock
generated
25
Cargo.lock
generated
@@ -307,6 +307,26 @@ dependencies = [
|
||||
"vec_map",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_format"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4556f63e28a78fa5e6f310cfea5647a25636def49a338ab69e33b34a3382057b"
|
||||
dependencies = [
|
||||
"const_format_proc_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_format_proc_macros"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "552782506c398da94466b364973b563887e0ca078bf33a76d4163736165e3594"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "control_plane"
|
||||
version = "0.1.0"
|
||||
@@ -1179,10 +1199,12 @@ dependencies = [
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"daemonize",
|
||||
"futures",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
@@ -1406,6 +1428,7 @@ dependencies = [
|
||||
"hex",
|
||||
"md5",
|
||||
"rand",
|
||||
"reqwest",
|
||||
"rustls",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2555,6 +2578,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"prometheus",
|
||||
]
|
||||
|
||||
@@ -2584,6 +2608,7 @@ dependencies = [
|
||||
"slog-scope",
|
||||
"slog-stdlog",
|
||||
"slog-term",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"webpki",
|
||||
|
||||
34
Dockerfile
34
Dockerfile
@@ -10,39 +10,21 @@ FROM zenithdb/build:buster AS pg-build
|
||||
WORKDIR /zenith
|
||||
COPY ./vendor/postgres vendor/postgres
|
||||
COPY ./Makefile Makefile
|
||||
ENV BUILD_TYPE release
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
|
||||
|
||||
#
|
||||
# Calculate cargo dependencies.
|
||||
# This will always run, but only generate recipe.json with list of dependencies without
|
||||
# installing them.
|
||||
#
|
||||
FROM zenithdb/build:buster AS cargo-deps-inspect
|
||||
WORKDIR /zenith
|
||||
COPY . .
|
||||
RUN cargo chef prepare --recipe-path /zenith/recipe.json
|
||||
|
||||
#
|
||||
# Build cargo dependencies.
|
||||
# This temp cantainner should be rebuilt only if recipe.json was changed.
|
||||
#
|
||||
FROM zenithdb/build:buster AS deps-build
|
||||
WORKDIR /zenith
|
||||
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
COPY --from=cargo-deps-inspect /usr/local/cargo/bin/cargo-chef /usr/local/cargo/bin/
|
||||
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
|
||||
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
|
||||
RUN rm -rf postgres_install/build
|
||||
|
||||
#
|
||||
# Build zenith binaries
|
||||
#
|
||||
# TODO: build cargo deps as separate layer. We used cargo-chef before but that was
|
||||
# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
|
||||
#
|
||||
FROM zenithdb/build:buster AS build
|
||||
WORKDIR /zenith
|
||||
COPY . .
|
||||
# Copy cached dependencies
|
||||
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
COPY --from=deps-build /zenith/target target
|
||||
COPY --from=deps-build /usr/local/cargo/ /usr/local/cargo/
|
||||
|
||||
COPY . .
|
||||
RUN cargo build --release
|
||||
|
||||
#
|
||||
@@ -51,7 +33,7 @@ RUN cargo build --release
|
||||
FROM debian:buster-slim
|
||||
WORKDIR /data
|
||||
|
||||
RUN apt-get update && apt-get -yq install librocksdb-dev libseccomp-dev openssl && \
|
||||
RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \
|
||||
mkdir zenith_install
|
||||
|
||||
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
|
||||
|
||||
@@ -9,7 +9,7 @@ WORKDIR /zenith
|
||||
# Install postgres and zenith build dependencies
|
||||
# clang is for rocksdb
|
||||
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
|
||||
libseccomp-dev pkg-config libssl-dev librocksdb-dev clang
|
||||
libseccomp-dev pkg-config libssl-dev clang
|
||||
|
||||
# Install rust tools
|
||||
RUN rustup component add clippy && cargo install cargo-chef cargo-audit
|
||||
RUN rustup component add clippy && cargo install cargo-audit
|
||||
|
||||
18
Makefile
18
Makefile
@@ -6,6 +6,18 @@ else
|
||||
SECCOMP =
|
||||
endif
|
||||
|
||||
#
|
||||
# We differentiate between release / debug build types using the BUILD_TYPE
|
||||
# environment variable.
|
||||
#
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug
|
||||
PG_CFLAGS = -O2 -g3 ${CFLAGS}
|
||||
else
|
||||
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 ${CFLAGS}
|
||||
endif
|
||||
|
||||
#
|
||||
# Top level Makefile to build Zenith and PostgreSQL
|
||||
#
|
||||
@@ -30,10 +42,8 @@ tmp_install/build/config.status:
|
||||
+@echo "Configuring postgres build"
|
||||
mkdir -p tmp_install/build
|
||||
(cd tmp_install/build && \
|
||||
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
|
||||
--enable-cassert \
|
||||
--enable-debug \
|
||||
--enable-depend \
|
||||
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
|
||||
$(PG_CONFIGURE_OPTS) \
|
||||
$(SECCOMP) \
|
||||
--prefix=$(abspath tmp_install) > configure.log)
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ Pageserver consists of:
|
||||
On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||
```text
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang
|
||||
libssl-dev clang pkg-config libpq-dev
|
||||
```
|
||||
|
||||
[Rust] 1.52 or later is also required.
|
||||
@@ -108,6 +108,13 @@ postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
```
|
||||
|
||||
6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
|
||||
```sh
|
||||
> ./target/debug/zenith pg stop migration_check
|
||||
> ./target/debug/zenith pg stop main
|
||||
> ./target/debug/zenith stop
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
```sh
|
||||
|
||||
@@ -452,9 +452,7 @@ impl PostgresNode {
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
assert!(output.status.success(), "whoami failed");
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
@@ -10,5 +10,5 @@
|
||||
- [pageserver/README](/pageserver/README) — pageserver overview.
|
||||
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
|
||||
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
|
||||
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
|
||||
- [walkeeper/README](/walkeeper/README) — WAL service overview.
|
||||
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
|
||||
|
||||
@@ -34,8 +34,12 @@ toml = "0.5"
|
||||
scopeguard = "1.1.0"
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
async-trait = "0.1"
|
||||
const_format = "0.2.21"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
hex-literal = "0.3"
|
||||
|
||||
@@ -7,8 +7,9 @@ The Page Server has a few different duties:
|
||||
- Replay WAL that's applicable to the chunks that the Page Server maintains
|
||||
- Backup to S3
|
||||
|
||||
|
||||
|
||||
S3 is the main fault-tolerant storage of all data, as there are no Page Server
|
||||
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
|
||||
keeps track of WAL records which are not syncted to S3 yet.
|
||||
|
||||
The Page Server consists of multiple threads that operate on a shared
|
||||
repository of page versions:
|
||||
|
||||
@@ -9,22 +9,28 @@ use std::{
|
||||
env,
|
||||
net::TcpListener,
|
||||
path::{Path, PathBuf},
|
||||
process::exit,
|
||||
str::FromStr,
|
||||
thread,
|
||||
};
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
|
||||
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use pageserver::{
|
||||
branches, http, page_service, tenant_mgr, PageServerConf, RelishStorageConfig, S3Config,
|
||||
LOG_FILE_NAME,
|
||||
branches,
|
||||
defaults::{
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
|
||||
},
|
||||
http, page_service, relish_storage, tenant_mgr, PageServerConf, RelishStorageConfig,
|
||||
RelishStorageKind, S3Config, LOG_FILE_NAME,
|
||||
};
|
||||
use zenith_utils::http::endpoint;
|
||||
|
||||
use const_format::formatcp;
|
||||
|
||||
/// String arguments that can be declared via CLI or config file
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct CfgFileParams {
|
||||
@@ -39,6 +45,7 @@ struct CfgFileParams {
|
||||
auth_type: Option<String>,
|
||||
// see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples
|
||||
relish_storage: Option<RelishStorage>,
|
||||
relish_storage_max_concurrent_sync: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -89,6 +96,7 @@ impl CfgFileParams {
|
||||
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
|
||||
auth_type: get_arg("auth-type"),
|
||||
relish_storage,
|
||||
relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -108,6 +116,9 @@ impl CfgFileParams {
|
||||
.or(other.auth_validation_public_key_path),
|
||||
auth_type: self.auth_type.or(other.auth_type),
|
||||
relish_storage: self.relish_storage.or(other.relish_storage),
|
||||
relish_storage_max_concurrent_sync: self
|
||||
.relish_storage_max_concurrent_sync
|
||||
.or(other.relish_storage_max_concurrent_sync),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,25 +187,34 @@ impl CfgFileParams {
|
||||
);
|
||||
}
|
||||
|
||||
let relish_storage_config =
|
||||
self.relish_storage
|
||||
.as_ref()
|
||||
.map(|storage_params| match storage_params.clone() {
|
||||
RelishStorage::Local { local_path } => {
|
||||
RelishStorageConfig::LocalFs(PathBuf::from(local_path))
|
||||
}
|
||||
RelishStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
} => RelishStorageConfig::AwsS3(S3Config {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
}),
|
||||
});
|
||||
let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() {
|
||||
Some(relish_storage_max_concurrent_sync) => {
|
||||
relish_storage_max_concurrent_sync.parse()?
|
||||
}
|
||||
None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
|
||||
};
|
||||
let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| {
|
||||
let storage = match storage_params.clone() {
|
||||
RelishStorage::Local { local_path } => {
|
||||
RelishStorageKind::LocalFs(PathBuf::from(local_path))
|
||||
}
|
||||
RelishStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
} => RelishStorageKind::AwsS3(S3Config {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
}),
|
||||
};
|
||||
RelishStorageConfig {
|
||||
max_concurrent_sync,
|
||||
storage,
|
||||
}
|
||||
});
|
||||
|
||||
Ok(PageServerConf {
|
||||
daemonize: false,
|
||||
@@ -220,6 +240,7 @@ impl CfgFileParams {
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("pageserver");
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.arg(
|
||||
@@ -228,14 +249,14 @@ fn main() -> Result<()> {
|
||||
.long("listen-pg")
|
||||
.alias("listen") // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"),
|
||||
.help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen-http")
|
||||
.long("listen-http")
|
||||
.alias("http_endpoint") // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help("http endpoint address for for metrics and management API calls ip:port (default: 127.0.0.1:5430)"),
|
||||
.help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
@@ -343,10 +364,19 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-max-concurrent-sync")
|
||||
.long("relish-storage-max-concurrent-sync")
|
||||
.takes_value(true)
|
||||
.help("Maximum allowed concurrent synchronisations with storage"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
let cfg_file_path = workdir.canonicalize()?.join("pageserver.toml");
|
||||
let cfg_file_path = workdir
|
||||
.canonicalize()
|
||||
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?
|
||||
.join("pageserver.toml");
|
||||
|
||||
let args_params = CfgFileParams::from_args(&arg_matches);
|
||||
|
||||
@@ -358,22 +388,37 @@ fn main() -> Result<()> {
|
||||
args_params
|
||||
} else {
|
||||
// Supplement the CLI arguments with the config file
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)?;
|
||||
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents)?;
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)
|
||||
.with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?;
|
||||
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents).with_context(|| {
|
||||
format!(
|
||||
"Failed to read '{}' as pageserver config",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
args_params.or(file_params)
|
||||
};
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
env::set_current_dir(&workdir)?;
|
||||
env::set_current_dir(&workdir).with_context(|| {
|
||||
format!(
|
||||
"Failed to set application's current dir to '{}'",
|
||||
workdir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Ensure the config is valid, even if just init-ing
|
||||
let mut conf = params.try_into_config()?;
|
||||
let mut conf = params.try_into_config().with_context(|| {
|
||||
format!(
|
||||
"Pageserver config at '{}' is not valid",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
conf.daemonize = arg_matches.is_present("daemonize");
|
||||
|
||||
if init && conf.daemonize {
|
||||
eprintln!("--daemonize cannot be used with --init");
|
||||
exit(1);
|
||||
bail!("--daemonize cannot be used with --init")
|
||||
}
|
||||
|
||||
// The configuration is all set up now. Turn it into a 'static
|
||||
@@ -383,16 +428,21 @@ fn main() -> Result<()> {
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if init {
|
||||
branches::init_pageserver(conf, create_tenant)?;
|
||||
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
|
||||
// write the config file
|
||||
let cfg_file_contents = toml::to_string_pretty(¶ms)?;
|
||||
let cfg_file_contents = toml::to_string_pretty(¶ms)
|
||||
.context("Failed to create pageserver config contents for initialisation")?;
|
||||
// TODO support enable-auth flag
|
||||
std::fs::write(&cfg_file_path, cfg_file_contents)?;
|
||||
|
||||
return Ok(());
|
||||
std::fs::write(&cfg_file_path, cfg_file_contents).with_context(|| {
|
||||
format!(
|
||||
"Failed to initialize pageserver config at '{}'",
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
start_pageserver(conf).context("Failed to start pageserver")
|
||||
}
|
||||
|
||||
start_pageserver(conf)
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
@@ -430,16 +480,20 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
|
||||
match daemonize.start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(e) => error!("Error, {}", e),
|
||||
Err(e) => error!("could not daemonize: {:#}", e),
|
||||
}
|
||||
}
|
||||
|
||||
// keep join handles for spawned threads
|
||||
// don't spawn threads before daemonizing
|
||||
let mut join_handles = Vec::new();
|
||||
|
||||
if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? {
|
||||
join_handles.push(handle);
|
||||
}
|
||||
// Initialize tenant manager.
|
||||
tenant_mgr::init(conf);
|
||||
|
||||
// keep join handles for spawned threads
|
||||
let mut join_handles = vec![];
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust | AuthType::MD5 => None,
|
||||
|
||||
@@ -17,6 +17,7 @@ use std::{
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use log::*;
|
||||
use zenith_utils::crashsafe_dir;
|
||||
use zenith_utils::logging;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
@@ -118,7 +119,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
|
||||
println!("initializing tenantid {}", tenantid);
|
||||
create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
|
||||
}
|
||||
fs::create_dir_all(conf.tenants_path())?;
|
||||
crashsafe_dir::create_dir_all(conf.tenants_path())?;
|
||||
|
||||
println!("pageserver init succeeded");
|
||||
Ok(())
|
||||
@@ -135,12 +136,12 @@ pub fn create_repo(
|
||||
}
|
||||
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
fs::create_dir_all(&repo_dir)
|
||||
crashsafe_dir::create_dir_all(&repo_dir)
|
||||
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
|
||||
|
||||
fs::create_dir(conf.timelines_path(&tenantid))?;
|
||||
fs::create_dir_all(conf.branches_path(&tenantid))?;
|
||||
fs::create_dir_all(conf.tags_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
|
||||
|
||||
info!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
@@ -150,12 +151,13 @@ pub fn create_repo(
|
||||
conf,
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
false,
|
||||
));
|
||||
|
||||
// Load data into pageserver
|
||||
// TODO To implement zenith import we need to
|
||||
// move data loading out of create_repo()
|
||||
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
|
||||
bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?;
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
@@ -221,11 +223,11 @@ fn bootstrap_timeline(
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
let timeline = repo.create_empty_timeline(tli)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
|
||||
|
||||
let wal_dir = pgdata_path.join("pg_wal");
|
||||
restore_local_repo::import_timeline_wal(&wal_dir, &*timeline, lsn)?;
|
||||
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
timeline.as_ref(),
|
||||
lsn,
|
||||
)?;
|
||||
timeline.checkpoint()?;
|
||||
|
||||
println!(
|
||||
@@ -417,7 +419,6 @@ fn create_timeline(
|
||||
let timelinedir = conf.timeline_path(&timelineid, tenantid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
|
||||
@@ -22,20 +22,21 @@ use serde::{Deserialize, Serialize};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::fs::File;
|
||||
use std::convert::TryInto;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{fs, thread};
|
||||
|
||||
use crate::layered_repository::inmemory_layer::FreezeLayers;
|
||||
use crate::relish::*;
|
||||
use crate::relish_storage::storage_uploader::QueueBasedRelishUploader;
|
||||
use crate::relish_storage::schedule_timeline_upload;
|
||||
use crate::repository::{GcResult, Repository, Timeline, WALRecord};
|
||||
use crate::restore_local_repo::import_timeline_wal;
|
||||
use crate::walreceiver::IS_WAL_RECEIVER;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
@@ -45,6 +46,7 @@ use zenith_metrics::{
|
||||
};
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::crashsafe_dir;
|
||||
use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
|
||||
use zenith_utils::seqwait::SeqWait;
|
||||
|
||||
@@ -73,6 +75,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
static TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
// Taken from PG_CONTROL_MAX_SAFE_SIZE
|
||||
const METADATA_MAX_SAFE_SIZE: usize = 512;
|
||||
const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
|
||||
const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
lazy_static! {
|
||||
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
|
||||
@@ -112,7 +119,9 @@ pub struct LayeredRepository {
|
||||
timelines: Mutex<HashMap<ZTimelineId, Arc<LayeredTimeline>>>,
|
||||
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
relish_uploader: Option<Arc<QueueBasedRelishUploader>>,
|
||||
/// Makes evey repo's timelines to backup their files to remote storage,
|
||||
/// when they get frozen.
|
||||
upload_relishes: bool,
|
||||
}
|
||||
|
||||
/// Public interface
|
||||
@@ -127,7 +136,7 @@ impl Repository for LayeredRepository {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
// Create the timeline directory, and write initial metadata to file.
|
||||
std::fs::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;
|
||||
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn(0),
|
||||
@@ -135,7 +144,7 @@ impl Repository for LayeredRepository {
|
||||
ancestor_timeline: None,
|
||||
ancestor_lsn: Lsn(0),
|
||||
};
|
||||
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata)?;
|
||||
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
|
||||
|
||||
let timeline = LayeredTimeline::new(
|
||||
self.conf,
|
||||
@@ -144,8 +153,8 @@ impl Repository for LayeredRepository {
|
||||
timelineid,
|
||||
self.tenantid,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
self.relish_uploader.as_ref().map(Arc::clone),
|
||||
0,
|
||||
false,
|
||||
)?;
|
||||
|
||||
let timeline_rc = Arc::new(timeline);
|
||||
@@ -179,8 +188,8 @@ impl Repository for LayeredRepository {
|
||||
ancestor_timeline: Some(src),
|
||||
ancestor_lsn: start_lsn,
|
||||
};
|
||||
std::fs::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
|
||||
Self::save_metadata(self.conf, dst, self.tenantid, &metadata)?;
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
|
||||
Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
|
||||
|
||||
info!("branched timeline {} from {} at {}", dst, src, start_lsn);
|
||||
|
||||
@@ -217,6 +226,7 @@ impl LayeredRepository {
|
||||
Some(timeline) => Ok(timeline.clone()),
|
||||
None => {
|
||||
let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?;
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn;
|
||||
|
||||
// Recurse to look up the ancestor timeline.
|
||||
//
|
||||
@@ -236,35 +246,17 @@ impl LayeredRepository {
|
||||
timelineid,
|
||||
self.tenantid,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
self.relish_uploader.as_ref().map(Arc::clone),
|
||||
0, // init with 0 and update after layers are loaded
|
||||
0, // init with 0 and update after layers are loaded,
|
||||
self.upload_relishes,
|
||||
)?;
|
||||
|
||||
// List the layers on disk, and load them into the layer map
|
||||
timeline.load_layer_map()?;
|
||||
timeline.load_layer_map(disk_consistent_lsn)?;
|
||||
|
||||
// needs to be after load_layer_map
|
||||
timeline.init_current_logical_size()?;
|
||||
|
||||
let timeline = Arc::new(timeline);
|
||||
|
||||
// Load any new WAL after the last checkpoint into memory.
|
||||
info!(
|
||||
"Loading WAL for timeline {} starting at {}",
|
||||
timelineid,
|
||||
timeline.get_last_record_lsn()
|
||||
);
|
||||
let wal_dir = self
|
||||
.conf
|
||||
.timeline_path(&timelineid, &self.tenantid)
|
||||
.join("wal");
|
||||
import_timeline_wal(&wal_dir, timeline.as_ref(), timeline.get_last_record_lsn())?;
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// check again after wal loading
|
||||
Self::assert_size_calculation_matches_offloaded(Arc::clone(&timeline));
|
||||
}
|
||||
|
||||
timelines.insert(timelineid, timeline.clone());
|
||||
Ok(timeline)
|
||||
}
|
||||
@@ -275,15 +267,14 @@ impl LayeredRepository {
|
||||
conf: &'static PageServerConf,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
tenantid: ZTenantId,
|
||||
upload_relishes: bool,
|
||||
) -> LayeredRepository {
|
||||
LayeredRepository {
|
||||
tenantid,
|
||||
conf,
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
walredo_mgr,
|
||||
relish_uploader: conf.relish_storage_config.as_ref().map(|config| {
|
||||
Arc::new(QueueBasedRelishUploader::new(config, &conf.workdir).unwrap())
|
||||
}),
|
||||
upload_relishes,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -358,13 +349,36 @@ impl LayeredRepository {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
data: &TimelineMetadata,
|
||||
first_save: bool,
|
||||
) -> Result<PathBuf> {
|
||||
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
|
||||
let mut file = File::create(&path)?;
|
||||
let timeline_path = conf.timeline_path(&timelineid, &tenantid);
|
||||
let path = timeline_path.join("metadata");
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(first_save)
|
||||
.open(&path)?;
|
||||
|
||||
info!("saving metadata {}", path.display());
|
||||
|
||||
file.write_all(&TimelineMetadata::ser(data)?)?;
|
||||
let mut metadata_bytes = TimelineMetadata::ser(data)?;
|
||||
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
|
||||
if file.write(&metadata_bytes)? != metadata_bytes.len() {
|
||||
bail!("Could not write all the metadata bytes in a single call");
|
||||
}
|
||||
file.sync_all()?;
|
||||
|
||||
// fsync the parent directory to ensure the directory entry is durable
|
||||
if first_save {
|
||||
let timeline_dir = File::open(&timeline_path)?;
|
||||
timeline_dir.sync_all()?;
|
||||
}
|
||||
|
||||
Ok(path)
|
||||
}
|
||||
@@ -375,9 +389,18 @@ impl LayeredRepository {
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<TimelineMetadata> {
|
||||
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
|
||||
let data = std::fs::read(&path)?;
|
||||
let metadata_bytes = std::fs::read(&path)?;
|
||||
ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
|
||||
|
||||
let data = TimelineMetadata::des(&data)?;
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(calculated_checksum == expected_checksum);
|
||||
|
||||
let data = TimelineMetadata::des_prefix(data)?;
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
@@ -499,24 +522,6 @@ impl LayeredRepository {
|
||||
totals.elapsed = now.elapsed();
|
||||
Ok(totals)
|
||||
}
|
||||
|
||||
fn assert_size_calculation_matches(incremental: usize, timeline: &LayeredTimeline) {
|
||||
match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) {
|
||||
Ok(non_incremental) => {
|
||||
if incremental != non_incremental {
|
||||
error!("timeline size calculation diverged, incremental doesn't match non incremental. incremental={} non_incremental={}", incremental, non_incremental);
|
||||
}
|
||||
}
|
||||
Err(e) => error!("failed to calculate non incremental timeline size: {}", e),
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_size_calculation_matches_offloaded(timeline: Arc<LayeredTimeline>) {
|
||||
let incremental = timeline.get_current_logical_size();
|
||||
thread::spawn(move || {
|
||||
Self::assert_size_calculation_matches(incremental, &timeline);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
@@ -553,8 +558,6 @@ pub struct LayeredTimeline {
|
||||
// WAL redo manager
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
|
||||
|
||||
relish_uploader: Option<Arc<QueueBasedRelishUploader>>,
|
||||
|
||||
// What page versions do we hold in the repository? If we get a
|
||||
// request > last_record_lsn, we need to wait until we receive all
|
||||
// the WAL up to the request. The SeqWait provides functions for
|
||||
@@ -602,6 +605,9 @@ pub struct LayeredTimeline {
|
||||
// TODO: it is possible to combine these two fields into single one using custom metric which uses SeqCst
|
||||
// ordering for its operations, but involves private modules, and macro trickery
|
||||
current_logical_size_gauge: IntGauge,
|
||||
|
||||
/// If `true`, will backup its timeline files to remote storage after freezing.
|
||||
upload_relishes: bool,
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
@@ -609,8 +615,11 @@ impl Timeline for LayeredTimeline {
|
||||
/// Wait until WAL has been received up to the given LSN.
|
||||
fn wait_lsn(&self, lsn: Lsn) -> Result<()> {
|
||||
// This should never be called from the WAL receiver thread, because that could lead
|
||||
// to a deadlock. FIXME: Is there a less hacky way to check that?
|
||||
assert_ne!(thread::current().name(), Some("WAL receiver thread"));
|
||||
// to a deadlock.
|
||||
assert!(
|
||||
!IS_WAL_RECEIVER.with(|c| c.get()),
|
||||
"wait_lsn called by WAL receiver thread"
|
||||
);
|
||||
|
||||
self.last_record_lsn
|
||||
.wait_for_timeout(lsn, TIMEOUT)
|
||||
@@ -982,8 +991,8 @@ impl LayeredTimeline {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
relish_uploader: Option<Arc<QueueBasedRelishUploader>>,
|
||||
current_logical_size: usize,
|
||||
upload_relishes: bool,
|
||||
) -> Result<LayeredTimeline> {
|
||||
let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE
|
||||
.get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
|
||||
@@ -995,7 +1004,6 @@ impl LayeredTimeline {
|
||||
layers: Mutex::new(LayerMap::default()),
|
||||
|
||||
walredo_mgr,
|
||||
relish_uploader,
|
||||
|
||||
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
|
||||
last_record_lsn: SeqWait::new(RecordLsn {
|
||||
@@ -1008,6 +1016,7 @@ impl LayeredTimeline {
|
||||
ancestor_lsn: metadata.ancestor_lsn,
|
||||
current_logical_size: AtomicUsize::new(current_logical_size),
|
||||
current_logical_size_gauge,
|
||||
upload_relishes,
|
||||
};
|
||||
Ok(timeline)
|
||||
}
|
||||
@@ -1015,7 +1024,7 @@ impl LayeredTimeline {
|
||||
///
|
||||
/// Scan the timeline directory to populate the layer map
|
||||
///
|
||||
fn load_layer_map(&self) -> anyhow::Result<()> {
|
||||
fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
info!(
|
||||
"loading layer map for timeline {} into memory",
|
||||
self.timelineid
|
||||
@@ -1024,8 +1033,20 @@ impl LayeredTimeline {
|
||||
let (imgfilenames, mut deltafilenames) =
|
||||
filename::list_files(self.conf, self.timelineid, self.tenantid)?;
|
||||
|
||||
let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid);
|
||||
|
||||
// First create ImageLayer structs for each image file.
|
||||
for filename in imgfilenames.iter() {
|
||||
if filename.lsn > disk_consistent_lsn {
|
||||
warn!(
|
||||
"found future image layer {} on timeline {}",
|
||||
filename, self.timelineid
|
||||
);
|
||||
|
||||
rename_to_backup(timeline_path.join(filename.to_string()))?;
|
||||
continue;
|
||||
}
|
||||
|
||||
let layer = ImageLayer::new(self.conf, self.timelineid, self.tenantid, filename);
|
||||
|
||||
info!(
|
||||
@@ -1043,6 +1064,17 @@ impl LayeredTimeline {
|
||||
deltafilenames.sort();
|
||||
|
||||
for filename in deltafilenames.iter() {
|
||||
ensure!(filename.start_lsn < filename.end_lsn);
|
||||
if filename.end_lsn > disk_consistent_lsn {
|
||||
warn!(
|
||||
"found future delta layer {} on timeline {}",
|
||||
filename, self.timelineid
|
||||
);
|
||||
|
||||
rename_to_backup(timeline_path.join(filename.to_string()))?;
|
||||
continue;
|
||||
}
|
||||
|
||||
let predecessor = layers.get(&filename.seg, filename.start_lsn);
|
||||
|
||||
let predecessor_str: String = if let Some(prec) = &predecessor {
|
||||
@@ -1190,13 +1222,12 @@ impl LayeredTimeline {
|
||||
assert!(lsn.is_aligned());
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
if lsn <= last_record_lsn {
|
||||
panic!(
|
||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
|
||||
lsn,
|
||||
last_record_lsn
|
||||
);
|
||||
}
|
||||
assert!(
|
||||
lsn > last_record_lsn,
|
||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
|
||||
lsn,
|
||||
last_record_lsn,
|
||||
);
|
||||
|
||||
// Do we have a layer open for writing already?
|
||||
let layer;
|
||||
@@ -1316,6 +1347,8 @@ impl LayeredTimeline {
|
||||
last_record_lsn
|
||||
);
|
||||
|
||||
let timeline_dir = File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?;
|
||||
|
||||
// Take the in-memory layer with the oldest WAL record. If it's older
|
||||
// than the threshold, write it out to disk as a new image and delta file.
|
||||
// Repeat until all remaining in-memory layers are within the threshold.
|
||||
@@ -1327,6 +1360,8 @@ impl LayeredTimeline {
|
||||
// a lot of memory and/or aren't receiving much updates anymore.
|
||||
let mut disk_consistent_lsn = last_record_lsn;
|
||||
|
||||
let mut created_historics = false;
|
||||
|
||||
while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() {
|
||||
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();
|
||||
|
||||
@@ -1379,10 +1414,9 @@ impl LayeredTimeline {
|
||||
drop(layers);
|
||||
let new_historics = frozen.write_to_disk(self)?;
|
||||
layers = self.layers.lock().unwrap();
|
||||
if let Some(relish_uploader) = &self.relish_uploader {
|
||||
for label_path in new_historics.iter().filter_map(|layer| layer.path()) {
|
||||
relish_uploader.schedule_upload(self.timelineid, label_path);
|
||||
}
|
||||
|
||||
if !new_historics.is_empty() {
|
||||
created_historics = true;
|
||||
}
|
||||
|
||||
// Finally, replace the frozen in-memory layer with the new on-disk layers
|
||||
@@ -1414,6 +1448,14 @@ impl LayeredTimeline {
|
||||
layer.unload()?;
|
||||
}
|
||||
|
||||
drop(layers);
|
||||
|
||||
if created_historics {
|
||||
// We must fsync the timeline dir to ensure the directory entries for
|
||||
// new layer files are durable
|
||||
timeline_dir.sync_all()?;
|
||||
}
|
||||
|
||||
// Save the metadata, with updated 'disk_consistent_lsn', to a
|
||||
// file in the timeline dir. After crash, we will restart WAL
|
||||
// streaming and processing from that point.
|
||||
@@ -1438,10 +1480,23 @@ impl LayeredTimeline {
|
||||
ancestor_timeline: ancestor_timelineid,
|
||||
ancestor_lsn: self.ancestor_lsn,
|
||||
};
|
||||
let metadata_path =
|
||||
LayeredRepository::save_metadata(self.conf, self.timelineid, self.tenantid, &metadata)?;
|
||||
if let Some(relish_uploader) = &self.relish_uploader {
|
||||
relish_uploader.schedule_upload(self.timelineid, metadata_path);
|
||||
let _metadata_path = LayeredRepository::save_metadata(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&metadata,
|
||||
false,
|
||||
)?;
|
||||
if self.upload_relishes {
|
||||
schedule_timeline_upload(())
|
||||
// schedule_timeline_upload(LocalTimeline {
|
||||
// tenant_id: self.tenantid,
|
||||
// timeline_id: self.timelineid,
|
||||
// metadata_path,
|
||||
// image_layers: image_layer_uploads,
|
||||
// delta_layers: delta_layer_uploads,
|
||||
// disk_consistent_lsn,
|
||||
// });
|
||||
}
|
||||
|
||||
// Also update the in-memory copy
|
||||
@@ -1874,3 +1929,23 @@ fn layer_ptr_eq(l1: &dyn Layer, l2: &dyn Layer) -> bool {
|
||||
// see here for more https://github.com/rust-lang/rust/issues/46139
|
||||
std::ptr::eq(l1_ptr as *const (), l2_ptr as *const ())
|
||||
}
|
||||
|
||||
/// Add a suffix to a layer file's name: .{num}.old
|
||||
/// Uses the first available num (starts at 0)
|
||||
fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
||||
let filename = path.file_name().unwrap().to_str().unwrap();
|
||||
let mut new_path = path.clone();
|
||||
|
||||
for i in 0u32.. {
|
||||
new_path.set_file_name(format!("{}.{}.old", filename, i));
|
||||
if !new_path.exists() {
|
||||
std::fs::rename(&path, &new_path)?;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow!(
|
||||
"couldn't find an unused backup number for {:?}",
|
||||
path
|
||||
))
|
||||
}
|
||||
|
||||
@@ -42,12 +42,10 @@ use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
|
||||
};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
@@ -109,12 +107,6 @@ impl From<&DeltaLayer> for Summary {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PageVersionMeta {
|
||||
page_image_range: Option<BlobRange>,
|
||||
record_range: Option<BlobRange>,
|
||||
}
|
||||
|
||||
///
|
||||
/// DeltaLayer is the in-memory data structure associated with an
|
||||
/// on-disk delta file. We keep a DeltaLayer in memory for each
|
||||
@@ -152,7 +144,7 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: BTreeMap<(u32, Lsn), PageVersionMeta>,
|
||||
page_version_metas: BTreeMap<(u32, Lsn), BlobRange>,
|
||||
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
@@ -229,15 +221,15 @@ impl Layer for DeltaLayer {
|
||||
let mut iter = inner
|
||||
.page_version_metas
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
|
||||
if let Some(img_range) = &entry.page_image_range {
|
||||
while let Some(((_blknum, _entry_lsn), blob_range)) = iter.next_back() {
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
if let Some(img) = pv.page_image {
|
||||
// Found a page image, return it
|
||||
let img = Bytes::from(read_blob(&page_version_reader, img_range)?);
|
||||
reconstruct_data.page_img = Some(img);
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec_range) = &entry.record_range {
|
||||
let rec = WALRecord::des(&read_blob(&page_version_reader, rec_range)?)?;
|
||||
} else if let Some(rec) = pv.record {
|
||||
let will_init = rec.will_init;
|
||||
reconstruct_data.records.push(rec);
|
||||
if will_init {
|
||||
@@ -340,16 +332,16 @@ impl Layer for DeltaLayer {
|
||||
println!("--- page versions ---");
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for (k, v) in inner.page_version_metas.iter() {
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.iter() {
|
||||
let mut desc = String::new();
|
||||
|
||||
if let Some(page_image_range) = v.page_image_range.as_ref() {
|
||||
let image = read_blob(&chapter, page_image_range)?;
|
||||
write!(&mut desc, " img {} bytes", image.len())?;
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
|
||||
if let Some(img) = pv.page_image.as_ref() {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
if let Some(record_range) = v.record_range.as_ref() {
|
||||
let record_bytes = read_blob(&chapter, record_range)?;
|
||||
let rec = WALRecord::des(&record_bytes)?;
|
||||
if let Some(rec) = pv.record.as_ref() {
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
@@ -359,7 +351,7 @@ impl Layer for DeltaLayer {
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
println!(" blk {} at {}: {}", k.0, k.1, desc);
|
||||
println!(" blk {} at {}: {}", blk, lsn, desc);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -381,14 +373,15 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new delta file, using the given btreemaps containing the page versions and
|
||||
/// relsizes.
|
||||
/// Create a new delta file, using the given page versions and relsizes.
|
||||
/// The page versions are passed by an iterator; the iterator must return
|
||||
/// page versions in blknum+lsn order.
|
||||
///
|
||||
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create(
|
||||
pub fn create<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
@@ -397,7 +390,7 @@ impl DeltaLayer {
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
page_versions: impl Iterator<Item = (&'a (u32, Lsn), &'a PageVersion)>,
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
) -> Result<DeltaLayer> {
|
||||
let delta_layer = DeltaLayer {
|
||||
@@ -431,26 +424,10 @@ impl DeltaLayer {
|
||||
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
for (key, page_version) in page_versions {
|
||||
let page_image_range = page_version
|
||||
.page_image
|
||||
.map(|page_image| page_version_writer.write_blob(page_image.as_ref()))
|
||||
.transpose()?;
|
||||
let buf = PageVersion::ser(page_version)?;
|
||||
let blob_range = page_version_writer.write_blob(&buf)?;
|
||||
|
||||
let record_range = page_version
|
||||
.record
|
||||
.map(|record| {
|
||||
let buf = WALRecord::ser(&record)?;
|
||||
page_version_writer.write_blob(&buf)
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
let old = inner.page_version_metas.insert(
|
||||
key,
|
||||
PageVersionMeta {
|
||||
page_image_range,
|
||||
record_range,
|
||||
},
|
||||
);
|
||||
let old = inner.page_version_metas.insert(*key, blob_range);
|
||||
|
||||
assert!(old.is_none());
|
||||
}
|
||||
@@ -484,7 +461,8 @@ impl DeltaLayer {
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
book.close()?;
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
|
||||
@@ -290,7 +290,7 @@ pub fn list_files(
|
||||
deltafiles.push(deltafilename);
|
||||
} else if let Some(imgfilename) = ImageFileName::from_str(fname) {
|
||||
imgfiles.push(imgfilename);
|
||||
} else if fname == "wal" || fname == "metadata" || fname == "ancestor" {
|
||||
} else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
|
||||
@@ -337,7 +337,8 @@ impl ImageLayer {
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
book.close()?;
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ use std::cmp::min;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use zenith_utils::accum::Accum;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -44,7 +44,7 @@ pub struct InMemoryLayer {
|
||||
|
||||
/// The above fields never change. The parts that do change are in 'inner',
|
||||
/// and protected by mutex.
|
||||
inner: Mutex<InMemoryLayerInner>,
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
@@ -95,7 +95,7 @@ impl Layer for InMemoryLayer {
|
||||
// An in-memory layer doesn't really have a filename as it's not stored on disk,
|
||||
// but we construct a filename as if it was a delta layer
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn;
|
||||
let dropped;
|
||||
@@ -139,7 +139,7 @@ impl Layer for InMemoryLayer {
|
||||
return Lsn(end_lsn.0 + 1);
|
||||
}
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
drop_lsn
|
||||
@@ -149,7 +149,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.drop_lsn.is_some()
|
||||
}
|
||||
|
||||
@@ -167,7 +167,7 @@ impl Layer for InMemoryLayer {
|
||||
let predecessor: Option<Arc<dyn Layer>>;
|
||||
|
||||
{
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Scan the BTreeMap backwards, starting from reconstruct_data.lsn.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
@@ -214,13 +214,13 @@ impl Layer for InMemoryLayer {
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
Ok(inner.get_seg_size(lsn))
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// If the segment created after requested LSN,
|
||||
// it doesn't exist in the layer. But we shouldn't
|
||||
@@ -252,13 +252,13 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.predecessor.is_some()
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
.drop_lsn
|
||||
@@ -340,7 +340,7 @@ impl InMemoryLayer {
|
||||
start_lsn,
|
||||
end_lsn: None,
|
||||
oldest_pending_lsn,
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
segsizes: BTreeMap::new(),
|
||||
@@ -389,7 +389,7 @@ impl InMemoryLayer {
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.check_writeable()?;
|
||||
|
||||
@@ -407,7 +407,7 @@ impl InMemoryLayer {
|
||||
if self.seg.rel.is_blocky() {
|
||||
let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1;
|
||||
|
||||
// use inner get_seg_size, since calling self.get_seg_size will try to acquire self.inner.lock
|
||||
// use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
|
||||
// which we've just acquired above
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
if newsize > oldsize {
|
||||
@@ -460,7 +460,7 @@ impl InMemoryLayer {
|
||||
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> {
|
||||
self.assert_not_frozen();
|
||||
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.check_writeable()?;
|
||||
|
||||
// check that this we truncate to a smaller size than segment was before the truncation
|
||||
@@ -481,7 +481,7 @@ impl InMemoryLayer {
|
||||
pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> {
|
||||
self.assert_not_frozen();
|
||||
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.check_writeable()?;
|
||||
|
||||
@@ -533,7 +533,7 @@ impl InMemoryLayer {
|
||||
start_lsn,
|
||||
end_lsn: None,
|
||||
oldest_pending_lsn,
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
segsizes,
|
||||
@@ -544,7 +544,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
pub fn is_writeable(&self) -> bool {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.writeable
|
||||
}
|
||||
|
||||
@@ -564,7 +564,7 @@ impl InMemoryLayer {
|
||||
self.assert_not_frozen();
|
||||
|
||||
let self_ref = self.clone();
|
||||
let mut inner = self_ref.inner.lock().unwrap();
|
||||
let mut inner = self_ref.inner.write().unwrap();
|
||||
// Dropped layers don't need any special freeze actions,
|
||||
// they are marked as non-writeable at drop and just
|
||||
// written out to disk by checkpointer.
|
||||
@@ -620,7 +620,7 @@ impl InMemoryLayer {
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: Some(cutoff_lsn),
|
||||
oldest_pending_lsn: self.start_lsn,
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
drop_lsn: inner.drop_lsn,
|
||||
page_versions: before_page_versions,
|
||||
segsizes: before_segsizes,
|
||||
@@ -667,40 +667,55 @@ impl InMemoryLayer {
|
||||
self.get_end_lsn()
|
||||
);
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let drop_lsn = inner.drop_lsn;
|
||||
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to aquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
// though: another thread might have grabbed a reference to this layer
|
||||
// in `get_layer_for_write' just before the checkpointer called
|
||||
// `freeze`, and then `write_to_disk` on it. When the thread gets the
|
||||
// lock, it will see that it's not writeable anymore and retry, but it
|
||||
// would have to wait until we release it. That race condition is very
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
assert!(!inner.writeable);
|
||||
|
||||
let end_lsn = match drop_lsn {
|
||||
Some(dlsn) => dlsn,
|
||||
None => self.end_lsn.unwrap(),
|
||||
};
|
||||
|
||||
let predecessor = inner.predecessor.clone();
|
||||
|
||||
let mut before_page_versions;
|
||||
let mut before_segsizes;
|
||||
if inner.drop_lsn.is_none() {
|
||||
before_segsizes = BTreeMap::new();
|
||||
for (lsn, size) in inner.segsizes.iter() {
|
||||
if *lsn <= end_lsn {
|
||||
before_segsizes.insert(*lsn, *size);
|
||||
}
|
||||
}
|
||||
|
||||
before_page_versions = BTreeMap::new();
|
||||
for ((blknum, lsn), pv) in inner.page_versions.iter() {
|
||||
if *lsn < end_lsn {
|
||||
before_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
before_page_versions = inner.page_versions.clone();
|
||||
before_segsizes = inner.segsizes.clone();
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
let delta_layer = DeltaLayer::create(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
drop_lsn,
|
||||
true,
|
||||
predecessor,
|
||||
inner.page_versions.iter(),
|
||||
inner.segsizes.clone(),
|
||||
)?;
|
||||
trace!(
|
||||
"freeze: created delta layer for dropped segment {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
drop_lsn
|
||||
);
|
||||
return Ok(vec![Arc::new(delta_layer)]);
|
||||
}
|
||||
|
||||
drop(inner);
|
||||
let end_lsn = self.end_lsn.unwrap();
|
||||
|
||||
let mut before_segsizes = BTreeMap::new();
|
||||
for (lsn, size) in inner.segsizes.iter() {
|
||||
if *lsn <= end_lsn {
|
||||
before_segsizes.insert(*lsn, *size);
|
||||
}
|
||||
}
|
||||
let mut before_page_versions = inner.page_versions.iter().filter(|tup| {
|
||||
let ((_blknum, lsn), _pv) = tup;
|
||||
|
||||
*lsn < end_lsn
|
||||
});
|
||||
|
||||
let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();
|
||||
|
||||
@@ -713,7 +728,7 @@ impl InMemoryLayer {
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn,
|
||||
drop_lsn.is_some(),
|
||||
false,
|
||||
predecessor,
|
||||
before_page_versions,
|
||||
before_segsizes,
|
||||
@@ -726,21 +741,21 @@ impl InMemoryLayer {
|
||||
end_lsn
|
||||
);
|
||||
} else {
|
||||
assert!(before_page_versions.is_empty());
|
||||
assert!(before_page_versions.next().is_none());
|
||||
}
|
||||
|
||||
if drop_lsn.is_none() {
|
||||
// Write a new base image layer at the cutoff point
|
||||
let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
|
||||
frozen_layers.push(Arc::new(image_layer));
|
||||
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
|
||||
}
|
||||
drop(inner);
|
||||
|
||||
// Write a new base image layer at the cutoff point
|
||||
let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
|
||||
frozen_layers.push(Arc::new(image_layer));
|
||||
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
|
||||
|
||||
Ok(frozen_layers)
|
||||
}
|
||||
|
||||
pub fn update_predecessor(&self, predecessor: Arc<dyn Layer>) -> Option<Arc<dyn Layer>> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.predecessor.replace(predecessor)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,11 +133,7 @@ where
|
||||
found_start_point = true;
|
||||
// It is an error to insert the same item to the tree twice.
|
||||
assert!(
|
||||
point
|
||||
.elements
|
||||
.iter()
|
||||
.find(|x| Arc::ptr_eq(x, &item))
|
||||
.is_none(),
|
||||
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
|
||||
"interval is already in the tree"
|
||||
);
|
||||
}
|
||||
@@ -180,7 +176,7 @@ where
|
||||
found_start_point = true;
|
||||
}
|
||||
let len_before = point.elements.len();
|
||||
point.elements.retain(|other| !Arc::ptr_eq(other, &item));
|
||||
point.elements.retain(|other| !Arc::ptr_eq(other, item));
|
||||
let len_after = point.elements.len();
|
||||
assert_eq!(len_after + 1, len_before);
|
||||
if len_after == 0 {
|
||||
@@ -287,6 +283,7 @@ mod tests {
|
||||
write!(f, "{}", self.val)
|
||||
}
|
||||
}
|
||||
#[rustfmt::skip]
|
||||
fn assert_search(
|
||||
tree: &IntervalTree<MockItem>,
|
||||
key: u32,
|
||||
@@ -295,24 +292,20 @@ mod tests {
|
||||
if let Some(v) = tree.search(key) {
|
||||
let vstr = v.to_string();
|
||||
|
||||
if expected.is_empty() {
|
||||
panic!("search with {} returned {}, expected None", key, v);
|
||||
}
|
||||
assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
|
||||
assert!(
|
||||
expected.contains(&vstr.as_str()),
|
||||
"search with {} returned {}, expected one of: {:?}",
|
||||
key, v, expected,
|
||||
);
|
||||
|
||||
if !expected.contains(&vstr.as_str()) {
|
||||
panic!(
|
||||
"search with {} returned {}, expected one of: {:?}",
|
||||
key, v, expected
|
||||
);
|
||||
}
|
||||
Some(v)
|
||||
} else {
|
||||
if !expected.is_empty() {
|
||||
panic!(
|
||||
"search with {} returned None, expected one of {:?}",
|
||||
key, expected
|
||||
);
|
||||
}
|
||||
assert!(
|
||||
expected.is_empty(),
|
||||
"search with {} returned None, expected one of {:?}",
|
||||
key, expected
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ pub mod http;
|
||||
pub mod layered_repository;
|
||||
pub mod page_service;
|
||||
pub mod relish;
|
||||
mod relish_storage;
|
||||
pub mod relish_storage;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tenant_mgr;
|
||||
@@ -22,23 +22,25 @@ pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
|
||||
pub mod defaults {
|
||||
use const_format::formatcp;
|
||||
use std::time::Duration;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = "127.0.0.1:64000"; // can't format! const yet...
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = "127.0.0.1:9898";
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(100);
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -124,10 +126,6 @@ impl PageServerConf {
|
||||
self.timeline_path(timelineid, tenantid).join("ancestor")
|
||||
}
|
||||
|
||||
fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timeline_path(timelineid, tenantid).join("wal")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
@@ -153,8 +151,8 @@ impl PageServerConf {
|
||||
checkpoint_period: Duration::from_secs(10),
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
listen_pg_addr: "127.0.0.1:5430".to_string(),
|
||||
listen_http_addr: "127.0.0.1:9898".to_string(),
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
superuser: "zenith_admin".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: "".into(),
|
||||
@@ -167,18 +165,37 @@ impl PageServerConf {
|
||||
|
||||
/// External relish storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum RelishStorageConfig {
|
||||
/// Root folder to place all stored relish data into.
|
||||
pub struct RelishStorageConfig {
|
||||
/// Limits the number of concurrent sync operations between pageserver and relish storage.
|
||||
pub max_concurrent_sync: usize,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RelishStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a relish storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum RelishStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored relish data into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all relishes into the root
|
||||
/// of the S3 bucket from the config.
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// "Login" to use when connecting to bucket.
|
||||
/// Can be empty for cases like AWS k8s IAM
|
||||
/// where we can allow certain pods to connect
|
||||
/// to the bucket directly without any credentials.
|
||||
pub access_key_id: Option<String>,
|
||||
/// "Password" to use when connecting to bucket.
|
||||
pub secret_access_key: Option<String>,
|
||||
}
|
||||
|
||||
|
||||
@@ -194,7 +194,7 @@ pub fn thread_main(
|
||||
let local_auth = auth.clone();
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
|
||||
error!("error: {}", err);
|
||||
error!("page server thread exiting with error: {:#}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -293,7 +293,9 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
error!("error reading relation or page version: {}", e);
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough
|
||||
error!("error reading relation or page version: {:#}", e);
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
|
||||
@@ -8,14 +8,43 @@
|
||||
|
||||
mod local_fs;
|
||||
mod rust_s3;
|
||||
/// A queue and the background machinery behind it to upload
|
||||
/// local page server layer files to external storage.
|
||||
pub mod storage_uploader;
|
||||
/// A queue-based storage with the background machinery behind it to synchronize
|
||||
/// local page server layer files with external storage.
|
||||
mod synced_storage;
|
||||
|
||||
use std::path::Path;
|
||||
use std::thread;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use self::local_fs::LocalFs;
|
||||
pub use self::synced_storage::schedule_timeline_upload;
|
||||
use crate::relish_storage::rust_s3::RustS3;
|
||||
use crate::{PageServerConf, RelishStorageKind};
|
||||
|
||||
pub fn run_storage_sync_thread(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
match &config.relish_storage_config {
|
||||
Some(relish_storage_config) => {
|
||||
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
|
||||
match &relish_storage_config.storage {
|
||||
RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
|
||||
config,
|
||||
LocalFs::new(root.clone())?,
|
||||
max_concurrent_sync,
|
||||
),
|
||||
RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
|
||||
config,
|
||||
RustS3::new(s3_config)?,
|
||||
max_concurrent_sync,
|
||||
),
|
||||
}
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RelishStorage: Send + Sync {
|
||||
|
||||
@@ -1,116 +0,0 @@
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
path::{Path, PathBuf},
|
||||
sync::{Arc, Mutex},
|
||||
thread,
|
||||
};
|
||||
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use crate::{relish_storage::RelishStorage, RelishStorageConfig};
|
||||
|
||||
use super::{local_fs::LocalFs, rust_s3::RustS3};
|
||||
|
||||
pub struct QueueBasedRelishUploader {
|
||||
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
|
||||
}
|
||||
|
||||
impl QueueBasedRelishUploader {
|
||||
pub fn new(
|
||||
config: &RelishStorageConfig,
|
||||
page_server_workdir: &'static Path,
|
||||
) -> anyhow::Result<Self> {
|
||||
let upload_queue = Arc::new(Mutex::new(VecDeque::new()));
|
||||
let _handle = match config {
|
||||
RelishStorageConfig::LocalFs(root) => {
|
||||
let relish_storage = LocalFs::new(root.clone())?;
|
||||
create_upload_thread(
|
||||
Arc::clone(&upload_queue),
|
||||
relish_storage,
|
||||
page_server_workdir,
|
||||
)?
|
||||
}
|
||||
RelishStorageConfig::AwsS3(s3_config) => {
|
||||
let relish_storage = RustS3::new(s3_config)?;
|
||||
create_upload_thread(
|
||||
Arc::clone(&upload_queue),
|
||||
relish_storage,
|
||||
page_server_workdir,
|
||||
)?
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self { upload_queue })
|
||||
}
|
||||
|
||||
pub fn schedule_upload(&self, timeline_id: ZTimelineId, relish_path: PathBuf) {
|
||||
self.upload_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.push_back((timeline_id, relish_path))
|
||||
}
|
||||
}
|
||||
|
||||
fn create_upload_thread<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
|
||||
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
|
||||
relish_storage: S,
|
||||
page_server_workdir: &'static Path,
|
||||
) -> std::io::Result<thread::JoinHandle<()>> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
thread::Builder::new()
|
||||
.name("Queue based relish uploader".to_string())
|
||||
.spawn(move || loop {
|
||||
runtime.block_on(async {
|
||||
upload_loop_step(&upload_queue, &relish_storage, page_server_workdir).await;
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload_loop_step<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
|
||||
upload_queue: &Mutex<VecDeque<(ZTimelineId, PathBuf)>>,
|
||||
relish_storage: &S,
|
||||
page_server_workdir: &Path,
|
||||
) {
|
||||
let mut queue_accessor = upload_queue.lock().unwrap();
|
||||
log::debug!("current upload queue length: {}", queue_accessor.len());
|
||||
let next_upload = queue_accessor.pop_front();
|
||||
drop(queue_accessor);
|
||||
|
||||
let (relish_timeline_id, relish_local_path) = match next_upload {
|
||||
Some(data) => data,
|
||||
None => {
|
||||
// Don't spin and allow others to use the queue.
|
||||
// In future, could be improved to be more clever about delays depending on relish upload stats
|
||||
thread::sleep(std::time::Duration::from_secs(1));
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = upload_relish(relish_storage, page_server_workdir, &relish_local_path).await {
|
||||
log::error!(
|
||||
"Failed to upload relish '{}' for timeline {}, reason: {}",
|
||||
relish_local_path.display(),
|
||||
relish_timeline_id,
|
||||
e
|
||||
);
|
||||
upload_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.push_back((relish_timeline_id, relish_local_path))
|
||||
} else {
|
||||
log::debug!("Relish successfully uploaded");
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish<P, S: RelishStorage<RelishStoragePath = P>>(
|
||||
relish_storage: &S,
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
let destination = S::derive_destination(page_server_workdir, relish_local_path)?;
|
||||
relish_storage
|
||||
.upload_relish(relish_local_path, &destination)
|
||||
.await
|
||||
}
|
||||
52
pageserver/src/relish_storage/synced_storage.rs
Normal file
52
pageserver/src/relish_storage/synced_storage.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
use std::time::Duration;
|
||||
use std::{collections::BinaryHeap, sync::Mutex, thread};
|
||||
|
||||
use crate::{relish_storage::RelishStorage, PageServerConf};
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
|
||||
}
|
||||
|
||||
pub fn schedule_timeline_upload(_local_timeline: ()) {
|
||||
// UPLOAD_QUEUE
|
||||
// .lock()
|
||||
// .unwrap()
|
||||
// .push(SyncTask::Upload(local_timeline))
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum SyncTask {}
|
||||
|
||||
pub fn run_storage_sync_thread<
|
||||
P: std::fmt::Debug,
|
||||
S: 'static + RelishStorage<RelishStoragePath = P>,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
relish_storage: S,
|
||||
max_concurrent_sync: usize,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name("Queue based relish storage sync".to_string())
|
||||
.spawn(move || loop {
|
||||
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
|
||||
log::debug!("Upload queue length: {}", queue_accessor.len());
|
||||
let next_task = queue_accessor.pop();
|
||||
drop(queue_accessor);
|
||||
match next_task {
|
||||
Some(task) => runtime.block_on(async {
|
||||
// suppress warnings
|
||||
let _ = (config, task, &relish_storage, max_concurrent_sync);
|
||||
todo!("omitted for brevity")
|
||||
}),
|
||||
None => {
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
})?;
|
||||
Ok(Some(handle))
|
||||
}
|
||||
@@ -212,12 +212,18 @@ mod tests {
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use hex_literal::hex;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::path::PathBuf;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
@@ -253,39 +259,53 @@ mod tests {
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
|
||||
|
||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||
struct RepoHarness {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
}
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
let tenantid = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
|
||||
impl RepoHarness {
|
||||
fn create(test_name: &'static str) -> Result<Self> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let repo = Box::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
|
||||
Ok(repo)
|
||||
Ok(Self { conf, tenant_id })
|
||||
}
|
||||
|
||||
fn load(&self) -> Box<dyn Repository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
Box::new(LayeredRepository::new(
|
||||
self.conf,
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
false,
|
||||
))
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
let repo = RepoHarness::create("test_relsize")?.load();
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
@@ -399,11 +419,10 @@ mod tests {
|
||||
// and then created it again within the same layer.
|
||||
#[test]
|
||||
fn test_drop_extend() -> Result<()> {
|
||||
let repo = get_test_repo("test_drop_extend")?;
|
||||
let repo = RepoHarness::create("test_drop_extend")?.load();
|
||||
|
||||
// Create timeline to work on
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x20));
|
||||
@@ -436,11 +455,10 @@ mod tests {
|
||||
// and then extended it again within the same layer.
|
||||
#[test]
|
||||
fn test_truncate_extend() -> Result<()> {
|
||||
let repo = get_test_repo("test_truncate_extend")?;
|
||||
let repo = RepoHarness::create("test_truncate_extend")?.load();
|
||||
|
||||
// Create timeline to work on
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
|
||||
//from storage_layer.rs
|
||||
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
|
||||
@@ -537,9 +555,8 @@ mod tests {
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
let repo = RepoHarness::create("test_large_rel")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
|
||||
let mut lsn = 0x10;
|
||||
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
@@ -600,9 +617,8 @@ mod tests {
|
||||
///
|
||||
#[test]
|
||||
fn test_list_rels_drop() -> Result<()> {
|
||||
let repo = get_test_repo("test_list_rels_drop")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
let repo = RepoHarness::create("test_list_rels_drop")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
const TESTDB: u32 = 111;
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
@@ -620,9 +636,8 @@ mod tests {
|
||||
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));
|
||||
|
||||
// Create a branch, check that the relation is visible there
|
||||
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
|
||||
assert!(newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x30))?
|
||||
@@ -642,7 +657,7 @@ mod tests {
|
||||
|
||||
// Run checkpoint and garbage collection and check that it's still not visible
|
||||
newtline.checkpoint()?;
|
||||
repo.gc_iteration(Some(newtimelineid), 0, true)?;
|
||||
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
|
||||
|
||||
assert!(!newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x40))?
|
||||
@@ -656,9 +671,8 @@ mod tests {
|
||||
///
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
let repo = RepoHarness::create("test_branch")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
@@ -676,9 +690,8 @@ mod tests {
|
||||
assert_current_logical_size(&tline, Lsn(0x40));
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
|
||||
newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
newtline.advance_last_record_lsn(Lsn(0x40));
|
||||
@@ -706,8 +719,89 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
drop(repo);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join("metadata");
|
||||
|
||||
assert!(metadata_path.is_file());
|
||||
|
||||
let mut metadata_bytes = std::fs::read(&metadata_path)?;
|
||||
assert_eq!(metadata_bytes.len(), 512);
|
||||
metadata_bytes[512 - 4 - 2] ^= 1;
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
|
||||
assert!(err.to_string().contains("checksum"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn future_layerfiles() -> Result<()> {
|
||||
const TEST_NAME: &str = "future_layerfiles";
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
drop(repo);
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let make_empty_file = |filename: &str| -> std::io::Result<()> {
|
||||
let path = timeline_path.join(filename);
|
||||
|
||||
assert!(!path.exists());
|
||||
std::fs::write(&path, &[])?;
|
||||
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let image_filename = format!("pg_control_0_{:016X}", 8000);
|
||||
let delta_filename = format!("pg_control_0_{:016X}_{:016X}", 8000, 8008);
|
||||
|
||||
make_empty_file(&image_filename)?;
|
||||
make_empty_file(&delta_filename)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
let check_old = |filename: &str, num: u32| {
|
||||
let path = timeline_path.join(filename);
|
||||
assert!(!path.exists());
|
||||
|
||||
let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
|
||||
assert!(backup_path.exists());
|
||||
};
|
||||
|
||||
check_old(&image_filename, 0);
|
||||
check_old(&delta_filename, 0);
|
||||
|
||||
make_empty_file(&image_filename)?;
|
||||
make_empty_file(&delta_filename)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
check_old(&image_filename, 0);
|
||||
check_old(&delta_filename, 0);
|
||||
check_old(&image_filename, 1);
|
||||
check_old(&delta_filename, 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager {}
|
||||
struct TestRedoManager;
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
|
||||
@@ -9,11 +9,9 @@ use std::cmp::min;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::io::Seek;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use crate::relish::*;
|
||||
@@ -175,8 +173,7 @@ fn import_relfile(
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
bail!("error reading file {}: {:#}", path.display(), e);
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -270,8 +267,7 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
bail!("error reading file {}: {:#}", path.display(), e);
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -283,100 +279,6 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory
|
||||
/// and load all records >= 'startpoint' into the repository.
|
||||
pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint: Lsn) -> Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn(RelishTag::Checkpoint, 0, startpoint)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
//Read local file
|
||||
let mut path = walpath.join(&filename);
|
||||
|
||||
// It could be as .partial
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path = walpath.join(filename + ".partial");
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let open_result = File::open(&path);
|
||||
if let Err(e) = &open_result {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let mut file = open_result?;
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
}
|
||||
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
|
||||
// Maybe allow this for .partial files?
|
||||
error!("read only {} bytes from WAL file", nread);
|
||||
}
|
||||
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
loop {
|
||||
let rec = waldecoder.poll_decode();
|
||||
if rec.is_err() {
|
||||
// Assume that an error means we've reached the end of
|
||||
// a partial WAL record. So that's ok.
|
||||
trace!("WAL decoder error {:?}", rec);
|
||||
break;
|
||||
}
|
||||
if let Some((lsn, recdata)) = rec.unwrap() {
|
||||
// The previous record has been handled, let the repository know that
|
||||
// it is up-to-date to this LSN. (We do this here on the "next" iteration,
|
||||
// rather than right after the save_decoded_record, because at the end of
|
||||
// the WAL, we will also need to perform the update of the checkpoint data
|
||||
// with the same LSN as the last actual record.)
|
||||
timeline.advance_last_record_lsn(last_lsn);
|
||||
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
save_decoded_record(&mut checkpoint, timeline, &decoded, recdata, lsn)?;
|
||||
last_lsn = lsn;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
nrecords += 1;
|
||||
}
|
||||
|
||||
info!("imported {} records up to {}", nrecords, last_lsn);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
if last_lsn != startpoint {
|
||||
info!(
|
||||
"reached end of WAL at {}, updating checkpoint info",
|
||||
last_lsn
|
||||
);
|
||||
let checkpoint_bytes = checkpoint.encode();
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, last_lsn, checkpoint_bytes)?;
|
||||
|
||||
timeline.advance_last_record_lsn(last_lsn);
|
||||
} else {
|
||||
info!("no WAL to import at {}", last_lsn);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
|
||||
/// relations/pages that the record affects.
|
||||
|
||||
@@ -9,46 +9,84 @@ use crate::PageServerConf;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::info;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
lazy_static! {
|
||||
pub static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
|
||||
static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
fn access_repository() -> MutexGuard<'static, HashMap<ZTenantId, Arc<dyn Repository>>> {
|
||||
REPOSITORY.lock().unwrap()
|
||||
}
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = access_repository();
|
||||
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
||||
let tenantid =
|
||||
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
||||
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
LayeredRepository::launch_gc_thread(conf, repo.clone());
|
||||
|
||||
let repo = init_repo(conf, tenantid);
|
||||
info!("initialized storage for tenant: {}", &tenantid);
|
||||
m.insert(tenantid, repo);
|
||||
}
|
||||
}
|
||||
|
||||
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc<LayeredRepository> {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
true,
|
||||
));
|
||||
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
LayeredRepository::launch_gc_thread(conf, repo.clone());
|
||||
repo
|
||||
}
|
||||
|
||||
// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
|
||||
// Relevant PR: https://github.com/zenithdb/zenith/pull/686
|
||||
pub fn register_relish_download(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) {
|
||||
log::info!(
|
||||
"Registering new download, tenant id {}, timeline id: {}",
|
||||
tenant_id,
|
||||
timeline_id
|
||||
);
|
||||
match access_repository().entry(tenant_id) {
|
||||
Entry::Occupied(o) => init_timeline(o.get().as_ref(), timeline_id),
|
||||
Entry::Vacant(v) => {
|
||||
log::info!("New repo initialized");
|
||||
let new_repo = init_repo(conf, tenant_id);
|
||||
init_timeline(new_repo.as_ref(), timeline_id);
|
||||
v.insert(new_repo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
|
||||
match repo.get_timeline(timeline_id) {
|
||||
Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
|
||||
Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_repository_for_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<()> {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
let mut m = access_repository();
|
||||
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
@@ -62,15 +100,10 @@ pub fn create_repository_for_tenant(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
|
||||
let o = &mut REPOSITORY.lock().unwrap();
|
||||
o.insert(tenantid, repo);
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
let o = &REPOSITORY.lock().unwrap();
|
||||
o.get(&tenantid)
|
||||
.map(|repo| Arc::clone(repo))
|
||||
access_repository()
|
||||
.get(&tenantid)
|
||||
.map(Arc::clone)
|
||||
.ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid))
|
||||
}
|
||||
|
||||
|
||||
@@ -10,25 +10,22 @@ use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{Error, Result};
|
||||
use anyhow::{bail, Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::cmp::{max, min};
|
||||
use std::cell::Cell;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::thread_local;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
@@ -46,6 +43,13 @@ lazy_static! {
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
// Boolean that is true only for WAL receiver threads
|
||||
//
|
||||
// This is used in `wait_lsn` to guard against usage that might lead to a deadlock.
|
||||
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -66,12 +70,10 @@ pub fn launch_wal_receiver(
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
//
|
||||
// NOTE: This thread name is checked in the assertion in wait_lsn. If you change
|
||||
// this, make sure you update the assertion too.
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
IS_WAL_RECEIVER.with(|c| c.set(true));
|
||||
thread_main(conf, timelineid, tenantid);
|
||||
})
|
||||
.unwrap();
|
||||
@@ -120,7 +122,7 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
conf: &PageServerConf,
|
||||
_conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
tenantid: ZTenantId,
|
||||
@@ -160,7 +162,7 @@ fn walreceiver_main(
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
if startpoint == Lsn(0) {
|
||||
error!("No previous WAL position");
|
||||
bail!("No previous WAL position");
|
||||
}
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
@@ -190,16 +192,6 @@ fn walreceiver_main(
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
let prev_last_rec_lsn = last_rec_lsn;
|
||||
|
||||
write_wal_file(
|
||||
conf,
|
||||
startlsn,
|
||||
&timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
data,
|
||||
&tenantid,
|
||||
)?;
|
||||
|
||||
trace!("received XLogData between {} and {}", startlsn, endlsn);
|
||||
|
||||
@@ -240,34 +232,6 @@ fn walreceiver_main(
|
||||
last_rec_lsn = lsn;
|
||||
}
|
||||
|
||||
// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
|
||||
// "checkpoint" the repository to flush all the changes from WAL we've processed
|
||||
// so far to disk. After this, we don't need the original WAL anymore, and it
|
||||
// can be removed. This is probably too aggressive for production, but it's useful
|
||||
// to expose bugs now.
|
||||
//
|
||||
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
|
||||
// and we might it for logical decoding other things in the future. Although
|
||||
// we should also be able to fetch it back from the WAL safekeepers or S3 if
|
||||
// needed.
|
||||
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
{
|
||||
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
|
||||
let (oldest_segno, newest_segno) = find_wal_file_range(
|
||||
conf,
|
||||
&timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
last_rec_lsn,
|
||||
&tenantid,
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
caught_up = true;
|
||||
@@ -289,7 +253,7 @@ fn walreceiver_main(
|
||||
);
|
||||
|
||||
if reply_requested {
|
||||
Some(timeline.get_last_record_lsn())
|
||||
Some(last_rec_lsn)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -313,47 +277,6 @@ fn walreceiver_main(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_wal_file_range(
|
||||
conf: &PageServerConf,
|
||||
timeline: &ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
written_upto: Lsn,
|
||||
tenant: &ZTenantId,
|
||||
) -> Result<(u64, u64)> {
|
||||
let written_upto_segno = written_upto.segment_number(wal_seg_size);
|
||||
|
||||
let mut oldest_segno = written_upto_segno;
|
||||
let mut newest_segno = written_upto_segno;
|
||||
// Scan the wal directory, and count how many WAL filed we could remove
|
||||
let wal_dir = conf.wal_dir_path(timeline, tenant);
|
||||
for entry in fs::read_dir(wal_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let filename = path.file_name().unwrap().to_str().unwrap();
|
||||
|
||||
if IsXLogFileName(filename) {
|
||||
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
|
||||
|
||||
if segno > written_upto_segno {
|
||||
// that's strange.
|
||||
warn!("there is a WAL file from future at {}", path.display());
|
||||
continue;
|
||||
}
|
||||
|
||||
oldest_segno = min(oldest_segno, segno);
|
||||
newest_segno = max(newest_segno, segno);
|
||||
}
|
||||
}
|
||||
// FIXME: would be good to assert that there are no gaps in the WAL files
|
||||
|
||||
Ok((oldest_segno, newest_segno))
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
@@ -403,98 +326,3 @@ pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
|
||||
Err(IdentifyError.into())
|
||||
}
|
||||
}
|
||||
|
||||
fn write_wal_file(
|
||||
conf: &PageServerConf,
|
||||
startpos: Lsn,
|
||||
timelineid: &ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
buf: &[u8],
|
||||
tenantid: &ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
let wal_dir = conf.wal_dir_path(timelineid, tenantid);
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size);
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
|
||||
/*
|
||||
* If crossing a WAL boundary, only write up until we reach wal
|
||||
* segment size.
|
||||
*/
|
||||
if xlogoff + bytes_left > wal_seg_size {
|
||||
bytes_to_write = wal_seg_size - xlogoff;
|
||||
} else {
|
||||
bytes_to_write = bytes_left;
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
wal_seg_size,
|
||||
);
|
||||
let wal_file_path = wal_dir.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = wal_dir.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
let mut wal_file: File;
|
||||
/* Try to open already completed segment */
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
|
||||
wal_file = file;
|
||||
partial = false;
|
||||
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
|
||||
/* Try to open existed partial file */
|
||||
wal_file = file;
|
||||
partial = true;
|
||||
} else {
|
||||
/* Create and fill new partial file */
|
||||
partial = true;
|
||||
match OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
|
||||
|
||||
// FIXME: Flush the file
|
||||
//wal_file.sync_all()?;
|
||||
}
|
||||
/* Write was successful, advance our position */
|
||||
bytes_written += bytes_to_write;
|
||||
bytes_left -= bytes_to_write;
|
||||
start_pos += bytes_to_write as u64;
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if start_pos.segment_offset(wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -298,9 +298,11 @@ impl PostgresRedoManager {
|
||||
// Transaction manager stuff
|
||||
let rec_segno = match rel {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
if slru != SlruKind::Clog {
|
||||
panic!("Not valid XACT relish tag {:?}", rel);
|
||||
}
|
||||
assert!(
|
||||
slru == SlruKind::Clog,
|
||||
"Not valid XACT relish tag {:?}",
|
||||
rel
|
||||
);
|
||||
segno
|
||||
}
|
||||
_ => panic!("Not valid XACT relish tag {:?}", rel),
|
||||
@@ -420,7 +422,7 @@ impl PostgresRedoManager {
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
error!("could not apply WAL records: {:#}", e);
|
||||
result = Err(WalRedoError::IoError(e));
|
||||
} else {
|
||||
let img = apply_result.unwrap();
|
||||
@@ -458,7 +460,7 @@ impl PostgresRedoProcess {
|
||||
if datadir.exists() {
|
||||
info!("directory {:?} exists, removing", &datadir);
|
||||
if let Err(e) = fs::remove_dir_all(&datadir) {
|
||||
error!("could not remove old wal-redo-datadir: {:?}", e);
|
||||
error!("could not remove old wal-redo-datadir: {:#}", e);
|
||||
}
|
||||
}
|
||||
info!("running initdb in {:?}", datadir.display());
|
||||
@@ -544,8 +546,9 @@ impl PostgresRedoProcess {
|
||||
base_img: Option<Bytes>,
|
||||
records: &[WALRecord],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
let stdin = &mut self.stdin;
|
||||
let stdout = &mut self.stdout;
|
||||
// Buffer the writes to avoid a lot of small syscalls.
|
||||
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
|
||||
@@ -18,5 +18,6 @@ tokio = "1.11"
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
clap = "2.33.0"
|
||||
rustls = "0.19.1"
|
||||
reqwest = { version = "0.11", features = ["blocking", "json"] }
|
||||
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
@@ -1,17 +1,14 @@
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{IpAddr, SocketAddr},
|
||||
};
|
||||
use std::net::{SocketAddr, ToSocketAddrs};
|
||||
|
||||
pub struct CPlaneApi {
|
||||
// address: SocketAddr,
|
||||
auth_endpoint: &'static str,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: IpAddr, // TODO: allow host name here too
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
@@ -19,8 +16,13 @@ pub struct DatabaseInfo {
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> SocketAddr {
|
||||
SocketAddr::new(self.host, self.port)
|
||||
pub fn socket_addr(&self) -> Result<SocketAddr> {
|
||||
let host_port = format!("{}:{}", self.host, self.port);
|
||||
host_port
|
||||
.to_socket_addrs()
|
||||
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
|
||||
}
|
||||
|
||||
pub fn conn_string(&self) -> String {
|
||||
@@ -31,62 +33,35 @@ impl DatabaseInfo {
|
||||
}
|
||||
}
|
||||
|
||||
// mock cplane api
|
||||
impl CPlaneApi {
|
||||
pub fn new(_address: &SocketAddr) -> CPlaneApi {
|
||||
CPlaneApi {
|
||||
// address: address.clone(),
|
||||
}
|
||||
pub fn new(auth_endpoint: &'static str) -> CPlaneApi {
|
||||
CPlaneApi { auth_endpoint }
|
||||
}
|
||||
|
||||
pub fn check_auth(&self, user: &str, md5_response: &[u8], salt: &[u8; 4]) -> Result<()> {
|
||||
// passwords for both is "mypass"
|
||||
let auth_map: HashMap<_, &str> = vec![
|
||||
("stas@zenith", "716ee6e1c4a9364d66285452c47402b1"),
|
||||
("stas2@zenith", "3996f75df64c16a8bfaf01301b61d582"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
pub fn authenticate_proxy_request(
|
||||
&self,
|
||||
user: &str,
|
||||
database: &str,
|
||||
md5_response: &[u8],
|
||||
salt: &[u8; 4],
|
||||
) -> Result<DatabaseInfo> {
|
||||
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
|
||||
url.query_pairs_mut()
|
||||
.append_pair("login", user)
|
||||
.append_pair("database", database)
|
||||
.append_pair("md5response", std::str::from_utf8(md5_response)?)
|
||||
.append_pair("salt", &hex::encode(salt));
|
||||
|
||||
let stored_hash = auth_map
|
||||
.get(&user)
|
||||
.ok_or_else(|| anyhow::Error::msg("user not found"))?;
|
||||
let salted_stored_hash = format!(
|
||||
"md5{:x}",
|
||||
md5::compute([stored_hash.as_bytes(), salt].concat())
|
||||
);
|
||||
println!("cplane request: {}", url.as_str());
|
||||
|
||||
let received_hash = std::str::from_utf8(md5_response)?;
|
||||
let resp = reqwest::blocking::get(url)?;
|
||||
|
||||
println!(
|
||||
"auth: {} rh={} sh={} ssh={} {:?}",
|
||||
user, received_hash, stored_hash, salted_stored_hash, salt
|
||||
);
|
||||
|
||||
if received_hash == salted_stored_hash {
|
||||
Ok(())
|
||||
if resp.status().is_success() {
|
||||
let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got conn info: #{:?}", conn_info);
|
||||
Ok(conn_info)
|
||||
} else {
|
||||
bail!("Auth failed")
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_database_uri(&self, _user: &str, _database: &str) -> Result<DatabaseInfo> {
|
||||
Ok(DatabaseInfo {
|
||||
host: "127.0.0.1".parse()?,
|
||||
port: 5432,
|
||||
dbname: "stas".to_string(),
|
||||
user: "stas".to_string(),
|
||||
password: "mypass".to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
// pub fn create_database(&self, _user: &String, _database: &String) -> Result<DatabaseInfo> {
|
||||
// Ok(DatabaseInfo {
|
||||
// host: "127.0.0.1".parse()?,
|
||||
// port: 5432,
|
||||
// dbname: "stas".to_string(),
|
||||
// user: "stas".to_string(),
|
||||
// password: "mypass".to_string(),
|
||||
// })
|
||||
// }
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ pub struct ProxyConf {
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub cplane_address: SocketAddr,
|
||||
pub auth_endpoint: String,
|
||||
|
||||
pub ssl_config: Option<Arc<ServerConfig>>,
|
||||
}
|
||||
@@ -56,8 +56,7 @@ fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerCo
|
||||
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
|
||||
let mut keys = pemfile::rsa_private_keys(&mut &key_bytes[..])
|
||||
.or_else(|_| pemfile::pkcs8_private_keys(&mut &key_bytes[..]))
|
||||
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().unwrap()
|
||||
@@ -102,6 +101,14 @@ fn main() -> anyhow::Result<()> {
|
||||
.help("redirect unauthenticated users to given uri")
|
||||
.default_value("http://localhost:3000/psql_session/"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("auth-endpoint")
|
||||
.short("a")
|
||||
.long("auth-endpoint")
|
||||
.takes_value(true)
|
||||
.help("redirect unauthenticated users to given uri")
|
||||
.default_value("http://localhost:3000/authenticate_proxy_request/"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("ssl-key")
|
||||
.short("k")
|
||||
@@ -122,7 +129,7 @@ fn main() -> anyhow::Result<()> {
|
||||
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
|
||||
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
|
||||
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
|
||||
cplane_address: "127.0.0.1:3000".parse()?,
|
||||
auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
|
||||
ssl_config: configure_ssl(&arg_matches)?,
|
||||
};
|
||||
let state = ProxyState {
|
||||
|
||||
@@ -49,7 +49,7 @@ struct MgmtHandler {
|
||||
// "host": "127.0.0.1",
|
||||
// "port": 5432,
|
||||
// "dbname": "stas",
|
||||
// "user": "stas"
|
||||
// "user": "stas",
|
||||
// "password": "mypass"
|
||||
// }
|
||||
// }
|
||||
@@ -60,6 +60,9 @@ struct MgmtHandler {
|
||||
// "Failure": "oops"
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // to test manually by sending a query to mgmt interface:
|
||||
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
|
||||
#[derive(Deserialize)]
|
||||
pub struct PsqlSessionResponse {
|
||||
session_id: String,
|
||||
@@ -78,34 +81,47 @@ impl postgres_backend::Handler for MgmtHandler {
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
let res = try_process_query(self, pgb, query_string);
|
||||
// intercept and log error message
|
||||
if res.is_err() {
|
||||
println!("Mgmt query failed: #{:?}", res);
|
||||
}
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
fn try_process_query(
|
||||
mgmt: &mut MgmtHandler,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
|
||||
let waiters = self.state.waiters.lock().unwrap();
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
let waiters = mgmt.state.waiters.lock().unwrap();
|
||||
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
}
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,7 +57,7 @@ pub fn proxy_conn_main(
|
||||
) -> anyhow::Result<()> {
|
||||
let mut conn = ProxyConnection {
|
||||
state,
|
||||
cplane: CPlaneApi::new(&state.conf.cplane_address),
|
||||
cplane: CPlaneApi::new(&state.conf.auth_endpoint),
|
||||
user: "".into(),
|
||||
database: "".into(),
|
||||
pgb: PostgresBackend::new(
|
||||
@@ -80,6 +80,8 @@ pub fn proxy_conn_main(
|
||||
conn.handle_new_user()?
|
||||
};
|
||||
|
||||
// XXX: move that inside handle_new_user/handle_existing_user to be able to
|
||||
// report wrong connection error.
|
||||
proxy_pass(conn.pgb, db_info)
|
||||
}
|
||||
|
||||
@@ -172,21 +174,31 @@ impl ProxyConnection {
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
|
||||
|
||||
if let Err(e) = self.check_auth_md5(md5_response) {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
bail!("auth failed: {}", e);
|
||||
} else {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
}
|
||||
match self.cplane.authenticate_proxy_request(
|
||||
self.user.as_str(),
|
||||
self.database.as_str(),
|
||||
md5_response,
|
||||
&self.md5_salt,
|
||||
) {
|
||||
Err(e) => {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
|
||||
// ok, we are authorized
|
||||
self.cplane.get_database_uri(&self.user, &self.database)
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
Ok(conn_info) => {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(conn_info)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bail!("protocol violation");
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
@@ -232,17 +244,11 @@ databases without opening the browser.
|
||||
|
||||
Ok(dbinfo)
|
||||
}
|
||||
|
||||
fn check_auth_md5(&self, md5_response: &[u8]) -> anyhow::Result<()> {
|
||||
assert!(self.is_existing_user());
|
||||
self.cplane
|
||||
.check_auth(self.user.as_str(), md5_response, &self.md5_salt)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
|
||||
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
|
||||
let _ = config.connect_raw(&mut socket, NoTls).await?;
|
||||
Ok(socket)
|
||||
@@ -273,7 +279,9 @@ fn proxy(
|
||||
|
||||
/// Proxy a client connection to a postgres database
|
||||
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread().build()?;
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let db_stream = runtime.block_on(connect_to_db(db_info))?;
|
||||
let db_stream = db_stream.into_std()?;
|
||||
db_stream.set_nonblocking(false)?;
|
||||
|
||||
@@ -33,11 +33,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
FROM generate_series(1, 200000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
print('LSN after 200100 rows: ' + lsn_b)
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
@@ -46,15 +46,15 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
FROM generate_series(1, 200000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
print('LSN after 400100 rows: ' + lsn_c)
|
||||
|
||||
# Branch at the point where only 200 rows were inserted
|
||||
# Branch at the point where only 200100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
@@ -70,11 +70,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
more_pg_conn = pg_more.connect()
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (100100, )
|
||||
assert more_cur.fetchone() == (200100, )
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (200100, )
|
||||
assert main_cur.fetchone() == (400100, )
|
||||
|
||||
# Check bad lsn's for branching
|
||||
|
||||
|
||||
@@ -9,10 +9,7 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Test restarting and recreating a postgres instance
|
||||
#
|
||||
# XXX: with_wal_acceptors=True fails now, would be fixed with
|
||||
# `postgres --sync-walkeepers` patches.
|
||||
#
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False])
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
|
||||
def test_restart_compute(
|
||||
zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
import pytest
|
||||
import random
|
||||
import time
|
||||
import os
|
||||
import subprocess
|
||||
import uuid
|
||||
|
||||
from contextlib import closing
|
||||
from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin
|
||||
from fixtures.utils import lsn_to_hex, mkdir_if_needed
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -198,3 +202,92 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
|
||||
|
||||
stop_value.value = 1
|
||||
proc.join()
|
||||
|
||||
class ProposerPostgres:
|
||||
"""Object for running safekeepers sync with walproposer"""
|
||||
def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str):
|
||||
self.pgdata_dir: str = pgdata_dir
|
||||
self.pg_bin: PgBin = pg_bin
|
||||
self.timeline_id: str = timeline_id
|
||||
self.tenant_id: str = tenant_id
|
||||
|
||||
def pg_data_dir_path(self) -> str:
|
||||
""" Path to data directory """
|
||||
return self.pgdata_dir
|
||||
|
||||
def config_file_path(self) -> str:
|
||||
""" Path to postgresql.conf """
|
||||
return os.path.join(self.pgdata_dir, 'postgresql.conf')
|
||||
|
||||
def create_dir_config(self, wal_acceptors: str):
|
||||
""" Create dir and config for running --sync-safekeepers """
|
||||
|
||||
mkdir_if_needed(self.pg_data_dir_path())
|
||||
with open(self.config_file_path(), "w") as f:
|
||||
f.writelines([
|
||||
"synchronous_standby_names = 'walproposer'\n",
|
||||
f"zenith.zenith_timeline = '{self.timeline_id}'\n",
|
||||
f"zenith.zenith_tenant = '{self.tenant_id}'\n",
|
||||
f"wal_acceptors = '{wal_acceptors}'\n",
|
||||
])
|
||||
|
||||
def sync_safekeepers(self) -> str:
|
||||
"""
|
||||
Run 'postgres --sync-safekeepers'.
|
||||
Returns execution result, which is commit_lsn after sync.
|
||||
"""
|
||||
|
||||
command = ["postgres", "--sync-safekeepers"]
|
||||
env = {
|
||||
"PGDATA": self.pg_data_dir_path(),
|
||||
}
|
||||
|
||||
basepath = self.pg_bin.run_capture(command, env)
|
||||
stdout_filename = basepath + '.stdout'
|
||||
|
||||
with open(stdout_filename, 'r') as stdout_f:
|
||||
stdout = stdout_f.read()
|
||||
return stdout.strip("\n ")
|
||||
|
||||
|
||||
# insert wal in all safekeepers and run sync on proposer
|
||||
def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorFactory):
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
timeline_id = uuid.uuid4().hex
|
||||
tenant_id = uuid.uuid4().hex
|
||||
|
||||
# write config for proposer
|
||||
pgdata_dir = os.path.join(repo_dir, "proposer_pgdata")
|
||||
pg = ProposerPostgres(pgdata_dir, pg_bin, timeline_id, tenant_id)
|
||||
pg.create_dir_config(wa_factory.get_connstrs())
|
||||
|
||||
# valid lsn, which is not in the segment start, nor in zero segment
|
||||
epoch_start_lsn = 0x16B9188 # 0/16B9188
|
||||
begin_lsn = epoch_start_lsn
|
||||
|
||||
# append and commit WAL
|
||||
lsn_after_append = []
|
||||
for i in range(3):
|
||||
res = wa_factory.instances[i].append_logical_message(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
{
|
||||
"lm_prefix": "prefix",
|
||||
"lm_message": "message",
|
||||
"set_commit_lsn": True,
|
||||
"term": 2,
|
||||
"begin_lsn": begin_lsn,
|
||||
"epoch_start_lsn": epoch_start_lsn,
|
||||
"truncate_lsn": epoch_start_lsn,
|
||||
},
|
||||
)
|
||||
lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"])
|
||||
lsn_after_append.append(lsn_hex)
|
||||
print(f"safekeeper[{i}] lsn after append: {lsn_hex}")
|
||||
|
||||
# run sync safekeepers
|
||||
lsn_after_sync = pg.sync_safekeepers()
|
||||
print(f"lsn after sync = {lsn_after_sync}")
|
||||
|
||||
assert all(lsn_after_sync == lsn for lsn in lsn_after_append)
|
||||
|
||||
@@ -33,11 +33,16 @@ class BankClient(object):
|
||||
row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs')
|
||||
assert row['sum'] == self.n_accounts * self.init_amount
|
||||
|
||||
async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount):
|
||||
async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount, large_wal=False):
|
||||
# avoid deadlocks by sorting uids
|
||||
if from_uid > to_uid:
|
||||
from_uid, to_uid, amount = to_uid, from_uid, -amount
|
||||
|
||||
if large_wal:
|
||||
# record with size about 128kb
|
||||
await conn.execute("SELECT pg_logical_emit_message(false, 'hello', REPEAT('abacaba', 19000))")
|
||||
return
|
||||
|
||||
async with conn.transaction():
|
||||
await conn.execute(
|
||||
'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2',
|
||||
@@ -72,7 +77,7 @@ class WorkerStats(object):
|
||||
print('All workers made {} transactions'.format(progress))
|
||||
|
||||
|
||||
async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer):
|
||||
async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer, large_wal=False):
|
||||
pg_conn = await pg.connect_async()
|
||||
debug_print('Started worker {}'.format(worker_id))
|
||||
|
||||
@@ -81,7 +86,7 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou
|
||||
to_uid = (from_uid + random.randint(1, n_accounts - 1)) % n_accounts
|
||||
amount = random.randint(1, max_transfer)
|
||||
|
||||
await bank_transfer(pg_conn, from_uid, to_uid, amount)
|
||||
await bank_transfer(pg_conn, from_uid, to_uid, amount, large_wal=large_wal)
|
||||
stats.inc_progress(worker_id)
|
||||
|
||||
debug_print('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid))
|
||||
@@ -95,12 +100,10 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou
|
||||
# On each iteration 1 acceptor is stopped, and 2 others should allow
|
||||
# background workers execute transactions. In the end, state should remain
|
||||
# consistent.
|
||||
async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_workers=10):
|
||||
async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_workers=10, period_time=10, iterations=6, large_wal=False, normal_work_sleep=0):
|
||||
n_accounts = 100
|
||||
init_amount = 100000
|
||||
max_transfer = 100
|
||||
period_time = 10
|
||||
iterations = 6
|
||||
|
||||
pg_conn = await pg.connect_async()
|
||||
bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount)
|
||||
@@ -110,7 +113,7 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
|
||||
stats = WorkerStats(n_workers)
|
||||
workers = []
|
||||
for worker_id in range(n_workers):
|
||||
worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer)
|
||||
worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer, large_wal=large_wal)
|
||||
workers.append(asyncio.create_task(worker))
|
||||
|
||||
|
||||
@@ -118,9 +121,14 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
|
||||
victim = acceptors[it % len(acceptors)]
|
||||
victim.stop()
|
||||
|
||||
# wait for transactions that could have started and finished before
|
||||
# victim acceptor was stopped
|
||||
await asyncio.sleep(1)
|
||||
# Wait till previous victim recovers so it is ready for the next
|
||||
# iteration by making any writing xact.
|
||||
conn = await pg.connect_async()
|
||||
await conn.execute(
|
||||
'UPDATE bank_accs SET amount = amount WHERE uid = 1',
|
||||
timeout=120
|
||||
)
|
||||
await conn.close()
|
||||
|
||||
stats.reset()
|
||||
await asyncio.sleep(period_time)
|
||||
@@ -128,6 +136,11 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
|
||||
stats.check_progress()
|
||||
|
||||
victim.start()
|
||||
|
||||
# sleep to sync all safekepeers together
|
||||
if normal_work_sleep > 0:
|
||||
await asyncio.sleep(normal_work_sleep)
|
||||
|
||||
|
||||
print('Iterations are finished, exiting coroutines...')
|
||||
stats.running = False
|
||||
@@ -148,7 +161,7 @@ def test_restarts_under_load(zenith_cli, pageserver: ZenithPageserver, postgres:
|
||||
pg = postgres.create_start('test_wal_acceptors_restarts_under_load',
|
||||
wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
asyncio.run(run_restarts_under_load(pg, wa_factory.instances))
|
||||
asyncio.run(run_restarts_under_load(pg, wa_factory.instances, iterations=9, n_workers=30, period_time=5))
|
||||
|
||||
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
|
||||
pg.stop()
|
||||
|
||||
@@ -1,103 +0,0 @@
|
||||
import os
|
||||
import subprocess
|
||||
import uuid
|
||||
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, PgBin
|
||||
from fixtures.utils import lsn_to_hex, mkdir_if_needed
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
class ProposerPostgres:
|
||||
"""Object for running safekeepers sync with walproposer"""
|
||||
def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str):
|
||||
self.pgdata_dir: str = pgdata_dir
|
||||
self.pg_bin: PgBin = pg_bin
|
||||
self.timeline_id: str = timeline_id
|
||||
self.tenant_id: str = tenant_id
|
||||
|
||||
def pg_data_dir_path(self) -> str:
|
||||
""" Path to data directory """
|
||||
return self.pgdata_dir
|
||||
|
||||
def config_file_path(self) -> str:
|
||||
""" Path to postgresql.conf """
|
||||
return os.path.join(self.pgdata_dir, 'postgresql.conf')
|
||||
|
||||
def create_dir_config(self, wal_acceptors: str):
|
||||
""" Create dir and config for running --sync-safekeepers """
|
||||
|
||||
mkdir_if_needed(self.pg_data_dir_path())
|
||||
with open(self.config_file_path(), "w") as f:
|
||||
f.write("zenith.zenith_timeline = '{}'\n".format(self.timeline_id))
|
||||
f.write("zenith.zenith_tenant = '{}'\n".format(self.tenant_id))
|
||||
f.write("synchronous_standby_names = '{}'\n".format("walproposer"))
|
||||
f.write("wal_acceptors = '{}'\n".format(wal_acceptors))
|
||||
|
||||
def sync_safekeepers(self) -> subprocess.CompletedProcess:
|
||||
"""
|
||||
Run 'postgres --sync-safekeepers'.
|
||||
Returns execution result, which is commit_lsn after sync.
|
||||
"""
|
||||
|
||||
pg_path = os.path.join(self.pg_bin.pg_bin_path, "postgres")
|
||||
command = [pg_path, "--sync-safekeepers"]
|
||||
env = {
|
||||
"PGDATA": self.pg_data_dir_path(),
|
||||
}
|
||||
|
||||
print('Running command "{}"'.format(" ".join(command)))
|
||||
res = subprocess.run(
|
||||
command, env=env, check=True, text=True, stdout=subprocess.PIPE
|
||||
)
|
||||
|
||||
return res.stdout.strip("\n ")
|
||||
|
||||
|
||||
# insert wal in all safekeepers and run sync on proposer
|
||||
def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorFactory):
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
timeline_id = uuid.uuid4().hex
|
||||
tenant_id = uuid.uuid4().hex
|
||||
|
||||
# write config for proposer
|
||||
pgdata_dir = os.path.join(repo_dir, "proposer_pgdata")
|
||||
pg = ProposerPostgres(pgdata_dir, pg_bin, timeline_id, tenant_id)
|
||||
pg.create_dir_config(wa_factory.get_connstrs())
|
||||
|
||||
# run sync to init safekeepers with ProposerGreeting
|
||||
initial_lsn = pg.sync_safekeepers()
|
||||
|
||||
# should be 0/0 for empty safekeepers
|
||||
assert initial_lsn == "0/0"
|
||||
|
||||
# valid lsn, which is not in the segment start, nor in zero segment
|
||||
epoch_start_lsn = 0x16B9188 # 0/16B9188
|
||||
begin_lsn = epoch_start_lsn
|
||||
|
||||
# append and commit WAL
|
||||
lsn_after_append = []
|
||||
for i in range(3):
|
||||
res = wa_factory.instances[i].append_logical_message(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
{
|
||||
"lm_prefix": "prefix",
|
||||
"lm_message": "message",
|
||||
"set_commit_lsn": True,
|
||||
"term": 2,
|
||||
"begin_lsn": begin_lsn,
|
||||
"epoch_start_lsn": epoch_start_lsn,
|
||||
"truncate_lsn": epoch_start_lsn,
|
||||
},
|
||||
)
|
||||
lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"])
|
||||
lsn_after_append.append(lsn_hex)
|
||||
print(f"safekeeper[{i}] lsn after append: {lsn_hex}")
|
||||
|
||||
# run sync safekeepers
|
||||
lsn_after_sync = pg.sync_safekeepers()
|
||||
print(f"lsn after sync = {lsn_after_sync}")
|
||||
|
||||
assert all(lsn_after_sync == lsn for lsn in lsn_after_append)
|
||||
@@ -136,7 +136,8 @@ class ZenithBenchmarker:
|
||||
# The metric should be an integer, as it's a number of bytes. But in general
|
||||
# all prometheus metrics are floats. So to be pedantic, read it as a float
|
||||
# and round to integer.
|
||||
matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
|
||||
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', all_metrics,
|
||||
re.MULTILINE)
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
@contextmanager
|
||||
|
||||
@@ -21,7 +21,7 @@ def mkdir_if_needed(path: str) -> None:
|
||||
assert os.path.isdir(path)
|
||||
|
||||
|
||||
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
|
||||
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
|
||||
""" Run a process and capture its output
|
||||
|
||||
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
@@ -29,6 +29,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
|
||||
counter.
|
||||
|
||||
If those files already exist, we will overwrite them.
|
||||
Returns basepath for files with captured output.
|
||||
"""
|
||||
assert type(cmd) is list
|
||||
base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
|
||||
@@ -41,6 +42,8 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
|
||||
print('(capturing output to "{}.stdout")'.format(base))
|
||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||
|
||||
return basepath
|
||||
|
||||
|
||||
_global_counter = 0
|
||||
|
||||
|
||||
@@ -71,7 +71,9 @@ def pytest_configure(config):
|
||||
# This is bad; we don't want any of those processes polluting the
|
||||
# result of the test.
|
||||
# NOTE this shows as an internal pytest error, there might be a better way
|
||||
raise Exception('found interfering processes running')
|
||||
raise Exception(
|
||||
'Found interfering processes running. Stop all Zenith pageservers, nodes, WALs, as well as stand-alone Postgres.'
|
||||
)
|
||||
|
||||
|
||||
def determine_scope(fixture_name: str, config: Any) -> str:
|
||||
@@ -486,17 +488,19 @@ class PgBin:
|
||||
def run_capture(self,
|
||||
command: List[str],
|
||||
env: Optional[Env] = None,
|
||||
cwd: Optional[str] = None) -> None:
|
||||
cwd: Optional[str] = None,
|
||||
**kwargs: Any) -> None:
|
||||
"""
|
||||
Run one of the postgres binaries, with stderr and stdout redirected to a file.
|
||||
|
||||
This is just like `run`, but for chatty programs.
|
||||
This is just like `run`, but for chatty programs. Returns basepath for files
|
||||
with captured output.
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True)
|
||||
return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs)
|
||||
|
||||
|
||||
@zenfixture
|
||||
|
||||
58
test_runner/performance/test_bulk_tenant_create.py
Normal file
58
test_runner/performance/test_bulk_tenant_create.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import timeit
|
||||
import pytest
|
||||
|
||||
from fixtures.zenith_fixtures import (
|
||||
TenantFactory,
|
||||
ZenithCli,
|
||||
PostgresFactory,
|
||||
)
|
||||
|
||||
pytest_plugins = ("fixtures.benchmark_fixture")
|
||||
|
||||
# Run bulk tenant creation test.
|
||||
#
|
||||
# Collects metrics:
|
||||
#
|
||||
# 1. Time to create {1,10,50} tenants
|
||||
# 2. Average creation time per tenant
|
||||
|
||||
|
||||
@pytest.mark.parametrize('tenants_count', [1, 5, 10])
|
||||
@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa'])
|
||||
def test_bulk_tenant_create(
|
||||
zenith_cli: ZenithCli,
|
||||
tenant_factory: TenantFactory,
|
||||
postgres: PostgresFactory,
|
||||
wa_factory,
|
||||
use_wal_acceptors: str,
|
||||
tenants_count: int,
|
||||
zenbenchmark,
|
||||
):
|
||||
"""Measure tenant creation time (with and without wal acceptors)"""
|
||||
|
||||
time_slices = []
|
||||
|
||||
for i in range(tenants_count):
|
||||
start = timeit.default_timer()
|
||||
|
||||
tenant = tenant_factory.create()
|
||||
zenith_cli.run([
|
||||
"branch", f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", "main",
|
||||
f"--tenantid={tenant}"
|
||||
])
|
||||
|
||||
if use_wal_acceptors == 'with_wa':
|
||||
wa_factory.start_n_new(3)
|
||||
|
||||
pg_tenant = postgres.create_start(
|
||||
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
|
||||
tenant,
|
||||
wal_acceptors=wa_factory.get_connstrs() if use_wal_acceptors == 'with_wa' else None,
|
||||
)
|
||||
|
||||
end = timeit.default_timer()
|
||||
time_slices.append(end - start)
|
||||
|
||||
pg_tenant.stop()
|
||||
|
||||
zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's')
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: d2af682f6e...d9f7f10320
@@ -76,6 +76,43 @@ safekeepers.
|
||||
See README_PROTO.md for a more detailed desription of the consensus
|
||||
protocol. spec/ contains TLA+ specification of it.
|
||||
|
||||
# Q&A
|
||||
|
||||
Q: Why have a separate service instead of connecting Page Server directly to a
|
||||
primary PostgreSQL node?
|
||||
A: Page Server is a single server which can be lost. As our primary
|
||||
fault-tolerant storage is S3, we do not want to wait for it before
|
||||
committing a transaction. The WAL service acts as a temporary fault-tolerant
|
||||
storage for recent data before it gets to the Page Server and then finally
|
||||
to S3. Whenever WALs and pages are committed to S3, WAL's storage can be
|
||||
trimmed.
|
||||
|
||||
Q: What if the compute node evicts a page, needs it back, but the page is yet
|
||||
to reach the Page Server?
|
||||
A: If the compute node has evicted a page, all changes from that page are
|
||||
already committed, i.e. they are saved on majority of WAL safekeepers. These
|
||||
WAL records will eventually reach the Page Server. The Page Server notes
|
||||
that the compute note requests pages with a very recent LSN and will not
|
||||
respond to the compute node until it a corresponding WAL is received from WAL
|
||||
safekeepers.
|
||||
|
||||
Q: How long may Page Server wait for?
|
||||
A: Not too long, hopefully. If a page is evicted, it probably was not used for
|
||||
a while, so the WAL service have had enough time to push changes to the Page
|
||||
Server. There may be issues if there is no backpressure and compute node with
|
||||
WAL service run ahead of Page Server, though.
|
||||
There is no backpressure right now, so you may even see some spurious
|
||||
timeouts in tests.
|
||||
|
||||
Q: How do WAL safekeepers communicate with each other?
|
||||
A: They may only send each other messages via the compute node, they never
|
||||
communicate directly with each other.
|
||||
|
||||
Q: Why have a consensus algorithm if there is only a single compute node?
|
||||
A: Actually there may be moments with multiple PostgreSQL nodes running at the
|
||||
same time. E.g. we are bringing one up and one down. We would like to avoid
|
||||
simultaneous writes from different nodes, so there should be a consensus on
|
||||
who is the primary node.
|
||||
|
||||
# Terminology
|
||||
|
||||
|
||||
@@ -13,7 +13,9 @@ use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::safekeeper::{AcceptorProposerMessage, AppendResponse};
|
||||
use crate::safekeeper::{AppendRequest, AppendRequestHeader, ProposerAcceptorMessage};
|
||||
use crate::safekeeper::{
|
||||
AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerGreeting,
|
||||
};
|
||||
use crate::safekeeper::{SafeKeeperState, Term};
|
||||
use crate::send_wal::SendWalHandler;
|
||||
use crate::timeline::TimelineTools;
|
||||
@@ -48,6 +50,9 @@ struct AppendResult {
|
||||
inserted_wal: InsertedWAL,
|
||||
}
|
||||
|
||||
/// Handles command to craft logical message WAL record with given
|
||||
/// content, and then append it with specified term and lsn. This
|
||||
/// function is used to test safekeepers in different scenarios.
|
||||
pub fn handle_json_ctrl(
|
||||
swh: &mut SendWalHandler,
|
||||
pgb: &mut PostgresBackend,
|
||||
@@ -62,6 +67,9 @@ pub fn handle_json_ctrl(
|
||||
let append_request: AppendLogicalMessage = serde_json::from_slice(cmd)?;
|
||||
info!("JSON_CTRL request: {:?}", append_request);
|
||||
|
||||
// need to init safekeeper state before AppendRequest
|
||||
prepare_safekeeper(swh)?;
|
||||
|
||||
let inserted_wal = append_logical_message(swh, append_request)?;
|
||||
let response = AppendResult {
|
||||
state: swh.timeline.get().get_info(),
|
||||
@@ -80,6 +88,27 @@ pub fn handle_json_ctrl(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prepare safekeeper to process append requests without crashes,
|
||||
/// by sending ProposerGreeting with default server.wal_seg_size.
|
||||
fn prepare_safekeeper(swh: &mut SendWalHandler) -> Result<()> {
|
||||
let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting {
|
||||
protocol_version: 1, // current protocol
|
||||
pg_version: 0, // unknown
|
||||
proposer_id: [0u8; 16],
|
||||
system_id: 0,
|
||||
ztli: swh.timelineid.unwrap(),
|
||||
tenant_id: swh.tenantid.unwrap(),
|
||||
tli: 0,
|
||||
wal_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, // 16MB, default for tests
|
||||
});
|
||||
|
||||
let response = swh.timeline.get().process_msg(&greeting_request)?;
|
||||
match response {
|
||||
AcceptorProposerMessage::Greeting(_) => Ok(()),
|
||||
_ => anyhow::bail!("not GreetingResponse"),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct InsertedWAL {
|
||||
begin_lsn: Lsn,
|
||||
@@ -122,7 +151,7 @@ fn append_logical_message(
|
||||
|
||||
let append_response = match response {
|
||||
AcceptorProposerMessage::AppendResponse(resp) => resp,
|
||||
_ => return Err(anyhow!("not AppendResponse")),
|
||||
_ => anyhow::bail!("not AppendResponse"),
|
||||
};
|
||||
|
||||
Ok(InsertedWAL {
|
||||
@@ -148,9 +177,9 @@ impl XlLogicalMessage {
|
||||
}
|
||||
}
|
||||
|
||||
// Create new WAL record for non-transactional logical message.
|
||||
// Used for creating artificial WAL for tests, as LogicalMessage
|
||||
// record is basically no-op.
|
||||
/// Create new WAL record for non-transactional logical message.
|
||||
/// Used for creating artificial WAL for tests, as LogicalMessage
|
||||
/// record is basically no-op.
|
||||
fn encode_logical_message(prefix: String, message: String) -> Vec<u8> {
|
||||
let mut prefix_bytes = BytesMut::with_capacity(prefix.len() + 1);
|
||||
prefix_bytes.put(prefix.as_bytes());
|
||||
|
||||
@@ -275,8 +275,8 @@ impl AcceptorProposerMessage {
|
||||
pub trait Storage {
|
||||
/// Persist safekeeper state on disk, optionally syncing it.
|
||||
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()>;
|
||||
/// Write piece of wal in buf to disk.
|
||||
fn write_wal(&mut self, s: &SafeKeeperState, startpos: Lsn, buf: &[u8]) -> Result<()>;
|
||||
/// Write piece of wal in buf to disk and sync it.
|
||||
fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>;
|
||||
}
|
||||
|
||||
/// SafeKeeper which consumes events (messages from compute) and provides
|
||||
@@ -423,7 +423,7 @@ where
|
||||
let mut last_rec_lsn = Lsn(0);
|
||||
if !msg.wal_data.is_empty() {
|
||||
self.storage
|
||||
.write_wal(&self.s, msg.h.begin_lsn, &msg.wal_data)?;
|
||||
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
|
||||
|
||||
// figure out last record's end lsn for reporting (if we got the
|
||||
// whole record)
|
||||
@@ -546,7 +546,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_wal(&mut self, _s: &SafeKeeperState, _startpos: Lsn, _buf: &[u8]) -> Result<()> {
|
||||
fn write_wal(&mut self, _server: &ServerInfo, _startpos: Lsn, _buf: &[u8]) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,9 +50,11 @@ impl postgres_backend::Handler for SendWalHandler {
|
||||
}
|
||||
|
||||
fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: Bytes) -> Result<()> {
|
||||
// START_WAL_PUSH is the only command that initializes the timeline
|
||||
// START_WAL_PUSH is the only command that initializes the timeline in production.
|
||||
// There is also JSON_CTRL command, which should initialize the timeline for testing.
|
||||
if self.timeline.is_none() {
|
||||
if query_string.starts_with(b"START_WAL_PUSH") {
|
||||
if query_string.starts_with(b"START_WAL_PUSH") || query_string.starts_with(b"JSON_CTRL")
|
||||
{
|
||||
self.timeline.set(
|
||||
&self.conf,
|
||||
self.tenantid.unwrap(),
|
||||
|
||||
@@ -18,8 +18,8 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, Storage,
|
||||
SK_FORMAT_VERSION, SK_MAGIC,
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo,
|
||||
Storage, SK_FORMAT_VERSION, SK_MAGIC,
|
||||
};
|
||||
use crate::WalAcceptorConf;
|
||||
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
|
||||
@@ -32,7 +32,7 @@ struct SharedState {
|
||||
sk: SafeKeeper<FileStorage>,
|
||||
/// For receiving-sending wal cooperation
|
||||
/// quorum commit LSN we've notified walsenders about
|
||||
commit_lsn: Lsn,
|
||||
notified_commit_lsn: Lsn,
|
||||
/// combined hot standby feedback from all replicas
|
||||
hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
@@ -72,7 +72,7 @@ impl SharedState {
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
commit_lsn: Lsn(0),
|
||||
notified_commit_lsn: Lsn(0),
|
||||
sk: SafeKeeper::new(Lsn(flush_lsn), tli, storage, state),
|
||||
hs_feedback: HotStandbyFeedback {
|
||||
ts: 0,
|
||||
@@ -186,7 +186,7 @@ impl Timeline {
|
||||
pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
loop {
|
||||
let commit_lsn = shared_state.commit_lsn;
|
||||
let commit_lsn = shared_state.notified_commit_lsn;
|
||||
// This must be `>`, not `>=`.
|
||||
if commit_lsn > lsn {
|
||||
return commit_lsn;
|
||||
@@ -198,8 +198,8 @@ impl Timeline {
|
||||
// Notify caught-up WAL senders about new WAL data received
|
||||
pub fn notify_wal_senders(&self, commit_lsn: Lsn) {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
if shared_state.commit_lsn < commit_lsn {
|
||||
shared_state.commit_lsn = commit_lsn;
|
||||
if shared_state.notified_commit_lsn < commit_lsn {
|
||||
shared_state.notified_commit_lsn = commit_lsn;
|
||||
self.cond.notify_all();
|
||||
}
|
||||
}
|
||||
@@ -337,14 +337,14 @@ impl Storage for FileStorage {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_wal(&mut self, s: &SafeKeeperState, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
let wal_seg_size = s.server.wal_seg_size as usize;
|
||||
let ztli = s.server.ztli;
|
||||
let wal_seg_size = server.wal_seg_size as usize;
|
||||
let ztli = server.ztli;
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size) as usize;
|
||||
@@ -365,7 +365,7 @@ impl Storage for FileStorage {
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
// note: we basically don't support changing pg timeline
|
||||
let wal_file_name = XLogFileName(s.server.tli, segno, wal_seg_size);
|
||||
let wal_file_name = XLogFileName(server.tli, segno, wal_seg_size);
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
|
||||
@@ -7,3 +7,4 @@ edition = "2018"
|
||||
prometheus = {version = "0.12", default_features=false} # removes protobuf dependency
|
||||
libc = "0.2"
|
||||
lazy_static = "1.4"
|
||||
once_cell = "1.8.0"
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
//! Otherwise, we might not see all metrics registered via
|
||||
//! a default registry.
|
||||
use lazy_static::lazy_static;
|
||||
use once_cell::race::OnceBox;
|
||||
pub use prometheus::{exponential_buckets, linear_buckets};
|
||||
pub use prometheus::{register_histogram, Histogram};
|
||||
pub use prometheus::{register_histogram_vec, HistogramVec};
|
||||
@@ -24,9 +25,29 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
|
||||
prometheus::gather()
|
||||
}
|
||||
|
||||
static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new();
|
||||
|
||||
/// Sets a prefix which will be used for all common metrics, typically a service
|
||||
/// name like 'pageserver'. Should be executed exactly once in the beginning of
|
||||
/// any executable which uses common metrics.
|
||||
pub fn set_common_metrics_prefix(prefix: &'static str) {
|
||||
COMMON_METRICS_PREFIX.set(prefix.into()).unwrap();
|
||||
}
|
||||
|
||||
/// Prepends a prefix to a common metric name so they are distinguished between
|
||||
/// different services, see https://github.com/zenithdb/zenith/pull/681
|
||||
/// A call to set_common_metrics_prefix() is necessary prior to calling this.
|
||||
pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
|
||||
format!(
|
||||
"{}_{}",
|
||||
COMMON_METRICS_PREFIX.get().unwrap(),
|
||||
unprefixed_metric_name
|
||||
)
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_disk_io_bytes",
|
||||
new_common_metric_name("disk_io_bytes"),
|
||||
"Bytes written and read from disk, grouped by the operation (read|write)",
|
||||
&["io_operation"]
|
||||
)
|
||||
|
||||
@@ -38,3 +38,4 @@ rustls-split = "0.2.1"
|
||||
hex-literal = "0.3"
|
||||
bytes = "1.0"
|
||||
webpki = "0.21"
|
||||
tempfile = "3.2"
|
||||
|
||||
125
zenith_utils/src/crashsafe_dir.rs
Normal file
125
zenith_utils/src/crashsafe_dir.rs
Normal file
@@ -0,0 +1,125 @@
|
||||
use std::{
|
||||
fs::{self, File},
|
||||
io,
|
||||
path::Path,
|
||||
};
|
||||
|
||||
/// Similar to [`std::fs::create_dir`], except we fsync the
|
||||
/// created directory and its parent.
|
||||
pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
|
||||
let path = path.as_ref();
|
||||
|
||||
fs::create_dir(path)?;
|
||||
File::open(path)?.sync_all()?;
|
||||
|
||||
if let Some(parent) = path.parent() {
|
||||
File::open(parent)?.sync_all()
|
||||
} else {
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"can't find parent",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to [`std::fs::create_dir_all`], except we fsync all
|
||||
/// newly created directories and the pre-existing parent.
|
||||
pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
|
||||
let mut path = path.as_ref();
|
||||
|
||||
let mut dirs_to_create = Vec::new();
|
||||
|
||||
// Figure out which directories we need to create.
|
||||
loop {
|
||||
match path.metadata() {
|
||||
Ok(metadata) if metadata.is_dir() => break,
|
||||
Ok(_) => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::AlreadyExists,
|
||||
format!("non-directory found in path: {:?}", path),
|
||||
));
|
||||
}
|
||||
Err(ref e) if e.kind() == io::ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
|
||||
dirs_to_create.push(path);
|
||||
|
||||
match path.parent() {
|
||||
Some(parent) => path = parent,
|
||||
None => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"can't find parent",
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create directories from parent to child.
|
||||
for &path in dirs_to_create.iter().rev() {
|
||||
fs::create_dir(path)?;
|
||||
}
|
||||
|
||||
// Fsync the created directories from child to parent.
|
||||
for &path in dirs_to_create.iter() {
|
||||
File::open(path)?.sync_all()?;
|
||||
}
|
||||
|
||||
// If we created any new directories, fsync the parent.
|
||||
if !dirs_to_create.is_empty() {
|
||||
File::open(path)?.sync_all()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_create_dir_fsyncd() {
|
||||
let dir = tempdir().unwrap();
|
||||
|
||||
let existing_dir_path = dir.path();
|
||||
let err = create_dir(existing_dir_path).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::AlreadyExists);
|
||||
|
||||
let child_dir = existing_dir_path.join("child");
|
||||
create_dir(child_dir).unwrap();
|
||||
|
||||
let nested_child_dir = existing_dir_path.join("child1").join("child2");
|
||||
let err = create_dir(nested_child_dir).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::NotFound);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_create_dir_all_fsyncd() {
|
||||
let dir = tempdir().unwrap();
|
||||
|
||||
let existing_dir_path = dir.path();
|
||||
create_dir_all(existing_dir_path).unwrap();
|
||||
|
||||
let child_dir = existing_dir_path.join("child");
|
||||
assert!(!child_dir.exists());
|
||||
create_dir_all(&child_dir).unwrap();
|
||||
assert!(child_dir.exists());
|
||||
|
||||
let nested_child_dir = existing_dir_path.join("child1").join("child2");
|
||||
assert!(!nested_child_dir.exists());
|
||||
create_dir_all(&nested_child_dir).unwrap();
|
||||
assert!(nested_child_dir.exists());
|
||||
|
||||
let file_path = existing_dir_path.join("file");
|
||||
std::fs::write(&file_path, b"").unwrap();
|
||||
|
||||
let err = create_dir_all(&file_path).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::AlreadyExists);
|
||||
|
||||
let invalid_dir_path = file_path.join("folder");
|
||||
create_dir_all(&invalid_dir_path).unwrap_err();
|
||||
}
|
||||
}
|
||||
@@ -9,14 +9,14 @@ use routerify::ext::RequestExt;
|
||||
use routerify::RequestInfo;
|
||||
use routerify::{Middleware, Router, RouterBuilder, RouterService};
|
||||
use std::net::TcpListener;
|
||||
use zenith_metrics::{register_int_counter, IntCounter};
|
||||
use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter};
|
||||
use zenith_metrics::{Encoder, TextEncoder};
|
||||
|
||||
use super::error::ApiError;
|
||||
|
||||
lazy_static! {
|
||||
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
|
||||
"pageserver_serve_metrics_count",
|
||||
new_common_metric_name("serve_metrics_count"),
|
||||
"Number of metric requests made"
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
@@ -18,6 +18,9 @@ pub mod pq_proto;
|
||||
// dealing with connstring parsing and handy access to it's parts
|
||||
pub mod connstring;
|
||||
|
||||
// helper functions for creating and fsyncing directories/trees
|
||||
pub mod crashsafe_dir;
|
||||
|
||||
// common authentication routines
|
||||
pub mod auth;
|
||||
|
||||
|
||||
@@ -192,9 +192,7 @@ impl AtomicLsn {
|
||||
/// This operation will panic on overflow.
|
||||
pub fn fetch_add(&self, val: u64) -> Lsn {
|
||||
let prev = self.inner.fetch_add(val, Ordering::AcqRel);
|
||||
if prev.checked_add(val).is_none() {
|
||||
panic!("AtomicLsn overflow");
|
||||
}
|
||||
assert!(prev.checked_add(val).is_some(), "AtomicLsn overflow");
|
||||
Lsn(prev)
|
||||
}
|
||||
|
||||
|
||||
@@ -652,7 +652,7 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
|
||||
BeMessage::EncryptionResponse(should_negotiate) => {
|
||||
let response = if *should_negotiate { b'Y' } else { b'N' };
|
||||
let response = if *should_negotiate { b'S' } else { b'N' };
|
||||
buf.put_u8(response);
|
||||
}
|
||||
|
||||
|
||||
@@ -78,6 +78,10 @@ macro_rules! zid_newtype {
|
||||
pub fn generate() -> Self {
|
||||
$t(ZId::generate())
|
||||
}
|
||||
|
||||
pub const fn from_array(b: [u8; 16]) -> Self {
|
||||
$t(ZId(b))
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for $t {
|
||||
|
||||
@@ -43,7 +43,7 @@ fn ssl() {
|
||||
client_sock.write_u32::<BigEndian>(80877103).unwrap();
|
||||
|
||||
let ssl_response = client_sock.read_u8().unwrap();
|
||||
assert_eq!(b'Y', ssl_response);
|
||||
assert_eq!(b'S', ssl_response);
|
||||
|
||||
let mut cfg = rustls::ClientConfig::new();
|
||||
cfg.root_store.add(&CERT).unwrap();
|
||||
|
||||
Reference in New Issue
Block a user