mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-28 09:40:36 +00:00
Compare commits
42 Commits
ps-thread-
...
layer_comp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c6416c5aa4 | ||
|
|
5217058e8e | ||
|
|
75f71a6380 | ||
|
|
54b75248ff | ||
|
|
0e1bd57c53 | ||
|
|
1d71949c51 | ||
|
|
7d565aa4b9 | ||
|
|
72a7220dc8 | ||
|
|
b0d114ee3f | ||
|
|
38f2d165b7 | ||
|
|
5a5737278e | ||
|
|
06f5e017a1 | ||
|
|
887b0e14d9 | ||
|
|
c584d90bb9 | ||
|
|
7997fc2932 | ||
|
|
24d2313d0b | ||
|
|
9ab52e2186 | ||
|
|
6f1f33ef42 | ||
|
|
703f691df8 | ||
|
|
2b265fd6dc | ||
|
|
d32b491a53 | ||
|
|
18272f53db | ||
|
|
d9bc3fbc8d | ||
|
|
a0eb50552b | ||
|
|
8ea907b66c | ||
|
|
51c64d9a79 | ||
|
|
56d7ccbd3d | ||
|
|
f40d29a035 | ||
|
|
057468e27c | ||
|
|
6a9aab5be1 | ||
|
|
224c2146d4 | ||
|
|
73b6a6e3c3 | ||
|
|
0ed0433e82 | ||
|
|
e90b83646c | ||
|
|
4aac2aded4 | ||
|
|
076b8e3d04 | ||
|
|
39eadf6236 | ||
|
|
4472d49c1e | ||
|
|
dc057ace2f | ||
|
|
0e49d748b8 | ||
|
|
fc7d1ba043 | ||
|
|
e28b3dee37 |
@@ -1,5 +1,6 @@
|
|||||||
[pageservers]
|
[pageservers]
|
||||||
zenith-1-ps-1 console_region_id=1
|
#zenith-1-ps-1 console_region_id=1
|
||||||
|
zenith-1-ps-2 console_region_id=1
|
||||||
|
|
||||||
[safekeepers]
|
[safekeepers]
|
||||||
zenith-1-sk-1 console_region_id=1
|
zenith-1-sk-1 console_region_id=1
|
||||||
@@ -15,4 +16,3 @@ console_mgmt_base_url = http://console-release.local
|
|||||||
bucket_name = zenith-storage-oregon
|
bucket_name = zenith-storage-oregon
|
||||||
bucket_region = us-west-2
|
bucket_region = us-west-2
|
||||||
etcd_endpoints = etcd-release.local:2379
|
etcd_endpoints = etcd-release.local:2379
|
||||||
safekeeper_enable_s3_offload = true
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ zenith-us-stage-ps-2 console_region_id=27
|
|||||||
zenith-us-stage-sk-1 console_region_id=27
|
zenith-us-stage-sk-1 console_region_id=27
|
||||||
zenith-us-stage-sk-4 console_region_id=27
|
zenith-us-stage-sk-4 console_region_id=27
|
||||||
zenith-us-stage-sk-5 console_region_id=27
|
zenith-us-stage-sk-5 console_region_id=27
|
||||||
|
zenith-us-stage-sk-6 console_region_id=27
|
||||||
|
|
||||||
[storage:children]
|
[storage:children]
|
||||||
pageservers
|
pageservers
|
||||||
@@ -16,4 +17,3 @@ console_mgmt_base_url = http://console-staging.local
|
|||||||
bucket_name = zenith-staging-storage-us-east-1
|
bucket_name = zenith-staging-storage-us-east-1
|
||||||
bucket_region = us-east-1
|
bucket_region = us-east-1
|
||||||
etcd_endpoints = etcd-staging.local:2379
|
etcd_endpoints = etcd-staging.local:2379
|
||||||
safekeeper_enable_s3_offload = false
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
|||||||
Type=simple
|
Type=simple
|
||||||
User=safekeeper
|
User=safekeeper
|
||||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
||||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }}
|
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
|
||||||
ExecReload=/bin/kill -HUP $MAINPID
|
ExecReload=/bin/kill -HUP $MAINPID
|
||||||
KillMode=mixed
|
KillMode=mixed
|
||||||
KillSignal=SIGINT
|
KillSignal=SIGINT
|
||||||
|
|||||||
@@ -11,15 +11,6 @@ executors:
|
|||||||
- image: zimg/rust:1.58
|
- image: zimg/rust:1.58
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
check-codestyle-rust:
|
|
||||||
executor: neon-xlarge-executor
|
|
||||||
steps:
|
|
||||||
- checkout
|
|
||||||
- run:
|
|
||||||
name: rustfmt
|
|
||||||
when: always
|
|
||||||
command: cargo fmt --all -- --check
|
|
||||||
|
|
||||||
# A job to build postgres
|
# A job to build postgres
|
||||||
build-postgres:
|
build-postgres:
|
||||||
executor: neon-xlarge-executor
|
executor: neon-xlarge-executor
|
||||||
@@ -685,7 +676,7 @@ jobs:
|
|||||||
name: Setup helm v3
|
name: Setup helm v3
|
||||||
command: |
|
command: |
|
||||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||||
helm repo add zenithdb https://neondatabase.github.io/helm-charts
|
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||||
- run:
|
- run:
|
||||||
name: Re-deploy proxy
|
name: Re-deploy proxy
|
||||||
command: |
|
command: |
|
||||||
@@ -740,7 +731,6 @@ jobs:
|
|||||||
workflows:
|
workflows:
|
||||||
build_and_test:
|
build_and_test:
|
||||||
jobs:
|
jobs:
|
||||||
- check-codestyle-rust
|
|
||||||
- check-codestyle-python
|
- check-codestyle-python
|
||||||
- build-postgres:
|
- build-postgres:
|
||||||
name: build-postgres-<< matrix.build_type >>
|
name: build-postgres-<< matrix.build_type >>
|
||||||
|
|||||||
10
.github/workflows/testing.yml
vendored
10
.github/workflows/testing.yml
vendored
@@ -1,8 +1,10 @@
|
|||||||
name: Build and Test
|
name: Build and Test
|
||||||
|
|
||||||
on:
|
on:
|
||||||
pull_request:
|
|
||||||
push:
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
pull_request:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
regression-check:
|
regression-check:
|
||||||
@@ -23,13 +25,17 @@ jobs:
|
|||||||
submodules: true
|
submodules: true
|
||||||
fetch-depth: 2
|
fetch-depth: 2
|
||||||
|
|
||||||
- name: install rust toolchain ${{ matrix.rust_toolchain }}
|
- name: Install rust toolchain ${{ matrix.rust_toolchain }}
|
||||||
uses: actions-rs/toolchain@v1
|
uses: actions-rs/toolchain@v1
|
||||||
with:
|
with:
|
||||||
profile: minimal
|
profile: minimal
|
||||||
toolchain: ${{ matrix.rust_toolchain }}
|
toolchain: ${{ matrix.rust_toolchain }}
|
||||||
|
components: rustfmt, clippy
|
||||||
override: true
|
override: true
|
||||||
|
|
||||||
|
- name: Check formatting
|
||||||
|
run: cargo fmt --all -- --check
|
||||||
|
|
||||||
- name: Install Ubuntu postgres dependencies
|
- name: Install Ubuntu postgres dependencies
|
||||||
if: matrix.os == 'ubuntu-latest'
|
if: matrix.os == 'ubuntu-latest'
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
53
Cargo.lock
generated
53
Cargo.lock
generated
@@ -292,6 +292,9 @@ name = "cc"
|
|||||||
version = "1.0.72"
|
version = "1.0.72"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
|
checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
|
||||||
|
dependencies = [
|
||||||
|
"jobserver",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cexpr"
|
name = "cexpr"
|
||||||
@@ -1356,6 +1359,15 @@ version = "1.0.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35"
|
checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "jobserver"
|
||||||
|
version = "0.1.24"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "js-sys"
|
name = "js-sys"
|
||||||
version = "0.3.56"
|
version = "0.3.56"
|
||||||
@@ -1722,9 +1734,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "once_cell"
|
name = "once_cell"
|
||||||
version = "1.9.0"
|
version = "1.10.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
|
checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "oorandom"
|
name = "oorandom"
|
||||||
@@ -1831,6 +1843,7 @@ dependencies = [
|
|||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
|
"zstd",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2394,6 +2407,8 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
|
"metrics",
|
||||||
|
"once_cell",
|
||||||
"rusoto_core",
|
"rusoto_core",
|
||||||
"rusoto_s3",
|
"rusoto_s3",
|
||||||
"serde",
|
"serde",
|
||||||
@@ -2401,6 +2416,7 @@ dependencies = [
|
|||||||
"tempfile",
|
"tempfile",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-util 0.7.0",
|
"tokio-util 0.7.0",
|
||||||
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
@@ -2652,6 +2668,7 @@ name = "safekeeper"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"async-trait",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"clap 3.0.14",
|
"clap 3.0.14",
|
||||||
@@ -2660,12 +2677,14 @@ dependencies = [
|
|||||||
"daemonize",
|
"daemonize",
|
||||||
"etcd_broker",
|
"etcd_broker",
|
||||||
"fs2",
|
"fs2",
|
||||||
|
"futures",
|
||||||
"git-version",
|
"git-version",
|
||||||
"hex",
|
"hex",
|
||||||
"humantime",
|
"humantime",
|
||||||
"hyper",
|
"hyper",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"metrics",
|
"metrics",
|
||||||
|
"once_cell",
|
||||||
"postgres",
|
"postgres",
|
||||||
"postgres-protocol",
|
"postgres-protocol",
|
||||||
"postgres_ffi",
|
"postgres_ffi",
|
||||||
@@ -2679,6 +2698,7 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
"tokio-postgres",
|
"tokio-postgres",
|
||||||
"tokio-util 0.7.0",
|
"tokio-util 0.7.0",
|
||||||
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
@@ -3931,3 +3951,32 @@ name = "zeroize"
|
|||||||
version = "1.5.2"
|
version = "1.5.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7c88870063c39ee00ec285a2f8d6a966e5b6fb2becc4e8dac77ed0d370ed6006"
|
checksum = "7c88870063c39ee00ec285a2f8d6a966e5b6fb2becc4e8dac77ed0d370ed6006"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd"
|
||||||
|
version = "0.11.1+zstd.1.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "77a16b8414fde0414e90c612eba70985577451c4c504b99885ebed24762cb81a"
|
||||||
|
dependencies = [
|
||||||
|
"zstd-safe",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd-safe"
|
||||||
|
version = "5.0.1+zstd.1.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7c12659121420dd6365c5c3de4901f97145b79651fb1d25814020ed2ed0585ae"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"zstd-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd-sys"
|
||||||
|
version = "2.0.1+zstd.1.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|||||||
@@ -146,8 +146,14 @@ impl ComputeNode {
|
|||||||
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
|
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
|
||||||
};
|
};
|
||||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||||
let mut ar = tar::Archive::new(copyreader);
|
|
||||||
|
|
||||||
|
// Read the archive directly from the `CopyOutReader`
|
||||||
|
//
|
||||||
|
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||||
|
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||||
|
// sends an Error after finishing the tarball, we will not notice it.
|
||||||
|
let mut ar = tar::Archive::new(copyreader);
|
||||||
|
ar.set_ignore_zeros(true);
|
||||||
ar.unpack(&self.pgdata)?;
|
ar.unpack(&self.pgdata)?;
|
||||||
|
|
||||||
self.metrics.basebackup_ms.store(
|
self.metrics.basebackup_ms.store(
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ version = "0.1.0"
|
|||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
tar = "0.4.33"
|
tar = "0.4.38"
|
||||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_with = "1.12.0"
|
serde_with = "1.12.0"
|
||||||
|
|||||||
@@ -231,8 +231,13 @@ impl PostgresNode {
|
|||||||
.context("page server 'basebackup' command failed")?;
|
.context("page server 'basebackup' command failed")?;
|
||||||
|
|
||||||
// Read the archive directly from the `CopyOutReader`
|
// Read the archive directly from the `CopyOutReader`
|
||||||
tar::Archive::new(copyreader)
|
//
|
||||||
.unpack(&self.pgdata())
|
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||||
|
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||||
|
// sends an Error after finishing the tarball, we will not notice it.
|
||||||
|
let mut ar = tar::Archive::new(copyreader);
|
||||||
|
ar.set_ignore_zeros(true);
|
||||||
|
ar.unpack(&self.pgdata())
|
||||||
.context("extracting base backup failed")?;
|
.context("extracting base backup failed")?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -274,6 +279,8 @@ impl PostgresNode {
|
|||||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||||
conf.append("port", &self.address.port().to_string());
|
conf.append("port", &self.address.port().to_string());
|
||||||
conf.append("wal_keep_size", "0");
|
conf.append("wal_keep_size", "0");
|
||||||
|
// walproposer panics when basebackup is invalid, it is pointless to restart in this case.
|
||||||
|
conf.append("restart_after_crash", "off");
|
||||||
|
|
||||||
// Configure the node to fetch pages from pageserver
|
// Configure the node to fetch pages from pageserver
|
||||||
let pageserver_connstr = {
|
let pageserver_connstr = {
|
||||||
|
|||||||
@@ -48,6 +48,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
|||||||
format!("--data-dir={}", etcd_data_dir.display()),
|
format!("--data-dir={}", etcd_data_dir.display()),
|
||||||
format!("--listen-client-urls={client_urls}"),
|
format!("--listen-client-urls={client_urls}"),
|
||||||
format!("--advertise-client-urls={client_urls}"),
|
format!("--advertise-client-urls={client_urls}"),
|
||||||
|
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||||
|
// size smaller. Our test etcd clusters are very small.
|
||||||
|
// See https://github.com/etcd-io/etcd/issues/7910
|
||||||
|
"--quota-backend-bytes=100000000".to_string(),
|
||||||
])
|
])
|
||||||
.stdout(Stdio::from(etcd_stdout_file))
|
.stdout(Stdio::from(etcd_stdout_file))
|
||||||
.stderr(Stdio::from(etcd_stderr_file))
|
.stderr(Stdio::from(etcd_stderr_file))
|
||||||
|
|||||||
@@ -49,3 +49,12 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
|||||||
cmd
|
cmd
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||||
|
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
|
||||||
|
if let Ok(value) = std::env::var(env_key) {
|
||||||
|
cmd = cmd.env(env_key, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cmd
|
||||||
|
}
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ use std::process::{Command, Stdio};
|
|||||||
use utils::{
|
use utils::{
|
||||||
auth::{encode_from_key_file, Claims, Scope},
|
auth::{encode_from_key_file, Claims, Scope},
|
||||||
postgres_backend::AuthType,
|
postgres_backend::AuthType,
|
||||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::safekeeper::SafekeeperNode;
|
use crate::safekeeper::SafekeeperNode;
|
||||||
@@ -136,7 +136,7 @@ impl EtcdBroker {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub struct PageServerConf {
|
pub struct PageServerConf {
|
||||||
// node id
|
// node id
|
||||||
pub id: ZNodeId,
|
pub id: NodeId,
|
||||||
// Pageserver connection settings
|
// Pageserver connection settings
|
||||||
pub listen_pg_addr: String,
|
pub listen_pg_addr: String,
|
||||||
pub listen_http_addr: String,
|
pub listen_http_addr: String,
|
||||||
@@ -151,7 +151,7 @@ pub struct PageServerConf {
|
|||||||
impl Default for PageServerConf {
|
impl Default for PageServerConf {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
id: ZNodeId(0),
|
id: NodeId(0),
|
||||||
listen_pg_addr: String::new(),
|
listen_pg_addr: String::new(),
|
||||||
listen_http_addr: String::new(),
|
listen_http_addr: String::new(),
|
||||||
auth_type: AuthType::Trust,
|
auth_type: AuthType::Trust,
|
||||||
@@ -163,19 +163,23 @@ impl Default for PageServerConf {
|
|||||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub struct SafekeeperConf {
|
pub struct SafekeeperConf {
|
||||||
pub id: ZNodeId,
|
pub id: NodeId,
|
||||||
pub pg_port: u16,
|
pub pg_port: u16,
|
||||||
pub http_port: u16,
|
pub http_port: u16,
|
||||||
pub sync: bool,
|
pub sync: bool,
|
||||||
|
pub remote_storage: Option<String>,
|
||||||
|
pub backup_threads: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SafekeeperConf {
|
impl Default for SafekeeperConf {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
id: ZNodeId(0),
|
id: NodeId(0),
|
||||||
pg_port: 0,
|
pg_port: 0,
|
||||||
http_port: 0,
|
http_port: 0,
|
||||||
sync: true,
|
sync: true,
|
||||||
|
remote_storage: None,
|
||||||
|
backup_threads: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -377,6 +381,7 @@ impl LocalEnv {
|
|||||||
base_path != Path::new(""),
|
base_path != Path::new(""),
|
||||||
"repository base path is missing"
|
"repository base path is missing"
|
||||||
);
|
);
|
||||||
|
|
||||||
ensure!(
|
ensure!(
|
||||||
!base_path.exists(),
|
!base_path.exists(),
|
||||||
"directory '{}' already exists. Perhaps already initialized?",
|
"directory '{}' already exists. Perhaps already initialized?",
|
||||||
|
|||||||
@@ -18,12 +18,12 @@ use thiserror::Error;
|
|||||||
use utils::{
|
use utils::{
|
||||||
connstring::connection_address,
|
connstring::connection_address,
|
||||||
http::error::HttpErrorBody,
|
http::error::HttpErrorBody,
|
||||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||||
use crate::storage::PageServerNode;
|
use crate::storage::PageServerNode;
|
||||||
use crate::{fill_rust_env_vars, read_pidfile};
|
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
|
||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub enum SafekeeperHttpError {
|
pub enum SafekeeperHttpError {
|
||||||
@@ -65,7 +65,7 @@ impl ResponseErrorMessageExt for Response {
|
|||||||
//
|
//
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct SafekeeperNode {
|
pub struct SafekeeperNode {
|
||||||
pub id: ZNodeId,
|
pub id: NodeId,
|
||||||
|
|
||||||
pub conf: SafekeeperConf,
|
pub conf: SafekeeperConf,
|
||||||
|
|
||||||
@@ -100,7 +100,7 @@ impl SafekeeperNode {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf {
|
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||||
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
|
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -143,6 +143,14 @@ impl SafekeeperNode {
|
|||||||
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
||||||
cmd.args(&["--broker-etcd-prefix", prefix]);
|
cmd.args(&["--broker-etcd-prefix", prefix]);
|
||||||
}
|
}
|
||||||
|
if let Some(threads) = self.conf.backup_threads {
|
||||||
|
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
|
||||||
|
}
|
||||||
|
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||||
|
cmd.args(&["--remote-storage", remote_storage]);
|
||||||
|
}
|
||||||
|
|
||||||
|
fill_aws_secrets_vars(&mut cmd);
|
||||||
|
|
||||||
if !cmd.status()?.success() {
|
if !cmd.status()?.success() {
|
||||||
bail!(
|
bail!(
|
||||||
@@ -286,7 +294,7 @@ impl SafekeeperNode {
|
|||||||
&self,
|
&self,
|
||||||
tenant_id: ZTenantId,
|
tenant_id: ZTenantId,
|
||||||
timeline_id: ZTimelineId,
|
timeline_id: ZTimelineId,
|
||||||
peer_ids: Vec<ZNodeId>,
|
peer_ids: Vec<NodeId>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
Ok(self
|
Ok(self
|
||||||
.http_request(
|
.http_request(
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ use utils::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use crate::local_env::LocalEnv;
|
use crate::local_env::LocalEnv;
|
||||||
use crate::{fill_rust_env_vars, read_pidfile};
|
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
|
||||||
use pageserver::tenant_mgr::TenantInfo;
|
use pageserver::tenant_mgr::TenantInfo;
|
||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
@@ -493,12 +493,3 @@ impl PageServerNode {
|
|||||||
Ok(timeline_info_response)
|
Ok(timeline_info_response)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
|
||||||
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
|
|
||||||
if let Ok(value) = std::env::var(env_key) {
|
|
||||||
cmd = cmd.env(env_key, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cmd
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ use tokio::{sync::mpsc, task::JoinHandle};
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::{
|
use utils::{
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
|
zid::{NodeId, ZTenantId, ZTenantTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Default value to use for prefixing to all etcd keys with.
|
/// Default value to use for prefixing to all etcd keys with.
|
||||||
@@ -25,7 +25,7 @@ pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
|||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize)]
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
struct SafekeeperTimeline {
|
struct SafekeeperTimeline {
|
||||||
safekeeper_id: ZNodeId,
|
safekeeper_id: NodeId,
|
||||||
info: SkTimelineInfo,
|
info: SkTimelineInfo,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -43,10 +43,10 @@ pub struct SkTimelineInfo {
|
|||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub commit_lsn: Option<Lsn>,
|
pub commit_lsn: Option<Lsn>,
|
||||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
/// LSN up to which safekeeper has backed WAL.
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub s3_wal_lsn: Option<Lsn>,
|
pub backup_lsn: Option<Lsn>,
|
||||||
/// LSN of last checkpoint uploaded by pageserver.
|
/// LSN of last checkpoint uploaded by pageserver.
|
||||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
@@ -71,7 +71,7 @@ pub enum BrokerError {
|
|||||||
/// A way to control the data retrieval from a certain subscription.
|
/// A way to control the data retrieval from a certain subscription.
|
||||||
pub struct SkTimelineSubscription {
|
pub struct SkTimelineSubscription {
|
||||||
safekeeper_timeline_updates:
|
safekeeper_timeline_updates:
|
||||||
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>>,
|
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>>,
|
||||||
kind: SkTimelineSubscriptionKind,
|
kind: SkTimelineSubscriptionKind,
|
||||||
watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
||||||
watcher: Watcher,
|
watcher: Watcher,
|
||||||
@@ -81,7 +81,7 @@ impl SkTimelineSubscription {
|
|||||||
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
|
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
|
||||||
pub async fn fetch_data(
|
pub async fn fetch_data(
|
||||||
&mut self,
|
&mut self,
|
||||||
) -> Option<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>> {
|
) -> Option<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>> {
|
||||||
self.safekeeper_timeline_updates.recv().await
|
self.safekeeper_timeline_updates.recv().await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -221,7 +221,7 @@ pub async fn subscribe_to_safekeeper_timeline_updates(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>> = HashMap::new();
|
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>> = HashMap::new();
|
||||||
// Keep track that the timeline data updates from etcd arrive in the right order.
|
// Keep track that the timeline data updates from etcd arrive in the right order.
|
||||||
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
|
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
|
||||||
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
|
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
|
||||||
@@ -299,18 +299,18 @@ fn parse_etcd_key_value(
|
|||||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||||
parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
|
parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
|
||||||
),
|
),
|
||||||
ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
|
NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
|
||||||
),
|
),
|
||||||
SubscriptionKind::Tenant(tenant_id) => (
|
SubscriptionKind::Tenant(tenant_id) => (
|
||||||
ZTenantTimelineId::new(
|
ZTenantTimelineId::new(
|
||||||
tenant_id,
|
tenant_id,
|
||||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||||
),
|
),
|
||||||
ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
|
NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
|
||||||
),
|
),
|
||||||
SubscriptionKind::Timeline(zttid) => (
|
SubscriptionKind::Timeline(zttid) => (
|
||||||
zttid,
|
zttid,
|
||||||
ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
|
NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
|
||||||
),
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -5,14 +5,17 @@ edition = "2021"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
async-trait = "0.1"
|
||||||
tokio-util = { version = "0.7", features = ["io"] }
|
metrics = { version = "0.1", path = "../metrics" }
|
||||||
tracing = "0.1.27"
|
once_cell = "1.8.0"
|
||||||
rusoto_core = "0.48"
|
rusoto_core = "0.48"
|
||||||
rusoto_s3 = "0.48"
|
rusoto_s3 = "0.48"
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
async-trait = "0.1"
|
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||||
|
tokio-util = { version = "0.7", features = ["io"] }
|
||||||
|
toml_edit = { version = "0.13", features = ["easy"] }
|
||||||
|
tracing = "0.1.27"
|
||||||
|
|
||||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
|
|
||||||
|
|||||||
@@ -16,8 +16,10 @@ use std::{
|
|||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::{bail, Context};
|
||||||
|
|
||||||
use tokio::io;
|
use tokio::io;
|
||||||
|
use toml_edit::Item;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
pub use self::{
|
pub use self::{
|
||||||
@@ -203,6 +205,90 @@ pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str)
|
|||||||
.with_extension(new_extension.as_ref())
|
.with_extension(new_extension.as_ref())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl RemoteStorageConfig {
|
||||||
|
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
|
||||||
|
let local_path = toml.get("local_path");
|
||||||
|
let bucket_name = toml.get("bucket_name");
|
||||||
|
let bucket_region = toml.get("bucket_region");
|
||||||
|
|
||||||
|
let max_concurrent_syncs = NonZeroUsize::new(
|
||||||
|
parse_optional_integer("max_concurrent_syncs", toml)?
|
||||||
|
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
|
||||||
|
)
|
||||||
|
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
|
||||||
|
|
||||||
|
let max_sync_errors = NonZeroU32::new(
|
||||||
|
parse_optional_integer("max_sync_errors", toml)?
|
||||||
|
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
||||||
|
)
|
||||||
|
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
||||||
|
|
||||||
|
let concurrency_limit = NonZeroUsize::new(
|
||||||
|
parse_optional_integer("concurrency_limit", toml)?
|
||||||
|
.unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
|
||||||
|
)
|
||||||
|
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||||
|
|
||||||
|
let storage = match (local_path, bucket_name, bucket_region) {
|
||||||
|
(None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
|
||||||
|
(_, Some(_), None) => {
|
||||||
|
bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
|
||||||
|
}
|
||||||
|
(_, None, Some(_)) => {
|
||||||
|
bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
|
||||||
|
}
|
||||||
|
(None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
|
||||||
|
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
|
||||||
|
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
|
||||||
|
prefix_in_bucket: toml
|
||||||
|
.get("prefix_in_bucket")
|
||||||
|
.map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
|
||||||
|
.transpose()?,
|
||||||
|
endpoint: toml
|
||||||
|
.get("endpoint")
|
||||||
|
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||||
|
.transpose()?,
|
||||||
|
concurrency_limit,
|
||||||
|
}),
|
||||||
|
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
|
||||||
|
parse_toml_string("local_path", local_path)?,
|
||||||
|
)),
|
||||||
|
(Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(RemoteStorageConfig {
|
||||||
|
max_concurrent_syncs,
|
||||||
|
max_sync_errors,
|
||||||
|
storage,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper functions to parse a toml Item
|
||||||
|
fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
|
||||||
|
where
|
||||||
|
I: TryFrom<i64, Error = E>,
|
||||||
|
E: std::error::Error + Send + Sync + 'static,
|
||||||
|
{
|
||||||
|
let toml_integer = match item.get(name) {
|
||||||
|
Some(item) => item
|
||||||
|
.as_integer()
|
||||||
|
.with_context(|| format!("configure option {name} is not an integer"))?,
|
||||||
|
None => return Ok(None),
|
||||||
|
};
|
||||||
|
|
||||||
|
I::try_from(toml_integer)
|
||||||
|
.map(Some)
|
||||||
|
.with_context(|| format!("configure option {name} is too large"))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
|
||||||
|
let s = item
|
||||||
|
.as_str()
|
||||||
|
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||||
|
Ok(s.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|||||||
@@ -23,6 +23,71 @@ use crate::{strip_path_prefix, RemoteStorage, S3Config};
|
|||||||
|
|
||||||
use super::StorageMetadata;
|
use super::StorageMetadata;
|
||||||
|
|
||||||
|
pub(super) mod metrics {
|
||||||
|
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"remote_storage_s3_requests_count",
|
||||||
|
"Number of s3 requests of particular type",
|
||||||
|
&["request_type"],
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric")
|
||||||
|
});
|
||||||
|
|
||||||
|
static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"remote_storage_s3_failures_count",
|
||||||
|
"Number of failed s3 requests of particular type",
|
||||||
|
&["request_type"],
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric")
|
||||||
|
});
|
||||||
|
|
||||||
|
pub fn inc_get_object() {
|
||||||
|
S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_get_object_fail() {
|
||||||
|
S3_REQUESTS_FAIL_COUNT
|
||||||
|
.with_label_values(&["get_object"])
|
||||||
|
.inc();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_put_object() {
|
||||||
|
S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_put_object_fail() {
|
||||||
|
S3_REQUESTS_FAIL_COUNT
|
||||||
|
.with_label_values(&["put_object"])
|
||||||
|
.inc();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_delete_object() {
|
||||||
|
S3_REQUESTS_COUNT
|
||||||
|
.with_label_values(&["delete_object"])
|
||||||
|
.inc();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_delete_object_fail() {
|
||||||
|
S3_REQUESTS_FAIL_COUNT
|
||||||
|
.with_label_values(&["delete_object"])
|
||||||
|
.inc();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_list_objects() {
|
||||||
|
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc_list_objects_fail() {
|
||||||
|
S3_REQUESTS_FAIL_COUNT
|
||||||
|
.with_label_values(&["list_objects"])
|
||||||
|
.inc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const S3_PREFIX_SEPARATOR: char = '/';
|
const S3_PREFIX_SEPARATOR: char = '/';
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||||
@@ -152,6 +217,9 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.acquire()
|
.acquire()
|
||||||
.await
|
.await
|
||||||
.context("Concurrency limiter semaphore got closed during S3 list")?;
|
.context("Concurrency limiter semaphore got closed during S3 list")?;
|
||||||
|
|
||||||
|
metrics::inc_list_objects();
|
||||||
|
|
||||||
let fetch_response = self
|
let fetch_response = self
|
||||||
.client
|
.client
|
||||||
.list_objects_v2(ListObjectsV2Request {
|
.list_objects_v2(ListObjectsV2Request {
|
||||||
@@ -160,7 +228,11 @@ impl RemoteStorage for S3Bucket {
|
|||||||
continuation_token,
|
continuation_token,
|
||||||
..ListObjectsV2Request::default()
|
..ListObjectsV2Request::default()
|
||||||
})
|
})
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
metrics::inc_list_objects_fail();
|
||||||
|
e
|
||||||
|
})?;
|
||||||
document_keys.extend(
|
document_keys.extend(
|
||||||
fetch_response
|
fetch_response
|
||||||
.contents
|
.contents
|
||||||
@@ -190,6 +262,8 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.acquire()
|
.acquire()
|
||||||
.await
|
.await
|
||||||
.context("Concurrency limiter semaphore got closed during S3 upload")?;
|
.context("Concurrency limiter semaphore got closed during S3 upload")?;
|
||||||
|
|
||||||
|
metrics::inc_put_object();
|
||||||
self.client
|
self.client
|
||||||
.put_object(PutObjectRequest {
|
.put_object(PutObjectRequest {
|
||||||
body: Some(StreamingBody::new_with_size(
|
body: Some(StreamingBody::new_with_size(
|
||||||
@@ -201,7 +275,11 @@ impl RemoteStorage for S3Bucket {
|
|||||||
metadata: metadata.map(|m| m.0),
|
metadata: metadata.map(|m| m.0),
|
||||||
..PutObjectRequest::default()
|
..PutObjectRequest::default()
|
||||||
})
|
})
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
metrics::inc_put_object_fail();
|
||||||
|
e
|
||||||
|
})?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,6 +293,9 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.acquire()
|
.acquire()
|
||||||
.await
|
.await
|
||||||
.context("Concurrency limiter semaphore got closed during S3 download")?;
|
.context("Concurrency limiter semaphore got closed during S3 download")?;
|
||||||
|
|
||||||
|
metrics::inc_get_object();
|
||||||
|
|
||||||
let object_output = self
|
let object_output = self
|
||||||
.client
|
.client
|
||||||
.get_object(GetObjectRequest {
|
.get_object(GetObjectRequest {
|
||||||
@@ -222,7 +303,11 @@ impl RemoteStorage for S3Bucket {
|
|||||||
key: from.key().to_owned(),
|
key: from.key().to_owned(),
|
||||||
..GetObjectRequest::default()
|
..GetObjectRequest::default()
|
||||||
})
|
})
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
metrics::inc_get_object_fail();
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
|
||||||
if let Some(body) = object_output.body {
|
if let Some(body) = object_output.body {
|
||||||
let mut from = io::BufReader::new(body.into_async_read());
|
let mut from = io::BufReader::new(body.into_async_read());
|
||||||
@@ -251,6 +336,9 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.acquire()
|
.acquire()
|
||||||
.await
|
.await
|
||||||
.context("Concurrency limiter semaphore got closed during S3 range download")?;
|
.context("Concurrency limiter semaphore got closed during S3 range download")?;
|
||||||
|
|
||||||
|
metrics::inc_get_object();
|
||||||
|
|
||||||
let object_output = self
|
let object_output = self
|
||||||
.client
|
.client
|
||||||
.get_object(GetObjectRequest {
|
.get_object(GetObjectRequest {
|
||||||
@@ -259,7 +347,11 @@ impl RemoteStorage for S3Bucket {
|
|||||||
range,
|
range,
|
||||||
..GetObjectRequest::default()
|
..GetObjectRequest::default()
|
||||||
})
|
})
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
metrics::inc_get_object_fail();
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
|
||||||
if let Some(body) = object_output.body {
|
if let Some(body) = object_output.body {
|
||||||
let mut from = io::BufReader::new(body.into_async_read());
|
let mut from = io::BufReader::new(body.into_async_read());
|
||||||
@@ -275,13 +367,20 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.acquire()
|
.acquire()
|
||||||
.await
|
.await
|
||||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||||
|
|
||||||
|
metrics::inc_delete_object();
|
||||||
|
|
||||||
self.client
|
self.client
|
||||||
.delete_object(DeleteObjectRequest {
|
.delete_object(DeleteObjectRequest {
|
||||||
bucket: self.bucket_name.clone(),
|
bucket: self.bucket_name.clone(),
|
||||||
key: path.key().to_owned(),
|
key: path.key().to_owned(),
|
||||||
..DeleteObjectRequest::default()
|
..DeleteObjectRequest::default()
|
||||||
})
|
})
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
metrics::inc_delete_object_fail();
|
||||||
|
e
|
||||||
|
})?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,9 @@ impl Lsn {
|
|||||||
/// Maximum possible value for an LSN
|
/// Maximum possible value for an LSN
|
||||||
pub const MAX: Lsn = Lsn(u64::MAX);
|
pub const MAX: Lsn = Lsn(u64::MAX);
|
||||||
|
|
||||||
|
/// Invalid value for InvalidXLogRecPtr, as defined in xlogdefs.h
|
||||||
|
pub const INVALID: Lsn = Lsn(0);
|
||||||
|
|
||||||
/// Subtract a number, returning None on overflow.
|
/// Subtract a number, returning None on overflow.
|
||||||
pub fn checked_sub<T: Into<u64>>(self, other: T) -> Option<Lsn> {
|
pub fn checked_sub<T: Into<u64>>(self, other: T) -> Option<Lsn> {
|
||||||
let other: u64 = other.into();
|
let other: u64 = other.into();
|
||||||
@@ -103,6 +106,12 @@ impl Lsn {
|
|||||||
pub fn is_aligned(&self) -> bool {
|
pub fn is_aligned(&self) -> bool {
|
||||||
*self == self.align()
|
*self == self.align()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return if the LSN is valid
|
||||||
|
/// mimics postgres XLogRecPtrIsInvalid macro
|
||||||
|
pub fn is_valid(self) -> bool {
|
||||||
|
self != Lsn::INVALID
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<u64> for Lsn {
|
impl From<u64> for Lsn {
|
||||||
|
|||||||
@@ -218,7 +218,7 @@ impl ZTenantTimelineId {
|
|||||||
|
|
||||||
impl fmt::Display for ZTenantTimelineId {
|
impl fmt::Display for ZTenantTimelineId {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
write!(f, "{}-{}", self.tenant_id, self.timeline_id)
|
write!(f, "{}/{}", self.tenant_id, self.timeline_id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -226,9 +226,9 @@ impl fmt::Display for ZTenantTimelineId {
|
|||||||
// by the console.
|
// by the console.
|
||||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
|
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
|
||||||
#[serde(transparent)]
|
#[serde(transparent)]
|
||||||
pub struct ZNodeId(pub u64);
|
pub struct NodeId(pub u64);
|
||||||
|
|
||||||
impl fmt::Display for ZNodeId {
|
impl fmt::Display for NodeId {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
write!(f, "{}", self.0)
|
write!(f, "{}", self.0)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,14 +22,14 @@ use utils::{
|
|||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
postgres_backend::AuthType,
|
postgres_backend::AuthType,
|
||||||
project_git_version,
|
project_git_version,
|
||||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
use pageserver::timelines::TimelineInfo;
|
use pageserver::timelines::TimelineInfo;
|
||||||
|
|
||||||
// Default id of a safekeeper node, if not specified on the command line.
|
// Default id of a safekeeper node, if not specified on the command line.
|
||||||
const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1);
|
const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1);
|
||||||
const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1);
|
const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
||||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||||
project_git_version!(GIT_VERSION);
|
project_git_version!(GIT_VERSION);
|
||||||
|
|
||||||
@@ -860,7 +860,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result<SafekeeperNode> {
|
fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNode> {
|
||||||
if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
|
if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
|
||||||
Ok(SafekeeperNode::from_env(env, node))
|
Ok(SafekeeperNode::from_env(env, node))
|
||||||
} else {
|
} else {
|
||||||
@@ -876,7 +876,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
|||||||
|
|
||||||
// All the commands take an optional safekeeper name argument
|
// All the commands take an optional safekeeper name argument
|
||||||
let sk_id = if let Some(id_str) = sub_args.value_of("id") {
|
let sk_id = if let Some(id_str) = sub_args.value_of("id") {
|
||||||
ZNodeId(id_str.parse().context("while parsing safekeeper id")?)
|
NodeId(id_str.parse().context("while parsing safekeeper id")?)
|
||||||
} else {
|
} else {
|
||||||
DEFAULT_SAFEKEEPER_ID
|
DEFAULT_SAFEKEEPER_ID
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -54,6 +54,9 @@ crossbeam-utils = "0.8.5"
|
|||||||
fail = "0.5.0"
|
fail = "0.5.0"
|
||||||
git-version = "0.3.5"
|
git-version = "0.3.5"
|
||||||
|
|
||||||
|
# 'experimental' is needed for the `zstd::bulk::Decompressor::upper_bound` function.
|
||||||
|
zstd = { version = "0.11.1", features = ["experimental"] }
|
||||||
|
|
||||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||||
etcd_broker = { path = "../libs/etcd_broker" }
|
etcd_broker = { path = "../libs/etcd_broker" }
|
||||||
metrics = { path = "../libs/metrics" }
|
metrics = { path = "../libs/metrics" }
|
||||||
|
|||||||
@@ -10,8 +10,9 @@
|
|||||||
//! This module is responsible for creation of such tarball
|
//! This module is responsible for creation of such tarball
|
||||||
//! from data stored in object storage.
|
//! from data stored in object storage.
|
||||||
//!
|
//!
|
||||||
use anyhow::{anyhow, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use bytes::{BufMut, BytesMut};
|
use bytes::{BufMut, BytesMut};
|
||||||
|
use fail::fail_point;
|
||||||
use std::fmt::Write as FmtWrite;
|
use std::fmt::Write as FmtWrite;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
@@ -30,11 +31,16 @@ use utils::lsn::Lsn;
|
|||||||
/// This is short-living object only for the time of tarball creation,
|
/// This is short-living object only for the time of tarball creation,
|
||||||
/// created mostly to avoid passing a lot of parameters between various functions
|
/// created mostly to avoid passing a lot of parameters between various functions
|
||||||
/// used for constructing tarball.
|
/// used for constructing tarball.
|
||||||
pub struct Basebackup<'a> {
|
pub struct Basebackup<'a, W>
|
||||||
ar: Builder<&'a mut dyn Write>,
|
where
|
||||||
|
W: Write,
|
||||||
|
{
|
||||||
|
ar: Builder<AbortableWrite<W>>,
|
||||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||||
pub lsn: Lsn,
|
pub lsn: Lsn,
|
||||||
prev_record_lsn: Lsn,
|
prev_record_lsn: Lsn,
|
||||||
|
|
||||||
|
finished: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create basebackup with non-rel data in it. Omit relational data.
|
// Create basebackup with non-rel data in it. Omit relational data.
|
||||||
@@ -44,12 +50,15 @@ pub struct Basebackup<'a> {
|
|||||||
// * When working without safekeepers. In this situation it is important to match the lsn
|
// * When working without safekeepers. In this situation it is important to match the lsn
|
||||||
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
||||||
// to start the replication.
|
// to start the replication.
|
||||||
impl<'a> Basebackup<'a> {
|
impl<'a, W> Basebackup<'a, W>
|
||||||
|
where
|
||||||
|
W: Write,
|
||||||
|
{
|
||||||
pub fn new(
|
pub fn new(
|
||||||
write: &'a mut dyn Write,
|
write: W,
|
||||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||||
req_lsn: Option<Lsn>,
|
req_lsn: Option<Lsn>,
|
||||||
) -> Result<Basebackup<'a>> {
|
) -> Result<Basebackup<'a, W>> {
|
||||||
// Compute postgres doesn't have any previous WAL files, but the first
|
// Compute postgres doesn't have any previous WAL files, but the first
|
||||||
// record that it's going to write needs to include the LSN of the
|
// record that it's going to write needs to include the LSN of the
|
||||||
// previous record (xl_prev). We include prev_record_lsn in the
|
// previous record (xl_prev). We include prev_record_lsn in the
|
||||||
@@ -90,14 +99,15 @@ impl<'a> Basebackup<'a> {
|
|||||||
);
|
);
|
||||||
|
|
||||||
Ok(Basebackup {
|
Ok(Basebackup {
|
||||||
ar: Builder::new(write),
|
ar: Builder::new(AbortableWrite::new(write)),
|
||||||
timeline,
|
timeline,
|
||||||
lsn: backup_lsn,
|
lsn: backup_lsn,
|
||||||
prev_record_lsn: backup_prev,
|
prev_record_lsn: backup_prev,
|
||||||
|
finished: false,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
|
pub fn send_tarball(mut self) -> anyhow::Result<()> {
|
||||||
// Create pgdata subdirs structure
|
// Create pgdata subdirs structure
|
||||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||||
let header = new_tar_header_dir(*dir)?;
|
let header = new_tar_header_dir(*dir)?;
|
||||||
@@ -135,9 +145,14 @@ impl<'a> Basebackup<'a> {
|
|||||||
self.add_twophase_file(xid)?;
|
self.add_twophase_file(xid)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fail_point!("basebackup-before-control-file", |_| {
|
||||||
|
bail!("failpoint basebackup-before-control-file")
|
||||||
|
});
|
||||||
|
|
||||||
// Generate pg_control and bootstrap WAL segment.
|
// Generate pg_control and bootstrap WAL segment.
|
||||||
self.add_pgcontrol_file()?;
|
self.add_pgcontrol_file()?;
|
||||||
self.ar.finish()?;
|
self.ar.finish()?;
|
||||||
|
self.finished = true;
|
||||||
debug!("all tarred up!");
|
debug!("all tarred up!");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -331,6 +346,19 @@ impl<'a> Basebackup<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a, W> Drop for Basebackup<'a, W>
|
||||||
|
where
|
||||||
|
W: Write,
|
||||||
|
{
|
||||||
|
/// If the basebackup was not finished, prevent the Archive::drop() from
|
||||||
|
/// writing the end-of-archive marker.
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if !self.finished {
|
||||||
|
self.ar.get_mut().abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Create new tarball entry header
|
// Create new tarball entry header
|
||||||
//
|
//
|
||||||
@@ -366,3 +394,49 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
|
|||||||
header.set_cksum();
|
header.set_cksum();
|
||||||
Ok(header)
|
Ok(header)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A wrapper that passes through all data to the underlying Write,
|
||||||
|
/// until abort() is called.
|
||||||
|
///
|
||||||
|
/// tar::Builder has an annoying habit of finishing the archive with
|
||||||
|
/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
|
||||||
|
/// even if an error occurs and we don't finish building the archive.
|
||||||
|
/// We'd rather abort writing the tarball immediately than construct
|
||||||
|
/// a seemingly valid but incomplete archive. This wrapper allows us
|
||||||
|
/// to swallow the end-of-archive marker that Builder::drop() emits,
|
||||||
|
/// without writing it to the underlying sink.
|
||||||
|
///
|
||||||
|
struct AbortableWrite<W> {
|
||||||
|
w: W,
|
||||||
|
aborted: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<W> AbortableWrite<W> {
|
||||||
|
pub fn new(w: W) -> Self {
|
||||||
|
AbortableWrite { w, aborted: false }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn abort(&mut self) {
|
||||||
|
self.aborted = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<W> Write for AbortableWrite<W>
|
||||||
|
where
|
||||||
|
W: Write,
|
||||||
|
{
|
||||||
|
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
|
||||||
|
if self.aborted {
|
||||||
|
Ok(data.len())
|
||||||
|
} else {
|
||||||
|
self.w.write(data)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn flush(&mut self) -> io::Result<()> {
|
||||||
|
if self.aborted {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
self.w.flush()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -5,9 +5,9 @@
|
|||||||
//! See also `settings.md` for better description on every parameter.
|
//! See also `settings.md` for better description on every parameter.
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config};
|
use remote_storage::RemoteStorageConfig;
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
@@ -16,12 +16,18 @@ use toml_edit::{Document, Item};
|
|||||||
use url::Url;
|
use url::Url;
|
||||||
use utils::{
|
use utils::{
|
||||||
postgres_backend::AuthType,
|
postgres_backend::AuthType,
|
||||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::layered_repository::TIMELINES_SEGMENT_NAME;
|
use crate::layered_repository::TIMELINES_SEGMENT_NAME;
|
||||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||||
|
|
||||||
|
pub const ZSTD_MAX_SAMPLES: usize = 1024;
|
||||||
|
pub const ZSTD_MIN_SAMPLES: usize = 8; // magic requirement of zstd
|
||||||
|
pub const ZSTD_MAX_SAMPLE_BYTES: usize = 10 * 1024 * 1024; // max memory size for holding samples
|
||||||
|
pub const ZSTD_MAX_DICTIONARY_SIZE: usize = 8 * 1024 - 4; // make dictionary + BLOB length fit in first page
|
||||||
|
pub const ZSTD_COMPRESSION_LEVEL: i32 = 0; // default compression level
|
||||||
|
|
||||||
pub mod defaults {
|
pub mod defaults {
|
||||||
use crate::tenant_config::defaults::*;
|
use crate::tenant_config::defaults::*;
|
||||||
use const_format::formatcp;
|
use const_format::formatcp;
|
||||||
@@ -78,7 +84,7 @@ pub mod defaults {
|
|||||||
pub struct PageServerConf {
|
pub struct PageServerConf {
|
||||||
// Identifier of that particular pageserver so e g safekeepers
|
// Identifier of that particular pageserver so e g safekeepers
|
||||||
// can safely distinguish different pageservers
|
// can safely distinguish different pageservers
|
||||||
pub id: ZNodeId,
|
pub id: NodeId,
|
||||||
|
|
||||||
/// Example (default): 127.0.0.1:64000
|
/// Example (default): 127.0.0.1:64000
|
||||||
pub listen_pg_addr: String,
|
pub listen_pg_addr: String,
|
||||||
@@ -180,7 +186,7 @@ struct PageServerConfigBuilder {
|
|||||||
auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
|
auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
|
||||||
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
|
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
|
||||||
|
|
||||||
id: BuilderValue<ZNodeId>,
|
id: BuilderValue<NodeId>,
|
||||||
|
|
||||||
profiling: BuilderValue<ProfilingConfig>,
|
profiling: BuilderValue<ProfilingConfig>,
|
||||||
broker_etcd_prefix: BuilderValue<String>,
|
broker_etcd_prefix: BuilderValue<String>,
|
||||||
@@ -276,7 +282,7 @@ impl PageServerConfigBuilder {
|
|||||||
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
|
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn id(&mut self, node_id: ZNodeId) {
|
pub fn id(&mut self, node_id: NodeId) {
|
||||||
self.id = BuilderValue::Set(node_id)
|
self.id = BuilderValue::Set(node_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -394,12 +400,12 @@ impl PageServerConf {
|
|||||||
)),
|
)),
|
||||||
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
|
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
|
||||||
"remote_storage" => {
|
"remote_storage" => {
|
||||||
builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
|
builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?))
|
||||||
}
|
}
|
||||||
"tenant_config" => {
|
"tenant_config" => {
|
||||||
t_conf = Self::parse_toml_tenant_conf(item)?;
|
t_conf = Self::parse_toml_tenant_conf(item)?;
|
||||||
}
|
}
|
||||||
"id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
|
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
|
||||||
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
|
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
|
||||||
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
|
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
|
||||||
"broker_endpoints" => builder.broker_endpoints(
|
"broker_endpoints" => builder.broker_endpoints(
|
||||||
@@ -484,64 +490,6 @@ impl PageServerConf {
|
|||||||
Ok(t_conf)
|
Ok(t_conf)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// subroutine of parse_config(), to parse the `[remote_storage]` table.
|
|
||||||
fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
|
|
||||||
let local_path = toml.get("local_path");
|
|
||||||
let bucket_name = toml.get("bucket_name");
|
|
||||||
let bucket_region = toml.get("bucket_region");
|
|
||||||
|
|
||||||
let max_concurrent_syncs = NonZeroUsize::new(
|
|
||||||
parse_optional_integer("max_concurrent_syncs", toml)?
|
|
||||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
|
|
||||||
)
|
|
||||||
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
|
|
||||||
|
|
||||||
let max_sync_errors = NonZeroU32::new(
|
|
||||||
parse_optional_integer("max_sync_errors", toml)?
|
|
||||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
|
||||||
)
|
|
||||||
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
|
||||||
|
|
||||||
let concurrency_limit = NonZeroUsize::new(
|
|
||||||
parse_optional_integer("concurrency_limit", toml)?
|
|
||||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
|
|
||||||
)
|
|
||||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
|
||||||
|
|
||||||
let storage = match (local_path, bucket_name, bucket_region) {
|
|
||||||
(None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
|
|
||||||
(_, Some(_), None) => {
|
|
||||||
bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
|
|
||||||
}
|
|
||||||
(_, None, Some(_)) => {
|
|
||||||
bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
|
|
||||||
}
|
|
||||||
(None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
|
|
||||||
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
|
|
||||||
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
|
|
||||||
prefix_in_bucket: toml
|
|
||||||
.get("prefix_in_bucket")
|
|
||||||
.map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
|
|
||||||
.transpose()?,
|
|
||||||
endpoint: toml
|
|
||||||
.get("endpoint")
|
|
||||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
|
||||||
.transpose()?,
|
|
||||||
concurrency_limit,
|
|
||||||
}),
|
|
||||||
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
|
|
||||||
parse_toml_string("local_path", local_path)?,
|
|
||||||
)),
|
|
||||||
(Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(RemoteStorageConfig {
|
|
||||||
max_concurrent_syncs,
|
|
||||||
max_sync_errors,
|
|
||||||
storage,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub fn test_repo_dir(test_name: &str) -> PathBuf {
|
pub fn test_repo_dir(test_name: &str) -> PathBuf {
|
||||||
PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||||
@@ -550,7 +498,7 @@ impl PageServerConf {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
|
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
|
||||||
PageServerConf {
|
PageServerConf {
|
||||||
id: ZNodeId(0),
|
id: NodeId(0),
|
||||||
wait_lsn_timeout: Duration::from_secs(60),
|
wait_lsn_timeout: Duration::from_secs(60),
|
||||||
wal_redo_timeout: Duration::from_secs(60),
|
wal_redo_timeout: Duration::from_secs(60),
|
||||||
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
|
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
|
||||||
@@ -592,23 +540,6 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
|
|||||||
Ok(i as u64)
|
Ok(i as u64)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
|
|
||||||
where
|
|
||||||
I: TryFrom<i64, Error = E>,
|
|
||||||
E: std::error::Error + Send + Sync + 'static,
|
|
||||||
{
|
|
||||||
let toml_integer = match item.get(name) {
|
|
||||||
Some(item) => item
|
|
||||||
.as_integer()
|
|
||||||
.with_context(|| format!("configure option {name} is not an integer"))?,
|
|
||||||
None => return Ok(None),
|
|
||||||
};
|
|
||||||
|
|
||||||
I::try_from(toml_integer)
|
|
||||||
.map(Some)
|
|
||||||
.with_context(|| format!("configure option {name} is too large"))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
|
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
|
||||||
let s = item
|
let s = item
|
||||||
.as_str()
|
.as_str()
|
||||||
@@ -651,8 +582,12 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::fs;
|
use std::{
|
||||||
|
fs,
|
||||||
|
num::{NonZeroU32, NonZeroUsize},
|
||||||
|
};
|
||||||
|
|
||||||
|
use remote_storage::{RemoteStorageKind, S3Config};
|
||||||
use tempfile::{tempdir, TempDir};
|
use tempfile::{tempdir, TempDir};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -693,7 +628,7 @@ id = 10
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
parsed_config,
|
parsed_config,
|
||||||
PageServerConf {
|
PageServerConf {
|
||||||
id: ZNodeId(10),
|
id: NodeId(10),
|
||||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||||
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
|
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
|
||||||
@@ -737,7 +672,7 @@ id = 10
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
parsed_config,
|
parsed_config,
|
||||||
PageServerConf {
|
PageServerConf {
|
||||||
id: ZNodeId(10),
|
id: NodeId(10),
|
||||||
listen_pg_addr: "127.0.0.1:64000".to_string(),
|
listen_pg_addr: "127.0.0.1:64000".to_string(),
|
||||||
listen_http_addr: "127.0.0.1:9898".to_string(),
|
listen_http_addr: "127.0.0.1:9898".to_string(),
|
||||||
wait_lsn_timeout: Duration::from_secs(111),
|
wait_lsn_timeout: Duration::from_secs(111),
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize};
|
|||||||
use serde_with::{serde_as, DisplayFromStr};
|
use serde_with::{serde_as, DisplayFromStr};
|
||||||
use utils::{
|
use utils::{
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
#[serde_as]
|
#[serde_as]
|
||||||
@@ -42,7 +42,7 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId
|
|||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
pub struct StatusResponse {
|
pub struct StatusResponse {
|
||||||
pub id: ZNodeId,
|
pub id: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TenantCreateRequest {
|
impl TenantCreateRequest {
|
||||||
|
|||||||
@@ -1230,7 +1230,7 @@ impl LayeredTimeline {
|
|||||||
}),
|
}),
|
||||||
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),
|
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),
|
||||||
|
|
||||||
last_freeze_at: AtomicLsn::new(0),
|
last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0),
|
||||||
|
|
||||||
ancestor_timeline: ancestor,
|
ancestor_timeline: ancestor,
|
||||||
ancestor_lsn: metadata.ancestor_lsn(),
|
ancestor_lsn: metadata.ancestor_lsn(),
|
||||||
@@ -2518,7 +2518,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
|||||||
bail!("couldn't find an unused backup number for {:?}", path)
|
bail!("couldn't find an unused backup number for {:?}", path)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn load_metadata(
|
pub fn load_metadata(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: ZTimelineId,
|
timeline_id: ZTimelineId,
|
||||||
tenant_id: ZTenantId,
|
tenant_id: ZTenantId,
|
||||||
|
|||||||
@@ -23,6 +23,25 @@
|
|||||||
//! "values" part. The actual page images and WAL records are stored in the
|
//! "values" part. The actual page images and WAL records are stored in the
|
||||||
//! "values" part.
|
//! "values" part.
|
||||||
//!
|
//!
|
||||||
|
//! # Compression
|
||||||
|
//!
|
||||||
|
//! Each value is stored as a Blob, which can optionally be compressed. Compression
|
||||||
|
//! is done by ZStandard, in dictionary mode, which gives pretty good compression
|
||||||
|
//! ratio even for small inputs like WAL records.
|
||||||
|
//!
|
||||||
|
//! The dictionary is built separately for each delta layer file, and stored in
|
||||||
|
//! the file itself.
|
||||||
|
//!
|
||||||
|
//! TODO: The ZStandard format includes constant 4-byte "magic bytes" in the beginning
|
||||||
|
//! of each compressed block. With small values like WAL records, that's pretty wasteful.
|
||||||
|
//! We could disable those bytes by setting the `include_magicbytes` flag to false,
|
||||||
|
//! but as of this writing that's considered experimental in the zstd crate, and the
|
||||||
|
//! zstd::bulk::Decompressor::upper_bound() function doesn't work without the magic bytes
|
||||||
|
//! so we would have to find a different way of allocating the decompression buffer if
|
||||||
|
//! we did that.
|
||||||
|
//!
|
||||||
|
|
||||||
|
use crate::config;
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||||
use crate::layered_repository::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
use crate::layered_repository::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||||
@@ -36,7 +55,7 @@ use crate::repository::{Key, Value, KEY_SIZE};
|
|||||||
use crate::virtual_file::VirtualFile;
|
use crate::virtual_file::VirtualFile;
|
||||||
use crate::walrecord;
|
use crate::walrecord;
|
||||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use rand::{distributions::Alphanumeric, Rng};
|
use rand::{distributions::Alphanumeric, Rng};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::fs;
|
use std::fs;
|
||||||
@@ -75,6 +94,9 @@ struct Summary {
|
|||||||
index_start_blk: u32,
|
index_start_blk: u32,
|
||||||
/// Block within the 'index', where the B-tree root page is stored
|
/// Block within the 'index', where the B-tree root page is stored
|
||||||
index_root_blk: u32,
|
index_root_blk: u32,
|
||||||
|
|
||||||
|
/// Byte offset of the compression dictionary, or 0 if no compression
|
||||||
|
dictionary_offset: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<&DeltaLayer> for Summary {
|
impl From<&DeltaLayer> for Summary {
|
||||||
@@ -90,33 +112,46 @@ impl From<&DeltaLayer> for Summary {
|
|||||||
|
|
||||||
index_start_blk: 0,
|
index_start_blk: 0,
|
||||||
index_root_blk: 0,
|
index_root_blk: 0,
|
||||||
|
|
||||||
|
dictionary_offset: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flag indicating that this version initialize the page
|
|
||||||
const WILL_INIT: u64 = 1;
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Struct representing reference to BLOB in layers. Reference contains BLOB
|
/// Struct representing reference to BLOB in the file. The reference contains
|
||||||
/// offset, and for WAL records it also contains `will_init` flag. The flag
|
/// the offset to the BLOB within the file, a flag indicating if it's
|
||||||
|
/// compressed or not, and also the `will_init` flag. The `will_init` flag
|
||||||
/// helps to determine the range of records that needs to be applied, without
|
/// helps to determine the range of records that needs to be applied, without
|
||||||
/// reading/deserializing records themselves.
|
/// reading/deserializing records themselves.
|
||||||
///
|
///
|
||||||
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
|
||||||
struct BlobRef(u64);
|
struct BlobRef(u64);
|
||||||
|
|
||||||
|
/// Flag indicating that this blob is compressed
|
||||||
|
const BLOB_COMPRESSED: u64 = 1;
|
||||||
|
|
||||||
|
/// Flag indicating that this version initializes the page
|
||||||
|
const WILL_INIT: u64 = 2;
|
||||||
|
|
||||||
impl BlobRef {
|
impl BlobRef {
|
||||||
|
pub fn compressed(&self) -> bool {
|
||||||
|
(self.0 & BLOB_COMPRESSED) != 0
|
||||||
|
}
|
||||||
|
|
||||||
pub fn will_init(&self) -> bool {
|
pub fn will_init(&self) -> bool {
|
||||||
(self.0 & WILL_INIT) != 0
|
(self.0 & WILL_INIT) != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn pos(&self) -> u64 {
|
pub fn pos(&self) -> u64 {
|
||||||
self.0 >> 1
|
self.0 >> 2
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new(pos: u64, will_init: bool) -> BlobRef {
|
pub fn new(pos: u64, compressed: bool, will_init: bool) -> BlobRef {
|
||||||
let mut blob_ref = pos << 1;
|
let mut blob_ref = pos << 2;
|
||||||
|
if compressed {
|
||||||
|
blob_ref |= BLOB_COMPRESSED;
|
||||||
|
}
|
||||||
if will_init {
|
if will_init {
|
||||||
blob_ref |= WILL_INIT;
|
blob_ref |= WILL_INIT;
|
||||||
}
|
}
|
||||||
@@ -193,6 +228,37 @@ pub struct DeltaLayerInner {
|
|||||||
|
|
||||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||||
file: Option<FileBlockReader<VirtualFile>>,
|
file: Option<FileBlockReader<VirtualFile>>,
|
||||||
|
|
||||||
|
/// Compression dictionary, as raw bytes, and in prepared format ready for use
|
||||||
|
/// for decompression. None if there is no dictionary, or if 'loaded' is false.
|
||||||
|
dictionary: Option<(Vec<u8>, zstd::dict::DecoderDictionary<'static>)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DeltaLayerInner {
|
||||||
|
// Create a new Decompressor, using the prepared dictionary
|
||||||
|
fn create_decompressor(&self) -> Result<Option<zstd::bulk::Decompressor<'_>>> {
|
||||||
|
if let Some((_, dict)) = &self.dictionary {
|
||||||
|
let decompressor = zstd::bulk::Decompressor::with_prepared_dictionary(dict)?;
|
||||||
|
Ok(Some(decompressor))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new Decompressor, without using the prepared dictionary.
|
||||||
|
//
|
||||||
|
// For the cases where 'create_decompressor' cannot be used, i.e. when the
|
||||||
|
// Decompressor needs to outlive 'self'.
|
||||||
|
fn create_decompressor_not_prepared(
|
||||||
|
&self,
|
||||||
|
) -> Result<Option<zstd::bulk::Decompressor<'static>>> {
|
||||||
|
if let Some((dict, _)) = &self.dictionary {
|
||||||
|
let decompressor = zstd::bulk::Decompressor::with_dictionary(dict)?;
|
||||||
|
Ok(Some(decompressor))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Layer for DeltaLayer {
|
impl Layer for DeltaLayer {
|
||||||
@@ -234,6 +300,8 @@ impl Layer for DeltaLayer {
|
|||||||
{
|
{
|
||||||
// Open the file and lock the metadata in memory
|
// Open the file and lock the metadata in memory
|
||||||
let inner = self.load()?;
|
let inner = self.load()?;
|
||||||
|
let mut decompressor = inner.create_decompressor()?;
|
||||||
|
let mut decompress_buf = Vec::new();
|
||||||
|
|
||||||
// Scan the page versions backwards, starting from `lsn`.
|
// Scan the page versions backwards, starting from `lsn`.
|
||||||
let file = inner.file.as_ref().unwrap();
|
let file = inner.file.as_ref().unwrap();
|
||||||
@@ -244,7 +312,7 @@ impl Layer for DeltaLayer {
|
|||||||
);
|
);
|
||||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||||
|
|
||||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
let mut blob_refs: Vec<(Lsn, BlobRef)> = Vec::new();
|
||||||
|
|
||||||
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
|
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
|
||||||
let blob_ref = BlobRef(value);
|
let blob_ref = BlobRef(value);
|
||||||
@@ -255,21 +323,36 @@ impl Layer for DeltaLayer {
|
|||||||
if entry_lsn < lsn_range.start {
|
if entry_lsn < lsn_range.start {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
offsets.push((entry_lsn, blob_ref.pos()));
|
blob_refs.push((entry_lsn, blob_ref));
|
||||||
|
|
||||||
!blob_ref.will_init()
|
!blob_ref.will_init()
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
// Ok, 'blob_refs' now contains references to all the entries we need to read
|
||||||
let mut cursor = file.block_cursor();
|
let mut cursor = file.block_cursor();
|
||||||
for (entry_lsn, pos) in offsets {
|
for (entry_lsn, blob_ref) in blob_refs {
|
||||||
let buf = cursor.read_blob(pos).with_context(|| {
|
let buf = cursor.read_blob(blob_ref.pos()).with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
"Failed to read blob from virtual file {}",
|
"Failed to read blob from virtual file {}",
|
||||||
file.file.path.display()
|
file.file.path.display()
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
let val = Value::des(&buf).with_context(|| {
|
let uncompressed_bytes = if blob_ref.compressed() {
|
||||||
|
if let Some(ref mut decompressor) = decompressor {
|
||||||
|
let decompressed_max_len = zstd::bulk::Decompressor::upper_bound(&buf)
|
||||||
|
.ok_or_else(|| anyhow!("could not get decompressed length"))?;
|
||||||
|
decompress_buf.clear();
|
||||||
|
decompress_buf.reserve(decompressed_max_len);
|
||||||
|
let _ = decompressor.decompress_to_buffer(&buf, &mut decompress_buf)?;
|
||||||
|
&decompress_buf
|
||||||
|
} else {
|
||||||
|
bail!("blob is compressed, but there was no dictionary");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
&buf
|
||||||
|
};
|
||||||
|
|
||||||
|
let val = Value::des(uncompressed_bytes).with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
"Failed to deserialize file blob from virtual file {}",
|
"Failed to deserialize file blob from virtual file {}",
|
||||||
file.file.path.display()
|
file.file.path.display()
|
||||||
@@ -347,7 +430,6 @@ impl Layer for DeltaLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let inner = self.load()?;
|
let inner = self.load()?;
|
||||||
|
|
||||||
println!(
|
println!(
|
||||||
"index_start_blk: {}, root {}",
|
"index_start_blk: {}, root {}",
|
||||||
inner.index_start_blk, inner.index_root_blk
|
inner.index_start_blk, inner.index_root_blk
|
||||||
@@ -363,19 +445,49 @@ impl Layer for DeltaLayer {
|
|||||||
tree_reader.dump()?;
|
tree_reader.dump()?;
|
||||||
|
|
||||||
let mut cursor = file.block_cursor();
|
let mut cursor = file.block_cursor();
|
||||||
|
let mut decompressor = inner.create_decompressor()?;
|
||||||
|
let mut decompress_buf = Vec::new();
|
||||||
|
|
||||||
// A subroutine to dump a single blob
|
// A subroutine to dump a single blob
|
||||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
let buf = cursor.read_blob(blob_ref.pos()).with_context(|| {
|
||||||
let val = Value::des(&buf)?;
|
format!(
|
||||||
|
"Failed to read blob from virtual file {}",
|
||||||
|
file.file.path.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let uncompressed_bytes = if blob_ref.compressed() {
|
||||||
|
if let Some(ref mut decompressor) = decompressor {
|
||||||
|
let decompressed_max_len = zstd::bulk::Decompressor::upper_bound(&buf)
|
||||||
|
.ok_or_else(|| anyhow!("could not get decompressed length"))?;
|
||||||
|
decompress_buf.clear();
|
||||||
|
decompress_buf.reserve(decompressed_max_len);
|
||||||
|
let _ = decompressor.decompress_to_buffer(&buf, &mut decompress_buf)?;
|
||||||
|
&decompress_buf
|
||||||
|
} else {
|
||||||
|
bail!("blob is compressed, but there was no dictionary");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
&buf
|
||||||
|
};
|
||||||
|
|
||||||
|
let val = Value::des(uncompressed_bytes).with_context(|| {
|
||||||
|
format!(
|
||||||
|
"Failed to deserialize file blob from virtual file {}",
|
||||||
|
file.file.path.display()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
let desc = match val {
|
let desc = match val {
|
||||||
Value::Image(img) => {
|
Value::Image(img) => {
|
||||||
format!(" img {} bytes", img.len())
|
format!("img {} bytes, {} compressed", img.len(), buf.len())
|
||||||
}
|
}
|
||||||
Value::WalRecord(rec) => {
|
Value::WalRecord(rec) => {
|
||||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||||
format!(
|
format!(
|
||||||
" rec {} bytes will_init: {} {}",
|
"rec {} bytes, {} compressed, will_init {}: {}",
|
||||||
|
uncompressed_bytes.len(),
|
||||||
buf.len(),
|
buf.len(),
|
||||||
rec.will_init(),
|
rec.will_init(),
|
||||||
wal_desc
|
wal_desc
|
||||||
@@ -494,6 +606,7 @@ impl DeltaLayer {
|
|||||||
let mut expected_summary = Summary::from(self);
|
let mut expected_summary = Summary::from(self);
|
||||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||||
|
expected_summary.dictionary_offset = actual_summary.dictionary_offset;
|
||||||
if actual_summary != expected_summary {
|
if actual_summary != expected_summary {
|
||||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||||
}
|
}
|
||||||
@@ -512,6 +625,13 @@ impl DeltaLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Load and prepare the dictionary, if any
|
||||||
|
if actual_summary.dictionary_offset != 0 {
|
||||||
|
let mut cursor = file.block_cursor();
|
||||||
|
let dict = cursor.read_blob(actual_summary.dictionary_offset)?;
|
||||||
|
let prepared_dict = zstd::dict::DecoderDictionary::copy(&dict);
|
||||||
|
inner.dictionary = Some((dict, prepared_dict));
|
||||||
|
}
|
||||||
inner.index_start_blk = actual_summary.index_start_blk;
|
inner.index_start_blk = actual_summary.index_start_blk;
|
||||||
inner.index_root_blk = actual_summary.index_root_blk;
|
inner.index_root_blk = actual_summary.index_root_blk;
|
||||||
|
|
||||||
@@ -537,6 +657,7 @@ impl DeltaLayer {
|
|||||||
inner: RwLock::new(DeltaLayerInner {
|
inner: RwLock::new(DeltaLayerInner {
|
||||||
loaded: false,
|
loaded: false,
|
||||||
file: None,
|
file: None,
|
||||||
|
dictionary: None,
|
||||||
index_start_blk: 0,
|
index_start_blk: 0,
|
||||||
index_root_blk: 0,
|
index_root_blk: 0,
|
||||||
}),
|
}),
|
||||||
@@ -564,6 +685,7 @@ impl DeltaLayer {
|
|||||||
inner: RwLock::new(DeltaLayerInner {
|
inner: RwLock::new(DeltaLayerInner {
|
||||||
loaded: false,
|
loaded: false,
|
||||||
file: None,
|
file: None,
|
||||||
|
dictionary: None,
|
||||||
index_start_blk: 0,
|
index_start_blk: 0,
|
||||||
index_root_blk: 0,
|
index_root_blk: 0,
|
||||||
}),
|
}),
|
||||||
@@ -599,6 +721,16 @@ impl DeltaLayer {
|
|||||||
///
|
///
|
||||||
/// 3. Call `finish`.
|
/// 3. Call `finish`.
|
||||||
///
|
///
|
||||||
|
///
|
||||||
|
/// To train the dictionary for compression, the first ZSTD_MAX_SAMPLES values
|
||||||
|
/// (or up to ZSTD_MAX_SAMPLE_BYTES) are buffered in memory, before writing them
|
||||||
|
/// to disk. When the "sample buffer" fills up, the buffered values are used
|
||||||
|
/// to train a zstandard dictionary, which is then used to compress all the
|
||||||
|
/// buffered values, and all subsequent values. So the dictionary is built
|
||||||
|
/// based on just the first values, but in practice that usually gives pretty
|
||||||
|
/// good compression for all subsequent data as well. Things like page and
|
||||||
|
/// tuple headers are similar across all pages of the same relation.
|
||||||
|
///
|
||||||
pub struct DeltaLayerWriter {
|
pub struct DeltaLayerWriter {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
@@ -611,6 +743,13 @@ pub struct DeltaLayerWriter {
|
|||||||
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
|
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
|
||||||
|
|
||||||
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
|
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
|
||||||
|
compressor: Option<zstd::bulk::Compressor<'static>>,
|
||||||
|
dictionary_offset: u64,
|
||||||
|
|
||||||
|
training: bool,
|
||||||
|
sample_key_lsn_willinit: Vec<(Key, Lsn, bool)>,
|
||||||
|
sample_sizes: Vec<usize>,
|
||||||
|
sample_data: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DeltaLayerWriter {
|
impl DeltaLayerWriter {
|
||||||
@@ -641,7 +780,6 @@ impl DeltaLayerWriter {
|
|||||||
// Initialize the b-tree index builder
|
// Initialize the b-tree index builder
|
||||||
let block_buf = BlockBuf::new();
|
let block_buf = BlockBuf::new();
|
||||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||||
|
|
||||||
Ok(DeltaLayerWriter {
|
Ok(DeltaLayerWriter {
|
||||||
conf,
|
conf,
|
||||||
path,
|
path,
|
||||||
@@ -651,6 +789,13 @@ impl DeltaLayerWriter {
|
|||||||
lsn_range,
|
lsn_range,
|
||||||
tree: tree_builder,
|
tree: tree_builder,
|
||||||
blob_writer,
|
blob_writer,
|
||||||
|
compressor: None,
|
||||||
|
dictionary_offset: 0,
|
||||||
|
|
||||||
|
training: true,
|
||||||
|
sample_key_lsn_willinit: Vec::new(),
|
||||||
|
sample_sizes: Vec::new(),
|
||||||
|
sample_data: Vec::new(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -660,18 +805,122 @@ impl DeltaLayerWriter {
|
|||||||
/// The values must be appended in key, lsn order.
|
/// The values must be appended in key, lsn order.
|
||||||
///
|
///
|
||||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||||
|
let blob_content = &Value::ser(&val)?;
|
||||||
|
|
||||||
|
// Are we still accumulating values for training the compression dictionary?
|
||||||
|
if self.training {
|
||||||
|
self.put_value_train(key, lsn, val.will_init(), blob_content)?;
|
||||||
|
|
||||||
|
if self.sample_sizes.len() >= config::ZSTD_MAX_SAMPLES
|
||||||
|
|| self.sample_data.len() >= config::ZSTD_MAX_SAMPLE_BYTES
|
||||||
|
{
|
||||||
|
self.finish_training()?;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
self.put_value_flush(key, lsn, val.will_init(), blob_content)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Accumulate one key-value pair in the samples buffer
|
||||||
|
fn put_value_train(&mut self, key: Key, lsn: Lsn, will_init: bool, bytes: &[u8]) -> Result<()> {
|
||||||
|
assert!(self.training);
|
||||||
|
self.sample_key_lsn_willinit.push((key, lsn, will_init));
|
||||||
|
self.sample_sizes.push(bytes.len());
|
||||||
|
self.sample_data.extend_from_slice(bytes);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Train the compression dictionary, and flush out all the accumulated
|
||||||
|
/// key-value pairs to disk.
|
||||||
|
fn finish_training(&mut self) -> Result<()> {
|
||||||
|
assert!(self.training);
|
||||||
|
assert!(self.sample_sizes.len() == self.sample_key_lsn_willinit.len());
|
||||||
|
|
||||||
|
// Create the dictionary, if we had enough samples for it.
|
||||||
|
//
|
||||||
|
// If there weren't enough samples, we don't do any compression at
|
||||||
|
// all. Possibly we could still benefit from compression; for example
|
||||||
|
// if you have only one gigantic value in a single layer, it would
|
||||||
|
// still be good to compress that, without a dictionary. But we don't
|
||||||
|
// do that currently.
|
||||||
|
if self.sample_sizes.len() >= config::ZSTD_MIN_SAMPLES {
|
||||||
|
let dictionary = zstd::dict::from_continuous(
|
||||||
|
&self.sample_data,
|
||||||
|
&self.sample_sizes,
|
||||||
|
config::ZSTD_MAX_DICTIONARY_SIZE,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let off = self.blob_writer.write_blob(&dictionary)?;
|
||||||
|
self.dictionary_offset = off;
|
||||||
|
|
||||||
|
let compressor = zstd::bulk::Compressor::with_dictionary(
|
||||||
|
config::ZSTD_COMPRESSION_LEVEL,
|
||||||
|
&dictionary,
|
||||||
|
)?;
|
||||||
|
self.compressor = Some(compressor);
|
||||||
|
};
|
||||||
|
self.training = false;
|
||||||
|
|
||||||
|
// release the memory used by the sample buffers
|
||||||
|
let sample_key_lsn_willinit = std::mem::take(&mut self.sample_key_lsn_willinit);
|
||||||
|
let sample_sizes = std::mem::take(&mut self.sample_sizes);
|
||||||
|
let sample_data = std::mem::take(&mut self.sample_data);
|
||||||
|
|
||||||
|
// Compress and write out all the buffered key-value pairs
|
||||||
|
let mut buf_idx: usize = 0;
|
||||||
|
for ((key, lsn, will_init), len) in
|
||||||
|
itertools::izip!(sample_key_lsn_willinit.iter(), sample_sizes.iter())
|
||||||
|
{
|
||||||
|
let end = buf_idx + len;
|
||||||
|
self.put_value_flush(*key, *lsn, *will_init, &sample_data[buf_idx..end])?;
|
||||||
|
buf_idx = end;
|
||||||
|
}
|
||||||
|
assert!(buf_idx == sample_data.len());
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write a key-value pair to the file, compressing it if applicable.
|
||||||
|
pub fn put_value_flush(
|
||||||
|
&mut self,
|
||||||
|
key: Key,
|
||||||
|
lsn: Lsn,
|
||||||
|
will_init: bool,
|
||||||
|
bytes: &[u8],
|
||||||
|
) -> Result<()> {
|
||||||
|
assert!(!self.training);
|
||||||
assert!(self.lsn_range.start <= lsn);
|
assert!(self.lsn_range.start <= lsn);
|
||||||
|
|
||||||
let off = self.blob_writer.write_blob(&Value::ser(&val)?)?;
|
let mut blob_content = bytes;
|
||||||
|
let mut compressed = false;
|
||||||
|
|
||||||
let blob_ref = BlobRef::new(off, val.will_init());
|
// Try to compress the blob
|
||||||
|
let compressed_bytes;
|
||||||
|
if let Some(ref mut compressor) = self.compressor {
|
||||||
|
compressed_bytes = compressor.compress(blob_content)?;
|
||||||
|
// If compressed version is not any smaller than the original,
|
||||||
|
// store it uncompressed.
|
||||||
|
if compressed_bytes.len() < blob_content.len() {
|
||||||
|
blob_content = &compressed_bytes;
|
||||||
|
compressed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write it to the file
|
||||||
|
let off = self.blob_writer.write_blob(blob_content)?;
|
||||||
|
let blob_ref = BlobRef::new(off, compressed, will_init);
|
||||||
|
|
||||||
|
// And store the reference in the B-tree
|
||||||
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
|
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
|
||||||
self.tree.append(&delta_key.0, blob_ref.0)?;
|
self.tree.append(&delta_key.0, blob_ref.0)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Return an estimate of the size of the file, if it was finished now.
|
||||||
|
///
|
||||||
pub fn size(&self) -> u64 {
|
pub fn size(&self) -> u64 {
|
||||||
self.blob_writer.size() + self.tree.borrow_writer().size()
|
self.blob_writer.size() + self.tree.borrow_writer().size()
|
||||||
}
|
}
|
||||||
@@ -679,7 +928,11 @@ impl DeltaLayerWriter {
|
|||||||
///
|
///
|
||||||
/// Finish writing the delta layer.
|
/// Finish writing the delta layer.
|
||||||
///
|
///
|
||||||
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||||
|
if self.training {
|
||||||
|
self.finish_training()?;
|
||||||
|
}
|
||||||
|
|
||||||
let index_start_blk =
|
let index_start_blk =
|
||||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||||
|
|
||||||
@@ -703,6 +956,7 @@ impl DeltaLayerWriter {
|
|||||||
lsn_range: self.lsn_range.clone(),
|
lsn_range: self.lsn_range.clone(),
|
||||||
index_start_blk,
|
index_start_blk,
|
||||||
index_root_blk,
|
index_root_blk,
|
||||||
|
dictionary_offset: self.dictionary_offset,
|
||||||
};
|
};
|
||||||
file.seek(SeekFrom::Start(0))?;
|
file.seek(SeekFrom::Start(0))?;
|
||||||
Summary::ser_into(&summary, &mut file)?;
|
Summary::ser_into(&summary, &mut file)?;
|
||||||
@@ -719,6 +973,7 @@ impl DeltaLayerWriter {
|
|||||||
inner: RwLock::new(DeltaLayerInner {
|
inner: RwLock::new(DeltaLayerInner {
|
||||||
loaded: false,
|
loaded: false,
|
||||||
file: None,
|
file: None,
|
||||||
|
dictionary: None,
|
||||||
index_start_blk,
|
index_start_blk,
|
||||||
index_root_blk,
|
index_root_blk,
|
||||||
}),
|
}),
|
||||||
@@ -758,6 +1013,9 @@ struct DeltaValueIter<'a> {
|
|||||||
all_offsets: Vec<(DeltaKey, BlobRef)>,
|
all_offsets: Vec<(DeltaKey, BlobRef)>,
|
||||||
next_idx: usize,
|
next_idx: usize,
|
||||||
reader: BlockCursor<Adapter<'a>>,
|
reader: BlockCursor<Adapter<'a>>,
|
||||||
|
decompressor: Option<zstd::bulk::Decompressor<'a>>,
|
||||||
|
|
||||||
|
decompress_buf: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
|
struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
|
||||||
@@ -797,10 +1055,20 @@ impl<'a> DeltaValueIter<'a> {
|
|||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
// We cannot use inner.create_decompressor() here, because it returns
|
||||||
|
// a Decompressor with lifetime that depends on 'inner', and that
|
||||||
|
// doesn't live long enough here. Cannot use the prepared dictionary
|
||||||
|
// for that reason either. Doesn't matter too much in practice because
|
||||||
|
// this Iterator is used for bulk operations, and loading the dictionary
|
||||||
|
// isn't that expensive in comparison.
|
||||||
|
let decompressor = inner.create_decompressor_not_prepared()?;
|
||||||
|
|
||||||
let iter = DeltaValueIter {
|
let iter = DeltaValueIter {
|
||||||
all_offsets,
|
all_offsets,
|
||||||
next_idx: 0,
|
next_idx: 0,
|
||||||
reader: BlockCursor::new(Adapter(inner)),
|
reader: BlockCursor::new(Adapter(inner)),
|
||||||
|
decompressor,
|
||||||
|
decompress_buf: Vec::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(iter)
|
Ok(iter)
|
||||||
@@ -814,7 +1082,31 @@ impl<'a> DeltaValueIter<'a> {
|
|||||||
let lsn = delta_key.lsn();
|
let lsn = delta_key.lsn();
|
||||||
|
|
||||||
let buf = self.reader.read_blob(blob_ref.pos())?;
|
let buf = self.reader.read_blob(blob_ref.pos())?;
|
||||||
let val = Value::des(&buf)?;
|
let uncompressed_bytes = if blob_ref.compressed() {
|
||||||
|
if let Some(decompressor) = &mut self.decompressor {
|
||||||
|
let decompressed_max_len = zstd::bulk::Decompressor::upper_bound(&buf)
|
||||||
|
.ok_or_else(|| {
|
||||||
|
anyhow!(
|
||||||
|
"could not get decompressed length at offset {}",
|
||||||
|
blob_ref.pos()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
self.decompress_buf.clear();
|
||||||
|
self.decompress_buf.reserve(decompressed_max_len);
|
||||||
|
let _ = decompressor.decompress_to_buffer(&buf, &mut self.decompress_buf)?;
|
||||||
|
&self.decompress_buf
|
||||||
|
} else {
|
||||||
|
bail!("blob is compressed, but there was no dictionary");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
&buf
|
||||||
|
};
|
||||||
|
let val = Value::des(uncompressed_bytes).with_context(|| {
|
||||||
|
format!(
|
||||||
|
"Failed to deserialize file blob at offset {}",
|
||||||
|
blob_ref.pos()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
self.next_idx += 1;
|
self.next_idx += 1;
|
||||||
Ok(Some((key, lsn, val)))
|
Ok(Some((key, lsn, val)))
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -19,6 +19,11 @@
|
|||||||
//! layer, and offsets to the other parts. The "index" is a B-tree,
|
//! layer, and offsets to the other parts. The "index" is a B-tree,
|
||||||
//! mapping from Key to an offset in the "values" part. The
|
//! mapping from Key to an offset in the "values" part. The
|
||||||
//! actual page images are stored in the "values" part.
|
//! actual page images are stored in the "values" part.
|
||||||
|
//!
|
||||||
|
//! Each page image is compressed with ZStandard. See the Compression section
|
||||||
|
//! in delta_layer.rs for more discussion. The difference from a delta
|
||||||
|
//! layer is that we don't currently use a dictionary for image layers.
|
||||||
|
use crate::config;
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||||
use crate::layered_repository::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
use crate::layered_repository::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||||
@@ -90,6 +95,35 @@ impl From<&ImageLayer> for Summary {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Struct representing reference to BLOB in the file. In an image layer,
|
||||||
|
/// each blob is an image of the page. It can be compressed or not, and
|
||||||
|
/// that is stored in low bit of the BlobRef.
|
||||||
|
///
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
|
||||||
|
struct BlobRef(u64);
|
||||||
|
|
||||||
|
/// Flag indicating that this blob is compressed
|
||||||
|
const BLOB_COMPRESSED: u64 = 1;
|
||||||
|
|
||||||
|
impl BlobRef {
|
||||||
|
pub fn compressed(&self) -> bool {
|
||||||
|
(self.0 & BLOB_COMPRESSED) != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn pos(&self) -> u64 {
|
||||||
|
self.0 >> 1
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new(pos: u64, compressed: bool) -> BlobRef {
|
||||||
|
let mut blob_ref = pos << 1;
|
||||||
|
if compressed {
|
||||||
|
blob_ref |= BLOB_COMPRESSED;
|
||||||
|
}
|
||||||
|
BlobRef(blob_ref)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// ImageLayer is the in-memory data structure associated with an on-disk image
|
/// ImageLayer is the in-memory data structure associated with an on-disk image
|
||||||
/// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a
|
/// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a
|
||||||
@@ -121,6 +155,13 @@ pub struct ImageLayerInner {
|
|||||||
file: Option<FileBlockReader<VirtualFile>>,
|
file: Option<FileBlockReader<VirtualFile>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl ImageLayerInner {
|
||||||
|
fn create_decompressor(&self) -> Result<zstd::bulk::Decompressor<'_>> {
|
||||||
|
let decompressor = zstd::bulk::Decompressor::new()?;
|
||||||
|
Ok(decompressor)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Layer for ImageLayer {
|
impl Layer for ImageLayer {
|
||||||
fn filename(&self) -> PathBuf {
|
fn filename(&self) -> PathBuf {
|
||||||
PathBuf::from(self.layer_name().to_string())
|
PathBuf::from(self.layer_name().to_string())
|
||||||
@@ -160,20 +201,33 @@ impl Layer for ImageLayer {
|
|||||||
|
|
||||||
let inner = self.load()?;
|
let inner = self.load()?;
|
||||||
|
|
||||||
|
let mut decompressor = inner.create_decompressor()?;
|
||||||
|
|
||||||
let file = inner.file.as_ref().unwrap();
|
let file = inner.file.as_ref().unwrap();
|
||||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||||
|
|
||||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||||
key.write_to_byte_slice(&mut keybuf);
|
key.write_to_byte_slice(&mut keybuf);
|
||||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
if let Some(value) = tree_reader.get(&keybuf)? {
|
||||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
let blob_ref = BlobRef(value);
|
||||||
format!(
|
let blob_content =
|
||||||
"failed to read value from data file {} at offset {}",
|
file.block_cursor()
|
||||||
self.filename().display(),
|
.read_blob(blob_ref.pos())
|
||||||
offset
|
.with_context(|| {
|
||||||
)
|
format!(
|
||||||
})?;
|
"failed to read value from data file {} at offset {}",
|
||||||
let value = Bytes::from(blob);
|
self.filename().display(),
|
||||||
|
blob_ref.pos()
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let uncompressed_bytes = if blob_ref.compressed() {
|
||||||
|
decompressor.decompress(&blob_content, PAGE_SZ)?
|
||||||
|
} else {
|
||||||
|
blob_content
|
||||||
|
};
|
||||||
|
|
||||||
|
let value = Bytes::from(uncompressed_bytes);
|
||||||
|
|
||||||
reconstruct_state.img = Some((self.lsn, value));
|
reconstruct_state.img = Some((self.lsn, value));
|
||||||
Ok(ValueReconstructResult::Complete)
|
Ok(ValueReconstructResult::Complete)
|
||||||
@@ -219,7 +273,17 @@ impl Layer for ImageLayer {
|
|||||||
tree_reader.dump()?;
|
tree_reader.dump()?;
|
||||||
|
|
||||||
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
|
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
|
||||||
println!("key: {} offset {}", hex::encode(key), value);
|
let blob_ref = BlobRef(value);
|
||||||
|
println!(
|
||||||
|
"key: {} offset {}{}",
|
||||||
|
hex::encode(key),
|
||||||
|
blob_ref.pos(),
|
||||||
|
if blob_ref.compressed() {
|
||||||
|
" (compressed)"
|
||||||
|
} else {
|
||||||
|
""
|
||||||
|
}
|
||||||
|
);
|
||||||
true
|
true
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
@@ -423,6 +487,8 @@ pub struct ImageLayerWriter {
|
|||||||
|
|
||||||
blob_writer: WriteBlobWriter<VirtualFile>,
|
blob_writer: WriteBlobWriter<VirtualFile>,
|
||||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||||
|
|
||||||
|
compressor: Option<zstd::bulk::Compressor<'static>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ImageLayerWriter {
|
impl ImageLayerWriter {
|
||||||
@@ -454,6 +520,12 @@ impl ImageLayerWriter {
|
|||||||
let block_buf = BlockBuf::new();
|
let block_buf = BlockBuf::new();
|
||||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||||
|
|
||||||
|
// TODO: use a dictionary
|
||||||
|
let compressor = {
|
||||||
|
let compressor = zstd::bulk::Compressor::new(config::ZSTD_COMPRESSION_LEVEL)?;
|
||||||
|
Some(compressor)
|
||||||
|
};
|
||||||
|
|
||||||
let writer = ImageLayerWriter {
|
let writer = ImageLayerWriter {
|
||||||
conf,
|
conf,
|
||||||
path,
|
path,
|
||||||
@@ -463,6 +535,7 @@ impl ImageLayerWriter {
|
|||||||
lsn,
|
lsn,
|
||||||
tree: tree_builder,
|
tree: tree_builder,
|
||||||
blob_writer,
|
blob_writer,
|
||||||
|
compressor,
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(writer)
|
Ok(writer)
|
||||||
@@ -475,11 +548,37 @@ impl ImageLayerWriter {
|
|||||||
///
|
///
|
||||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
|
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
|
||||||
ensure!(self.key_range.contains(&key));
|
ensure!(self.key_range.contains(&key));
|
||||||
let off = self.blob_writer.write_blob(img)?;
|
|
||||||
|
|
||||||
|
let mut blob_content = img;
|
||||||
|
let mut compressed = false;
|
||||||
|
|
||||||
|
// Try to compress the blob
|
||||||
|
let compressed_bytes;
|
||||||
|
if blob_content.len() <= PAGE_SZ {
|
||||||
|
if let Some(ref mut compressor) = self.compressor {
|
||||||
|
compressed_bytes = compressor.compress(blob_content)?;
|
||||||
|
|
||||||
|
// If compressed version is not any smaller than the original,
|
||||||
|
// store it uncompressed. This not just an optimization, the
|
||||||
|
// the decompression assumes that too. That simplifies the
|
||||||
|
// decompression, because you don't need to jump through any
|
||||||
|
// hoops to determine how large a buffer you need to hold the
|
||||||
|
// decompression result.
|
||||||
|
if compressed_bytes.len() < blob_content.len() {
|
||||||
|
blob_content = &compressed_bytes;
|
||||||
|
compressed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write it to the file
|
||||||
|
let off = self.blob_writer.write_blob(blob_content)?;
|
||||||
|
let blob_ref = BlobRef::new(off, compressed);
|
||||||
|
|
||||||
|
// And store the reference in the B-tree
|
||||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||||
key.write_to_byte_slice(&mut keybuf);
|
key.write_to_byte_slice(&mut keybuf);
|
||||||
self.tree.append(&keybuf, off)?;
|
self.tree.append(&keybuf, blob_ref.0)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ use pgdatadir_mapping::DatadirTimeline;
|
|||||||
/// This is embedded in the metadata file, and also in the header of all the
|
/// This is embedded in the metadata file, and also in the header of all the
|
||||||
/// layer files. If you make any backwards-incompatible changes to the storage
|
/// layer files. If you make any backwards-incompatible changes to the storage
|
||||||
/// format, bump this!
|
/// format, bump this!
|
||||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
pub const STORAGE_FORMAT_VERSION: u16 = 4;
|
||||||
|
|
||||||
// Magic constants used to identify different kinds of files
|
// Magic constants used to identify different kinds of files
|
||||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||||
|
|||||||
@@ -305,7 +305,29 @@ fn page_service_conn_main(
|
|||||||
|
|
||||||
let mut conn_handler = PageServerHandler::new(conf, auth);
|
let mut conn_handler = PageServerHandler::new(conf, auth);
|
||||||
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
|
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
|
||||||
pgbackend.run(&mut conn_handler)
|
match pgbackend.run(&mut conn_handler) {
|
||||||
|
Ok(()) => {
|
||||||
|
// we've been requested to shut down
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
let root_cause_io_err_kind = err
|
||||||
|
.root_cause()
|
||||||
|
.downcast_ref::<io::Error>()
|
||||||
|
.map(|e| e.kind());
|
||||||
|
|
||||||
|
// `ConnectionReset` error happens when the Postgres client closes the connection.
|
||||||
|
// As this disconnection happens quite often and is expected,
|
||||||
|
// we decided to downgrade the logging level to `INFO`.
|
||||||
|
// See: https://github.com/neondatabase/neon/issues/1683.
|
||||||
|
if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) {
|
||||||
|
info!("Postgres client disconnected");
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@@ -593,7 +615,8 @@ impl PageServerHandler {
|
|||||||
/* Send a tarball of the latest layer on the timeline */
|
/* Send a tarball of the latest layer on the timeline */
|
||||||
{
|
{
|
||||||
let mut writer = CopyDataSink { pgb };
|
let mut writer = CopyDataSink { pgb };
|
||||||
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
|
|
||||||
|
let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
|
||||||
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
||||||
basebackup.send_tarball()?;
|
basebackup.send_tarball()?;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
//! page server.
|
//! page server.
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::layered_repository::LayeredRepository;
|
use crate::layered_repository::{load_metadata, LayeredRepository};
|
||||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||||
use crate::repository::{Repository, TimelineSyncStatusUpdate};
|
use crate::repository::{Repository, TimelineSyncStatusUpdate};
|
||||||
use crate::storage_sync::index::RemoteIndex;
|
use crate::storage_sync::index::RemoteIndex;
|
||||||
@@ -22,6 +22,7 @@ use std::collections::HashMap;
|
|||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
use utils::zid::{ZTenantId, ZTimelineId};
|
use utils::zid::{ZTenantId, ZTimelineId};
|
||||||
|
|
||||||
@@ -327,8 +328,8 @@ pub fn get_local_timeline_with_load(
|
|||||||
return Ok(Arc::clone(page_tline));
|
return Ok(Arc::clone(page_tline));
|
||||||
}
|
}
|
||||||
|
|
||||||
let page_tline = new_local_timeline(&tenant.repo, timeline_id)
|
let page_tline = load_local_timeline(&tenant.repo, timeline_id)
|
||||||
.with_context(|| format!("Failed to create new local timeline for tenant {tenant_id}"))?;
|
.with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?;
|
||||||
tenant
|
tenant
|
||||||
.local_timelines
|
.local_timelines
|
||||||
.insert(timeline_id, Arc::clone(&page_tline));
|
.insert(timeline_id, Arc::clone(&page_tline));
|
||||||
@@ -365,7 +366,7 @@ pub fn detach_timeline(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn new_local_timeline(
|
fn load_local_timeline(
|
||||||
repo: &RepositoryImpl,
|
repo: &RepositoryImpl,
|
||||||
timeline_id: ZTimelineId,
|
timeline_id: ZTimelineId,
|
||||||
) -> anyhow::Result<Arc<DatadirTimeline<LayeredRepository>>> {
|
) -> anyhow::Result<Arc<DatadirTimeline<LayeredRepository>>> {
|
||||||
@@ -399,6 +400,26 @@ pub fn list_tenants() -> Vec<TenantInfo> {
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check if a given timeline is "broken" \[1\].
|
||||||
|
/// The function returns an error if the timeline is "broken".
|
||||||
|
///
|
||||||
|
/// \[1\]: it's not clear now how should we classify a timeline as broken.
|
||||||
|
/// A timeline is categorized as broken when any of following conditions is true:
|
||||||
|
/// - failed to load the timeline's metadata
|
||||||
|
/// - the timeline's disk consistent LSN is zero
|
||||||
|
fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||||
|
let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id())
|
||||||
|
.context("failed to load metadata")?;
|
||||||
|
|
||||||
|
// A timeline with zero disk consistent LSN can happen when the page server
|
||||||
|
// failed to checkpoint the timeline import data when creating that timeline.
|
||||||
|
if metadata.disk_consistent_lsn() == Lsn::INVALID {
|
||||||
|
bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn init_local_repository(
|
fn init_local_repository(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: ZTenantId,
|
tenant_id: ZTenantId,
|
||||||
@@ -414,7 +435,13 @@ fn init_local_repository(
|
|||||||
match init_status {
|
match init_status {
|
||||||
LocalTimelineInitStatus::LocallyComplete => {
|
LocalTimelineInitStatus::LocallyComplete => {
|
||||||
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
||||||
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
if let Err(err) = check_broken_timeline(&repo, timeline_id) {
|
||||||
|
info!(
|
||||||
|
"Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository"
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
LocalTimelineInitStatus::NeedsSync => {
|
LocalTimelineInitStatus::NeedsSync => {
|
||||||
debug!(
|
debug!(
|
||||||
@@ -458,8 +485,8 @@ fn apply_timeline_remote_sync_status_updates(
|
|||||||
bail!("Local timeline {timeline_id} already registered")
|
bail!("Local timeline {timeline_id} already registered")
|
||||||
}
|
}
|
||||||
Entry::Vacant(v) => {
|
Entry::Vacant(v) => {
|
||||||
v.insert(new_local_timeline(repo, timeline_id).with_context(|| {
|
v.insert(load_local_timeline(repo, timeline_id).with_context(|| {
|
||||||
format!("Failed to register new local timeline for tenant {tenant_id}")
|
format!("Failed to register add local timeline for tenant {tenant_id}")
|
||||||
})?);
|
})?);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -285,7 +285,9 @@ fn bootstrap_timeline<R: Repository>(
|
|||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
|
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
|
||||||
|
|
||||||
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
|
let initdb_path = conf
|
||||||
|
.tenant_path(&tenantid)
|
||||||
|
.join(format!("tmp-timeline-{}", tli));
|
||||||
|
|
||||||
// Init temporarily repo to get bootstrap data
|
// Init temporarily repo to get bootstrap data
|
||||||
run_initdb(conf, &initdb_path)?;
|
run_initdb(conf, &initdb_path)?;
|
||||||
@@ -300,10 +302,15 @@ fn bootstrap_timeline<R: Repository>(
|
|||||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||||
let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline, u64::MAX);
|
let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline, u64::MAX);
|
||||||
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
|
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
|
||||||
|
|
||||||
|
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||||
|
bail!("failpoint before-checkpoint-new-timeline");
|
||||||
|
});
|
||||||
|
|
||||||
page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
|
page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
|
||||||
|
|
||||||
println!(
|
info!(
|
||||||
"created initial timeline {} timeline.lsn {}",
|
"created root timeline {} timeline.lsn {}",
|
||||||
tli,
|
tli,
|
||||||
page_tline.tline.get_last_record_lsn()
|
page_tline.tline.get_last_record_lsn()
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -5,12 +5,9 @@ use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
|
|||||||
fn hello_message(redirect_uri: &str, session_id: &str) -> String {
|
fn hello_message(redirect_uri: &str, session_id: &str) -> String {
|
||||||
format!(
|
format!(
|
||||||
concat![
|
concat![
|
||||||
"☀️ Welcome to Neon!\n",
|
"Welcome to Neon!\n",
|
||||||
"To proceed with database creation, open the following link:\n\n",
|
"Authenticate by visiting:\n",
|
||||||
" {redirect_uri}{session_id}\n\n",
|
" {redirect_uri}{session_id}\n\n",
|
||||||
"It needs to be done once and we will send you '.pgpass' file,\n",
|
|
||||||
"which will allow you to access or create ",
|
|
||||||
"databases without opening your web browser."
|
|
||||||
],
|
],
|
||||||
redirect_uri = redirect_uri,
|
redirect_uri = redirect_uri,
|
||||||
session_id = session_id,
|
session_id = session_id,
|
||||||
|
|||||||
@@ -61,7 +61,8 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfi
|
|||||||
let config = rustls::ServerConfig::builder()
|
let config = rustls::ServerConfig::builder()
|
||||||
.with_safe_default_cipher_suites()
|
.with_safe_default_cipher_suites()
|
||||||
.with_safe_default_kx_groups()
|
.with_safe_default_kx_groups()
|
||||||
.with_protocol_versions(&[&rustls::version::TLS13])?
|
// allow TLS 1.2 to be compatible with older client libraries
|
||||||
|
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
|
||||||
.with_no_client_auth()
|
.with_no_client_auth()
|
||||||
.with_single_cert(cert_chain, key)?;
|
.with_single_cert(cert_chain, key)?;
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,10 @@ const_format = "0.2.21"
|
|||||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||||
tokio-util = { version = "0.7", features = ["io"] }
|
tokio-util = { version = "0.7", features = ["io"] }
|
||||||
git-version = "0.3.5"
|
git-version = "0.3.5"
|
||||||
|
async-trait = "0.1"
|
||||||
|
once_cell = "1.10.0"
|
||||||
|
futures = "0.3.13"
|
||||||
|
toml_edit = { version = "0.13", features = ["easy"] }
|
||||||
|
|
||||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||||
metrics = { path = "../libs/metrics" }
|
metrics = { path = "../libs/metrics" }
|
||||||
|
|||||||
@@ -6,25 +6,30 @@ use clap::{App, Arg};
|
|||||||
use const_format::formatcp;
|
use const_format::formatcp;
|
||||||
use daemonize::Daemonize;
|
use daemonize::Daemonize;
|
||||||
use fs2::FileExt;
|
use fs2::FileExt;
|
||||||
|
use remote_storage::RemoteStorageConfig;
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{ErrorKind, Write};
|
use std::io::{ErrorKind, Write};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
|
use toml_edit::Document;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use url::{ParseError, Url};
|
use url::{ParseError, Url};
|
||||||
|
|
||||||
use safekeeper::control_file::{self};
|
use safekeeper::control_file::{self};
|
||||||
use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
|
use safekeeper::defaults::{
|
||||||
|
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
||||||
|
};
|
||||||
|
use safekeeper::http;
|
||||||
use safekeeper::remove_wal;
|
use safekeeper::remove_wal;
|
||||||
use safekeeper::timeline::GlobalTimelines;
|
use safekeeper::timeline::GlobalTimelines;
|
||||||
|
use safekeeper::wal_backup;
|
||||||
use safekeeper::wal_service;
|
use safekeeper::wal_service;
|
||||||
use safekeeper::SafeKeeperConf;
|
use safekeeper::SafeKeeperConf;
|
||||||
use safekeeper::{broker, callmemaybe};
|
use safekeeper::{broker, callmemaybe};
|
||||||
use safekeeper::{http, s3_offload};
|
|
||||||
use utils::{
|
use utils::{
|
||||||
http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener,
|
http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener,
|
||||||
zid::ZNodeId,
|
zid::NodeId,
|
||||||
};
|
};
|
||||||
|
|
||||||
const LOCK_FILE_NAME: &str = "safekeeper.lock";
|
const LOCK_FILE_NAME: &str = "safekeeper.lock";
|
||||||
@@ -71,12 +76,6 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.long("pageserver")
|
.long("pageserver")
|
||||||
.takes_value(true),
|
.takes_value(true),
|
||||||
)
|
)
|
||||||
.arg(
|
|
||||||
Arg::new("ttl")
|
|
||||||
.long("ttl")
|
|
||||||
.takes_value(true)
|
|
||||||
.help("interval for keeping WAL at safekeeper node, after which them will be uploaded to S3 and removed locally"),
|
|
||||||
)
|
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new("recall")
|
Arg::new("recall")
|
||||||
.long("recall")
|
.long("recall")
|
||||||
@@ -118,12 +117,20 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
|
.help("a prefix to always use when polling/pusing data in etcd from this safekeeper"),
|
||||||
)
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new("enable-s3-offload")
|
Arg::new("wal-backup-threads").long("backup-threads").takes_value(true).help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")),
|
||||||
.long("enable-s3-offload")
|
).arg(
|
||||||
|
Arg::new("remote-storage")
|
||||||
|
.long("remote-storage")
|
||||||
|
.takes_value(true)
|
||||||
|
.help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"<BUCKETNAME>\", \"bucket_region\":\"<REGION>\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]<tenant_id>/<timeline_id>/<segment_file>, mirroring structure on the file system.")
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new("enable-wal-backup")
|
||||||
|
.long("enable-wal-backup")
|
||||||
.takes_value(true)
|
.takes_value(true)
|
||||||
.default_value("true")
|
.default_value("true")
|
||||||
.default_missing_value("true")
|
.default_missing_value("true")
|
||||||
.help("Enable/disable s3 offloading. When disabled, safekeeper removes WAL ignoring s3 WAL horizon."),
|
.help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."),
|
||||||
)
|
)
|
||||||
.get_matches();
|
.get_matches();
|
||||||
|
|
||||||
@@ -157,17 +164,13 @@ fn main() -> anyhow::Result<()> {
|
|||||||
conf.listen_http_addr = addr.to_owned();
|
conf.listen_http_addr = addr.to_owned();
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(ttl) = arg_matches.value_of("ttl") {
|
|
||||||
conf.ttl = Some(humantime::parse_duration(ttl)?);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(recall) = arg_matches.value_of("recall") {
|
if let Some(recall) = arg_matches.value_of("recall") {
|
||||||
conf.recall_period = humantime::parse_duration(recall)?;
|
conf.recall_period = humantime::parse_duration(recall)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut given_id = None;
|
let mut given_id = None;
|
||||||
if let Some(given_id_str) = arg_matches.value_of("id") {
|
if let Some(given_id_str) = arg_matches.value_of("id") {
|
||||||
given_id = Some(ZNodeId(
|
given_id = Some(NodeId(
|
||||||
given_id_str
|
given_id_str
|
||||||
.parse()
|
.parse()
|
||||||
.context("failed to parse safekeeper id")?,
|
.context("failed to parse safekeeper id")?,
|
||||||
@@ -182,9 +185,21 @@ fn main() -> anyhow::Result<()> {
|
|||||||
conf.broker_etcd_prefix = prefix.to_string();
|
conf.broker_etcd_prefix = prefix.to_string();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(backup_threads) = arg_matches.value_of("wal-backup-threads") {
|
||||||
|
conf.backup_runtime_threads = backup_threads
|
||||||
|
.parse()
|
||||||
|
.with_context(|| format!("Failed to parse backup threads {}", backup_threads))?;
|
||||||
|
}
|
||||||
|
if let Some(storage_conf) = arg_matches.value_of("remote-storage") {
|
||||||
|
// funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse
|
||||||
|
let storage_conf_toml = format!("remote_storage = {}", storage_conf);
|
||||||
|
let parsed_toml = storage_conf_toml.parse::<Document>()?; // parse
|
||||||
|
let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again
|
||||||
|
conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?);
|
||||||
|
}
|
||||||
// Seems like there is no better way to accept bool values explicitly in clap.
|
// Seems like there is no better way to accept bool values explicitly in clap.
|
||||||
conf.s3_offload_enabled = arg_matches
|
conf.wal_backup_enabled = arg_matches
|
||||||
.value_of("enable-s3-offload")
|
.value_of("enable-wal-backup")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.parse()
|
.parse()
|
||||||
.context("failed to parse bool enable-s3-offload bool")?;
|
.context("failed to parse bool enable-s3-offload bool")?;
|
||||||
@@ -192,7 +207,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
start_safekeeper(conf, given_id, arg_matches.is_present("init"))
|
start_safekeeper(conf, given_id, arg_matches.is_present("init"))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: bool) -> Result<()> {
|
fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bool) -> Result<()> {
|
||||||
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
|
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
|
||||||
|
|
||||||
info!("version: {GIT_VERSION}");
|
info!("version: {GIT_VERSION}");
|
||||||
@@ -252,7 +267,8 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
|
|||||||
let signals = signals::install_shutdown_handlers()?;
|
let signals = signals::install_shutdown_handlers()?;
|
||||||
let mut threads = vec![];
|
let mut threads = vec![];
|
||||||
let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel();
|
let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel();
|
||||||
GlobalTimelines::set_callmemaybe_tx(callmemaybe_tx);
|
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
||||||
|
GlobalTimelines::init(callmemaybe_tx, wal_backup_launcher_tx);
|
||||||
|
|
||||||
let conf_ = conf.clone();
|
let conf_ = conf.clone();
|
||||||
threads.push(
|
threads.push(
|
||||||
@@ -270,17 +286,6 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
|
|||||||
})?,
|
})?,
|
||||||
);
|
);
|
||||||
|
|
||||||
if conf.ttl.is_some() {
|
|
||||||
let conf_ = conf.clone();
|
|
||||||
threads.push(
|
|
||||||
thread::Builder::new()
|
|
||||||
.name("S3 offload thread".into())
|
|
||||||
.spawn(|| {
|
|
||||||
s3_offload::thread_main(conf_);
|
|
||||||
})?,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let conf_cloned = conf.clone();
|
let conf_cloned = conf.clone();
|
||||||
let safekeeper_thread = thread::Builder::new()
|
let safekeeper_thread = thread::Builder::new()
|
||||||
.name("Safekeeper thread".into())
|
.name("Safekeeper thread".into())
|
||||||
@@ -330,6 +335,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
|
|||||||
})?,
|
})?,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let conf_ = conf.clone();
|
||||||
|
threads.push(
|
||||||
|
thread::Builder::new()
|
||||||
|
.name("wal backup launcher thread".into())
|
||||||
|
.spawn(move || {
|
||||||
|
wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx);
|
||||||
|
})?,
|
||||||
|
);
|
||||||
|
|
||||||
// TODO: put more thoughts into handling of failed threads
|
// TODO: put more thoughts into handling of failed threads
|
||||||
// We probably should restart them.
|
// We probably should restart them.
|
||||||
|
|
||||||
@@ -345,14 +359,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<ZNodeId>, init: b
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Determine safekeeper id and set it in config.
|
/// Determine safekeeper id and set it in config.
|
||||||
fn set_id(conf: &mut SafeKeeperConf, given_id: Option<ZNodeId>) -> Result<()> {
|
fn set_id(conf: &mut SafeKeeperConf, given_id: Option<NodeId>) -> Result<()> {
|
||||||
let id_file_path = conf.workdir.join(ID_FILE_NAME);
|
let id_file_path = conf.workdir.join(ID_FILE_NAME);
|
||||||
|
|
||||||
let my_id: ZNodeId;
|
let my_id: NodeId;
|
||||||
// If ID exists, read it in; otherwise set one passed
|
// If ID exists, read it in; otherwise set one passed
|
||||||
match fs::read(&id_file_path) {
|
match fs::read(&id_file_path) {
|
||||||
Ok(id_serialized) => {
|
Ok(id_serialized) => {
|
||||||
my_id = ZNodeId(
|
my_id = NodeId(
|
||||||
std::str::from_utf8(&id_serialized)
|
std::str::from_utf8(&id_serialized)
|
||||||
.context("failed to parse safekeeper id")?
|
.context("failed to parse safekeeper id")?
|
||||||
.parse()
|
.parse()
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
//! Communication with etcd, providing safekeeper peers and pageserver coordination.
|
//! Communication with etcd, providing safekeeper peers and pageserver coordination.
|
||||||
|
|
||||||
|
use anyhow::anyhow;
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
@@ -7,12 +8,14 @@ use etcd_broker::Client;
|
|||||||
use etcd_broker::PutOptions;
|
use etcd_broker::PutOptions;
|
||||||
use etcd_broker::SkTimelineSubscriptionKind;
|
use etcd_broker::SkTimelineSubscriptionKind;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
use tokio::spawn;
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
use tokio::{runtime, time::sleep};
|
use tokio::{runtime, time::sleep};
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
use crate::{timeline::GlobalTimelines, SafeKeeperConf};
|
use crate::{timeline::GlobalTimelines, SafeKeeperConf};
|
||||||
use utils::zid::{ZNodeId, ZTenantTimelineId};
|
use utils::zid::{NodeId, ZTenantTimelineId};
|
||||||
|
|
||||||
const RETRY_INTERVAL_MSEC: u64 = 1000;
|
const RETRY_INTERVAL_MSEC: u64 = 1000;
|
||||||
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
||||||
@@ -36,7 +39,7 @@ pub fn thread_main(conf: SafeKeeperConf) {
|
|||||||
fn timeline_safekeeper_path(
|
fn timeline_safekeeper_path(
|
||||||
broker_etcd_prefix: String,
|
broker_etcd_prefix: String,
|
||||||
zttid: ZTenantTimelineId,
|
zttid: ZTenantTimelineId,
|
||||||
sk_id: ZNodeId,
|
sk_id: NodeId,
|
||||||
) -> String {
|
) -> String {
|
||||||
format!(
|
format!(
|
||||||
"{}/{sk_id}",
|
"{}/{sk_id}",
|
||||||
@@ -44,6 +47,118 @@ fn timeline_safekeeper_path(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct Election {
|
||||||
|
pub election_name: String,
|
||||||
|
pub candidate_name: String,
|
||||||
|
pub broker_endpoints: Vec<Url>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Election {
|
||||||
|
pub fn new(election_name: String, candidate_name: String, broker_endpoints: Vec<Url>) -> Self {
|
||||||
|
Self {
|
||||||
|
election_name,
|
||||||
|
candidate_name,
|
||||||
|
broker_endpoints,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ElectionLeader {
|
||||||
|
client: Client,
|
||||||
|
keep_alive: JoinHandle<Result<()>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ElectionLeader {
|
||||||
|
pub async fn check_am_i(
|
||||||
|
&mut self,
|
||||||
|
election_name: String,
|
||||||
|
candidate_name: String,
|
||||||
|
) -> Result<bool> {
|
||||||
|
let resp = self.client.leader(election_name).await?;
|
||||||
|
|
||||||
|
let kv = resp.kv().ok_or(anyhow!("failed to get leader response"))?;
|
||||||
|
let leader = kv.value_str()?;
|
||||||
|
|
||||||
|
Ok(leader == candidate_name)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn give_up(self) {
|
||||||
|
self.keep_alive.abort();
|
||||||
|
// TODO: it would be wise to resign here, but that will happen after lease expiration anyway
|
||||||
|
// Should we wait for the keep-alive task to terminate?
|
||||||
|
let _ = self.keep_alive.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_leader(req: &Election) -> Result<ElectionLeader> {
|
||||||
|
let mut client = Client::connect(req.broker_endpoints.clone(), None)
|
||||||
|
.await
|
||||||
|
.context("Could not connect to etcd")?;
|
||||||
|
|
||||||
|
let lease = client
|
||||||
|
.lease_grant(LEASE_TTL_SEC, None)
|
||||||
|
.await
|
||||||
|
.context("Could not acquire a lease");
|
||||||
|
|
||||||
|
let lease_id = lease.map(|l| l.id()).unwrap();
|
||||||
|
|
||||||
|
let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id));
|
||||||
|
|
||||||
|
if let Err(e) = client
|
||||||
|
.campaign(
|
||||||
|
req.election_name.clone(),
|
||||||
|
req.candidate_name.clone(),
|
||||||
|
lease_id,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
keep_alive.abort();
|
||||||
|
let _ = keep_alive.await;
|
||||||
|
return Err(e.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(ElectionLeader { client, keep_alive })
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> {
|
||||||
|
let (mut keeper, mut ka_stream) = client
|
||||||
|
.lease_keep_alive(lease_id)
|
||||||
|
.await
|
||||||
|
.context("failed to create keepalive stream")?;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||||
|
|
||||||
|
keeper
|
||||||
|
.keep_alive()
|
||||||
|
.await
|
||||||
|
.context("failed to send LeaseKeepAliveRequest")?;
|
||||||
|
|
||||||
|
ka_stream
|
||||||
|
.message()
|
||||||
|
.await
|
||||||
|
.context("failed to receive LeaseKeepAliveResponse")?;
|
||||||
|
|
||||||
|
sleep(push_interval).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_campaign_name(
|
||||||
|
election_name: String,
|
||||||
|
broker_prefix: String,
|
||||||
|
timeline_id: &ZTenantTimelineId,
|
||||||
|
) -> String {
|
||||||
|
return format!(
|
||||||
|
"{}/{}",
|
||||||
|
SkTimelineSubscriptionKind::timeline(broker_prefix, *timeline_id).watch_key(),
|
||||||
|
election_name
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_candiate_name(system_id: NodeId) -> String {
|
||||||
|
format!("id_{}", system_id)
|
||||||
|
}
|
||||||
|
|
||||||
/// Periodically push data about all active timelines to the broker.
|
/// Periodically push data about all active timelines to the broker.
|
||||||
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||||
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
||||||
@@ -59,7 +174,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
|||||||
// sensitive and there is no risk of deadlock as we don't await while
|
// sensitive and there is no risk of deadlock as we don't await while
|
||||||
// lock is held.
|
// lock is held.
|
||||||
for zttid in GlobalTimelines::get_active_timelines() {
|
for zttid in GlobalTimelines::get_active_timelines() {
|
||||||
if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) {
|
if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
|
||||||
let sk_info = tli.get_public_info(&conf)?;
|
let sk_info = tli.get_public_info(&conf)?;
|
||||||
let put_opts = PutOptions::new().with_lease(lease.id());
|
let put_opts = PutOptions::new().with_lease(lease.id());
|
||||||
client
|
client
|
||||||
@@ -106,12 +221,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
|
|||||||
// note: there are blocking operations below, but it's considered fine for now
|
// note: there are blocking operations below, but it's considered fine for now
|
||||||
if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) {
|
if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) {
|
||||||
for (safekeeper_id, info) in sk_info {
|
for (safekeeper_id, info) in sk_info {
|
||||||
tli.record_safekeeper_info(&info, safekeeper_id)?
|
tli.record_safekeeper_info(&info, safekeeper_id).await?
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
|
// XXX: this means we lost the connection to etcd; the error is consumed inside the sub object
|
||||||
debug!("timeline updates sender closed, aborting the pull loop");
|
debug!("timeline updates sender closed, aborting the pull loop");
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
@@ -142,11 +258,12 @@ async fn main_loop(conf: SafeKeeperConf) {
|
|||||||
},
|
},
|
||||||
res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => {
|
res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => {
|
||||||
// Was it a panic or a normal error?
|
// Was it a panic or a normal error?
|
||||||
let err = match res {
|
match res {
|
||||||
Ok(res_internal) => res_internal.unwrap_err(),
|
Ok(res_internal) => if let Err(err_inner) = res_internal {
|
||||||
Err(err_outer) => err_outer.into(),
|
warn!("pull task failed: {:?}", err_inner);
|
||||||
|
}
|
||||||
|
Err(err_outer) => { warn!("pull task panicked: {:?}", err_outer) }
|
||||||
};
|
};
|
||||||
warn!("pull task failed: {:?}", err);
|
|
||||||
pull_handle = None;
|
pull_handle = None;
|
||||||
},
|
},
|
||||||
_ = ticker.tick() => {
|
_ = ticker.tick() => {
|
||||||
|
|||||||
@@ -165,7 +165,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
|||||||
timeline_start_lsn: Lsn(0),
|
timeline_start_lsn: Lsn(0),
|
||||||
local_start_lsn: Lsn(0),
|
local_start_lsn: Lsn(0),
|
||||||
commit_lsn: oldstate.commit_lsn,
|
commit_lsn: oldstate.commit_lsn,
|
||||||
s3_wal_lsn: Lsn(0),
|
backup_lsn: Lsn(0),
|
||||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: Peers(vec![]),
|
peers: Peers(vec![]),
|
||||||
@@ -188,7 +188,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
|||||||
timeline_start_lsn: Lsn(0),
|
timeline_start_lsn: Lsn(0),
|
||||||
local_start_lsn: Lsn(0),
|
local_start_lsn: Lsn(0),
|
||||||
commit_lsn: oldstate.commit_lsn,
|
commit_lsn: oldstate.commit_lsn,
|
||||||
s3_wal_lsn: Lsn(0),
|
backup_lsn: Lsn(0),
|
||||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: Peers(vec![]),
|
peers: Peers(vec![]),
|
||||||
@@ -211,7 +211,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
|||||||
timeline_start_lsn: Lsn(0),
|
timeline_start_lsn: Lsn(0),
|
||||||
local_start_lsn: Lsn(0),
|
local_start_lsn: Lsn(0),
|
||||||
commit_lsn: oldstate.commit_lsn,
|
commit_lsn: oldstate.commit_lsn,
|
||||||
s3_wal_lsn: Lsn(0),
|
backup_lsn: Lsn(0),
|
||||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: Peers(vec![]),
|
peers: Peers(vec![]),
|
||||||
@@ -234,7 +234,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
|
|||||||
timeline_start_lsn: Lsn(0),
|
timeline_start_lsn: Lsn(0),
|
||||||
local_start_lsn: Lsn(0),
|
local_start_lsn: Lsn(0),
|
||||||
commit_lsn: oldstate.commit_lsn,
|
commit_lsn: oldstate.commit_lsn,
|
||||||
s3_wal_lsn: Lsn(0),
|
backup_lsn: Lsn::INVALID,
|
||||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: Peers(vec![]),
|
peers: Peers(vec![]),
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
|
use utils::zid::{NodeId, ZTenantId, ZTimelineId};
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct TimelineCreateRequest {
|
pub struct TimelineCreateRequest {
|
||||||
pub tenant_id: ZTenantId,
|
pub tenant_id: ZTenantId,
|
||||||
pub timeline_id: ZTimelineId,
|
pub timeline_id: ZTimelineId,
|
||||||
pub peer_ids: Vec<ZNodeId>,
|
pub peer_ids: Vec<NodeId>,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,14 +20,14 @@ use utils::{
|
|||||||
RequestExt, RouterBuilder,
|
RequestExt, RouterBuilder,
|
||||||
},
|
},
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::models::TimelineCreateRequest;
|
use super::models::TimelineCreateRequest;
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
struct SafekeeperStatus {
|
struct SafekeeperStatus {
|
||||||
id: ZNodeId,
|
id: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Healthcheck handler.
|
/// Healthcheck handler.
|
||||||
@@ -70,19 +70,19 @@ struct TimelineStatus {
|
|||||||
timeline_id: ZTimelineId,
|
timeline_id: ZTimelineId,
|
||||||
acceptor_state: AcceptorStateStatus,
|
acceptor_state: AcceptorStateStatus,
|
||||||
#[serde(serialize_with = "display_serialize")]
|
#[serde(serialize_with = "display_serialize")]
|
||||||
|
flush_lsn: Lsn,
|
||||||
|
#[serde(serialize_with = "display_serialize")]
|
||||||
timeline_start_lsn: Lsn,
|
timeline_start_lsn: Lsn,
|
||||||
#[serde(serialize_with = "display_serialize")]
|
#[serde(serialize_with = "display_serialize")]
|
||||||
local_start_lsn: Lsn,
|
local_start_lsn: Lsn,
|
||||||
#[serde(serialize_with = "display_serialize")]
|
#[serde(serialize_with = "display_serialize")]
|
||||||
commit_lsn: Lsn,
|
commit_lsn: Lsn,
|
||||||
#[serde(serialize_with = "display_serialize")]
|
#[serde(serialize_with = "display_serialize")]
|
||||||
s3_wal_lsn: Lsn,
|
backup_lsn: Lsn,
|
||||||
#[serde(serialize_with = "display_serialize")]
|
#[serde(serialize_with = "display_serialize")]
|
||||||
peer_horizon_lsn: Lsn,
|
peer_horizon_lsn: Lsn,
|
||||||
#[serde(serialize_with = "display_serialize")]
|
#[serde(serialize_with = "display_serialize")]
|
||||||
remote_consistent_lsn: Lsn,
|
remote_consistent_lsn: Lsn,
|
||||||
#[serde(serialize_with = "display_serialize")]
|
|
||||||
flush_lsn: Lsn,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Report info about timeline.
|
/// Report info about timeline.
|
||||||
@@ -107,13 +107,13 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
|||||||
tenant_id: zttid.tenant_id,
|
tenant_id: zttid.tenant_id,
|
||||||
timeline_id: zttid.timeline_id,
|
timeline_id: zttid.timeline_id,
|
||||||
acceptor_state: acc_state,
|
acceptor_state: acc_state,
|
||||||
|
flush_lsn,
|
||||||
timeline_start_lsn: state.timeline_start_lsn,
|
timeline_start_lsn: state.timeline_start_lsn,
|
||||||
local_start_lsn: state.local_start_lsn,
|
local_start_lsn: state.local_start_lsn,
|
||||||
commit_lsn: inmem.commit_lsn,
|
commit_lsn: inmem.commit_lsn,
|
||||||
s3_wal_lsn: inmem.s3_wal_lsn,
|
backup_lsn: inmem.backup_lsn,
|
||||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||||
remote_consistent_lsn: inmem.remote_consistent_lsn,
|
remote_consistent_lsn: inmem.remote_consistent_lsn,
|
||||||
flush_lsn,
|
|
||||||
};
|
};
|
||||||
json_response(StatusCode::OK, status)
|
json_response(StatusCode::OK, status)
|
||||||
}
|
}
|
||||||
@@ -148,7 +148,9 @@ async fn timeline_delete_force_handler(
|
|||||||
ensure_no_body(&mut request).await?;
|
ensure_no_body(&mut request).await?;
|
||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
GlobalTimelines::delete_force(get_conf(&request), &zttid).map_err(ApiError::from_err)?,
|
GlobalTimelines::delete_force(get_conf(&request), &zttid)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::from_err)?,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -162,6 +164,7 @@ async fn tenant_delete_force_handler(
|
|||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id)
|
GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id)
|
||||||
|
.await
|
||||||
.map_err(ApiError::from_err)?
|
.map_err(ApiError::from_err)?
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp))
|
.map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp))
|
||||||
@@ -178,7 +181,8 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
|||||||
let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?;
|
let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?;
|
||||||
|
|
||||||
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
|
let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?;
|
||||||
tli.record_safekeeper_info(&safekeeper_info, ZNodeId(1))?;
|
tli.record_safekeeper_info(&safekeeper_info, NodeId(1))
|
||||||
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
|
use defaults::DEFAULT_WAL_BACKUP_RUNTIME_THREADS;
|
||||||
//
|
//
|
||||||
|
use remote_storage::RemoteStorageConfig;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId};
|
use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId};
|
||||||
|
|
||||||
pub mod broker;
|
pub mod broker;
|
||||||
pub mod callmemaybe;
|
pub mod callmemaybe;
|
||||||
@@ -14,10 +16,10 @@ pub mod http;
|
|||||||
pub mod json_ctrl;
|
pub mod json_ctrl;
|
||||||
pub mod receive_wal;
|
pub mod receive_wal;
|
||||||
pub mod remove_wal;
|
pub mod remove_wal;
|
||||||
pub mod s3_offload;
|
|
||||||
pub mod safekeeper;
|
pub mod safekeeper;
|
||||||
pub mod send_wal;
|
pub mod send_wal;
|
||||||
pub mod timeline;
|
pub mod timeline;
|
||||||
|
pub mod wal_backup;
|
||||||
pub mod wal_service;
|
pub mod wal_service;
|
||||||
pub mod wal_storage;
|
pub mod wal_storage;
|
||||||
|
|
||||||
@@ -31,6 +33,7 @@ pub mod defaults {
|
|||||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
|
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
|
||||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||||
pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10);
|
pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10);
|
||||||
|
pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
@@ -47,12 +50,13 @@ pub struct SafeKeeperConf {
|
|||||||
pub no_sync: bool,
|
pub no_sync: bool,
|
||||||
pub listen_pg_addr: String,
|
pub listen_pg_addr: String,
|
||||||
pub listen_http_addr: String,
|
pub listen_http_addr: String,
|
||||||
pub ttl: Option<Duration>,
|
|
||||||
pub recall_period: Duration,
|
pub recall_period: Duration,
|
||||||
pub my_id: ZNodeId,
|
pub remote_storage: Option<RemoteStorageConfig>,
|
||||||
|
pub backup_runtime_threads: usize,
|
||||||
|
pub wal_backup_enabled: bool,
|
||||||
|
pub my_id: NodeId,
|
||||||
pub broker_endpoints: Vec<Url>,
|
pub broker_endpoints: Vec<Url>,
|
||||||
pub broker_etcd_prefix: String,
|
pub broker_etcd_prefix: String,
|
||||||
pub s3_offload_enabled: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SafeKeeperConf {
|
impl SafeKeeperConf {
|
||||||
@@ -77,12 +81,13 @@ impl Default for SafeKeeperConf {
|
|||||||
no_sync: false,
|
no_sync: false,
|
||||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||||
ttl: None,
|
remote_storage: None,
|
||||||
recall_period: defaults::DEFAULT_RECALL_PERIOD,
|
recall_period: defaults::DEFAULT_RECALL_PERIOD,
|
||||||
my_id: ZNodeId(0),
|
my_id: NodeId(0),
|
||||||
broker_endpoints: Vec::new(),
|
broker_endpoints: Vec::new(),
|
||||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||||
s3_offload_enabled: true,
|
backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS,
|
||||||
|
wal_backup_enabled: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -85,16 +85,10 @@ impl<'pg> ReceiveWalConn<'pg> {
|
|||||||
_ => bail!("unexpected message {:?} instead of greeting", next_msg),
|
_ => bail!("unexpected message {:?} instead of greeting", next_msg),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Register the connection and defer unregister.
|
|
||||||
spg.timeline
|
|
||||||
.get()
|
|
||||||
.on_compute_connect(self.pageserver_connstr.as_ref())?;
|
|
||||||
let _guard = ComputeConnectionGuard {
|
|
||||||
timeline: Arc::clone(spg.timeline.get()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut next_msg = Some(next_msg);
|
let mut next_msg = Some(next_msg);
|
||||||
|
|
||||||
|
let mut first_time_through = true;
|
||||||
|
let mut _guard: Option<ComputeConnectionGuard> = None;
|
||||||
loop {
|
loop {
|
||||||
if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) {
|
if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) {
|
||||||
// poll AppendRequest's without blocking and write WAL to disk without flushing,
|
// poll AppendRequest's without blocking and write WAL to disk without flushing,
|
||||||
@@ -122,6 +116,18 @@ impl<'pg> ReceiveWalConn<'pg> {
|
|||||||
self.write_msg(&reply)?;
|
self.write_msg(&reply)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if first_time_through {
|
||||||
|
// Register the connection and defer unregister. Do that only
|
||||||
|
// after processing first message, as it sets wal_seg_size,
|
||||||
|
// wanted by many.
|
||||||
|
spg.timeline
|
||||||
|
.get()
|
||||||
|
.on_compute_connect(self.pageserver_connstr.as_ref())?;
|
||||||
|
_guard = Some(ComputeConnectionGuard {
|
||||||
|
timeline: Arc::clone(spg.timeline.get()),
|
||||||
|
});
|
||||||
|
first_time_through = false;
|
||||||
|
}
|
||||||
|
|
||||||
// blocking wait for the next message
|
// blocking wait for the next message
|
||||||
if next_msg.is_none() {
|
if next_msg.is_none() {
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) {
|
|||||||
let active_tlis = GlobalTimelines::get_active_timelines();
|
let active_tlis = GlobalTimelines::get_active_timelines();
|
||||||
for zttid in &active_tlis {
|
for zttid in &active_tlis {
|
||||||
if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) {
|
if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) {
|
||||||
if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) {
|
if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) {
|
||||||
warn!(
|
warn!(
|
||||||
"failed to remove WAL for tenant {} timeline {}: {}",
|
"failed to remove WAL for tenant {} timeline {}: {}",
|
||||||
tli.zttid.tenant_id, tli.zttid.timeline_id, e
|
tli.zttid.tenant_id, tli.zttid.timeline_id, e
|
||||||
|
|||||||
@@ -1,107 +0,0 @@
|
|||||||
//
|
|
||||||
// Offload old WAL segments to S3 and remove them locally
|
|
||||||
// Needs `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to be set
|
|
||||||
// if no IAM bucket access is used.
|
|
||||||
//
|
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
|
||||||
use postgres_ffi::xlog_utils::*;
|
|
||||||
use remote_storage::{
|
|
||||||
GenericRemoteStorage, RemoteStorage, RemoteStorageConfig, S3Bucket, S3Config, S3ObjectKey,
|
|
||||||
};
|
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::env;
|
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
|
||||||
use std::path::Path;
|
|
||||||
use std::time::SystemTime;
|
|
||||||
use tokio::fs::{self, File};
|
|
||||||
use tokio::io::BufReader;
|
|
||||||
use tokio::runtime;
|
|
||||||
use tokio::time::sleep;
|
|
||||||
use tracing::*;
|
|
||||||
use walkdir::WalkDir;
|
|
||||||
|
|
||||||
use crate::SafeKeeperConf;
|
|
||||||
|
|
||||||
pub fn thread_main(conf: SafeKeeperConf) {
|
|
||||||
// Create a new thread pool
|
|
||||||
//
|
|
||||||
// FIXME: keep it single-threaded for now, make it easier to debug with gdb,
|
|
||||||
// and we're not concerned with performance yet.
|
|
||||||
//let runtime = runtime::Runtime::new().unwrap();
|
|
||||||
let runtime = runtime::Builder::new_current_thread()
|
|
||||||
.enable_all()
|
|
||||||
.build()
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
info!("Starting S3 offload task");
|
|
||||||
|
|
||||||
runtime.block_on(async {
|
|
||||||
main_loop(&conf).await.unwrap();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn offload_files(
|
|
||||||
remote_storage: &S3Bucket,
|
|
||||||
listing: &HashSet<S3ObjectKey>,
|
|
||||||
dir_path: &Path,
|
|
||||||
conf: &SafeKeeperConf,
|
|
||||||
) -> anyhow::Result<u64> {
|
|
||||||
let horizon = SystemTime::now() - conf.ttl.unwrap();
|
|
||||||
let mut n: u64 = 0;
|
|
||||||
for entry in WalkDir::new(dir_path) {
|
|
||||||
let entry = entry?;
|
|
||||||
let path = entry.path();
|
|
||||||
|
|
||||||
if path.is_file()
|
|
||||||
&& IsXLogFileName(entry.file_name().to_str().unwrap())
|
|
||||||
&& entry.metadata().unwrap().created().unwrap() <= horizon
|
|
||||||
{
|
|
||||||
let remote_path = remote_storage.remote_object_id(path)?;
|
|
||||||
if !listing.contains(&remote_path) {
|
|
||||||
let file = File::open(&path).await?;
|
|
||||||
let file_length = file.metadata().await?.len() as usize;
|
|
||||||
remote_storage
|
|
||||||
.upload(BufReader::new(file), file_length, &remote_path, None)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
fs::remove_file(&path).await?;
|
|
||||||
n += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(n)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> {
|
|
||||||
let remote_storage = match GenericRemoteStorage::new(
|
|
||||||
conf.workdir.clone(),
|
|
||||||
&RemoteStorageConfig {
|
|
||||||
max_concurrent_syncs: NonZeroUsize::new(10).unwrap(),
|
|
||||||
max_sync_errors: NonZeroU32::new(1).unwrap(),
|
|
||||||
storage: remote_storage::RemoteStorageKind::AwsS3(S3Config {
|
|
||||||
bucket_name: "zenith-testbucket".to_string(),
|
|
||||||
bucket_region: env::var("S3_REGION").context("S3_REGION env var is not set")?,
|
|
||||||
prefix_in_bucket: Some("walarchive/".to_string()),
|
|
||||||
endpoint: Some(env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?),
|
|
||||||
concurrency_limit: NonZeroUsize::new(20).unwrap(),
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
)? {
|
|
||||||
GenericRemoteStorage::Local(_) => {
|
|
||||||
bail!("Unexpected: got local storage for the remote config")
|
|
||||||
}
|
|
||||||
GenericRemoteStorage::S3(remote_storage) => remote_storage,
|
|
||||||
};
|
|
||||||
|
|
||||||
loop {
|
|
||||||
let listing = remote_storage
|
|
||||||
.list()
|
|
||||||
.await?
|
|
||||||
.into_iter()
|
|
||||||
.collect::<HashSet<_>>();
|
|
||||||
let n = offload_files(&remote_storage, &listing, &conf.workdir, conf).await?;
|
|
||||||
info!("Offload {n} files to S3");
|
|
||||||
sleep(conf.ttl.unwrap()).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -19,6 +19,7 @@ use lazy_static::lazy_static;
|
|||||||
|
|
||||||
use crate::control_file;
|
use crate::control_file;
|
||||||
use crate::send_wal::HotStandbyFeedback;
|
use crate::send_wal::HotStandbyFeedback;
|
||||||
|
|
||||||
use crate::wal_storage;
|
use crate::wal_storage;
|
||||||
use metrics::{register_gauge_vec, Gauge, GaugeVec};
|
use metrics::{register_gauge_vec, Gauge, GaugeVec};
|
||||||
use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
|
use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
|
||||||
@@ -26,7 +27,7 @@ use utils::{
|
|||||||
bin_ser::LeSer,
|
bin_ser::LeSer,
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
pq_proto::{SystemId, ZenithFeedback},
|
pq_proto::{SystemId, ZenithFeedback},
|
||||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||||
@@ -141,7 +142,7 @@ pub struct ServerInfo {
|
|||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct PeerInfo {
|
pub struct PeerInfo {
|
||||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||||
s3_wal_lsn: Lsn,
|
backup_lsn: Lsn,
|
||||||
/// Term of the last entry.
|
/// Term of the last entry.
|
||||||
term: Term,
|
term: Term,
|
||||||
/// LSN of the last record.
|
/// LSN of the last record.
|
||||||
@@ -153,7 +154,7 @@ pub struct PeerInfo {
|
|||||||
impl PeerInfo {
|
impl PeerInfo {
|
||||||
fn new() -> Self {
|
fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
s3_wal_lsn: Lsn(0),
|
backup_lsn: Lsn::INVALID,
|
||||||
term: INVALID_TERM,
|
term: INVALID_TERM,
|
||||||
flush_lsn: Lsn(0),
|
flush_lsn: Lsn(0),
|
||||||
commit_lsn: Lsn(0),
|
commit_lsn: Lsn(0),
|
||||||
@@ -164,7 +165,7 @@ impl PeerInfo {
|
|||||||
// vector-based node id -> peer state map with very limited functionality we
|
// vector-based node id -> peer state map with very limited functionality we
|
||||||
// need.
|
// need.
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>);
|
pub struct Peers(pub Vec<(NodeId, PeerInfo)>);
|
||||||
|
|
||||||
/// Persistent information stored on safekeeper node
|
/// Persistent information stored on safekeeper node
|
||||||
/// On disk data is prefixed by magic and format version and followed by checksum.
|
/// On disk data is prefixed by magic and format version and followed by checksum.
|
||||||
@@ -193,9 +194,9 @@ pub struct SafeKeeperState {
|
|||||||
/// Part of WAL acknowledged by quorum and available locally. Always points
|
/// Part of WAL acknowledged by quorum and available locally. Always points
|
||||||
/// to record boundary.
|
/// to record boundary.
|
||||||
pub commit_lsn: Lsn,
|
pub commit_lsn: Lsn,
|
||||||
/// First LSN not yet offloaded to s3. Useful to persist to avoid finding
|
/// LSN that points to the end of the last backed up segment. Useful to
|
||||||
/// out offloading progress on boot.
|
/// persist to avoid finding out offloading progress on boot.
|
||||||
pub s3_wal_lsn: Lsn,
|
pub backup_lsn: Lsn,
|
||||||
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||||
/// of last record streamed to everyone). Persisting it helps skipping
|
/// of last record streamed to everyone). Persisting it helps skipping
|
||||||
/// recovery in walproposer, generally we compute it from peers. In
|
/// recovery in walproposer, generally we compute it from peers. In
|
||||||
@@ -217,14 +218,14 @@ pub struct SafeKeeperState {
|
|||||||
// are not flushed yet.
|
// are not flushed yet.
|
||||||
pub struct SafekeeperMemState {
|
pub struct SafekeeperMemState {
|
||||||
pub commit_lsn: Lsn,
|
pub commit_lsn: Lsn,
|
||||||
pub s3_wal_lsn: Lsn, // TODO: keep only persistent version
|
pub backup_lsn: Lsn,
|
||||||
pub peer_horizon_lsn: Lsn,
|
pub peer_horizon_lsn: Lsn,
|
||||||
pub remote_consistent_lsn: Lsn,
|
pub remote_consistent_lsn: Lsn,
|
||||||
pub proposer_uuid: PgUuid,
|
pub proposer_uuid: PgUuid,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SafeKeeperState {
|
impl SafeKeeperState {
|
||||||
pub fn new(zttid: &ZTenantTimelineId, peers: Vec<ZNodeId>) -> SafeKeeperState {
|
pub fn new(zttid: &ZTenantTimelineId, peers: Vec<NodeId>) -> SafeKeeperState {
|
||||||
SafeKeeperState {
|
SafeKeeperState {
|
||||||
tenant_id: zttid.tenant_id,
|
tenant_id: zttid.tenant_id,
|
||||||
timeline_id: zttid.timeline_id,
|
timeline_id: zttid.timeline_id,
|
||||||
@@ -241,7 +242,7 @@ impl SafeKeeperState {
|
|||||||
timeline_start_lsn: Lsn(0),
|
timeline_start_lsn: Lsn(0),
|
||||||
local_start_lsn: Lsn(0),
|
local_start_lsn: Lsn(0),
|
||||||
commit_lsn: Lsn(0),
|
commit_lsn: Lsn(0),
|
||||||
s3_wal_lsn: Lsn(0),
|
backup_lsn: Lsn::INVALID,
|
||||||
peer_horizon_lsn: Lsn(0),
|
peer_horizon_lsn: Lsn(0),
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()),
|
peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()),
|
||||||
@@ -277,7 +278,7 @@ pub struct ProposerGreeting {
|
|||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
pub struct AcceptorGreeting {
|
pub struct AcceptorGreeting {
|
||||||
term: u64,
|
term: u64,
|
||||||
node_id: ZNodeId,
|
node_id: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Vote request sent from proposer to safekeepers
|
/// Vote request sent from proposer to safekeepers
|
||||||
@@ -531,7 +532,7 @@ pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
|
|||||||
|
|
||||||
pub wal_store: WAL,
|
pub wal_store: WAL,
|
||||||
|
|
||||||
node_id: ZNodeId, // safekeeper's node id
|
node_id: NodeId, // safekeeper's node id
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<CTRL, WAL> SafeKeeper<CTRL, WAL>
|
impl<CTRL, WAL> SafeKeeper<CTRL, WAL>
|
||||||
@@ -544,7 +545,7 @@ where
|
|||||||
ztli: ZTimelineId,
|
ztli: ZTimelineId,
|
||||||
state: CTRL,
|
state: CTRL,
|
||||||
mut wal_store: WAL,
|
mut wal_store: WAL,
|
||||||
node_id: ZNodeId,
|
node_id: NodeId,
|
||||||
) -> Result<SafeKeeper<CTRL, WAL>> {
|
) -> Result<SafeKeeper<CTRL, WAL>> {
|
||||||
if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id {
|
if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id {
|
||||||
bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id);
|
bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id);
|
||||||
@@ -559,7 +560,7 @@ where
|
|||||||
epoch_start_lsn: Lsn(0),
|
epoch_start_lsn: Lsn(0),
|
||||||
inmem: SafekeeperMemState {
|
inmem: SafekeeperMemState {
|
||||||
commit_lsn: state.commit_lsn,
|
commit_lsn: state.commit_lsn,
|
||||||
s3_wal_lsn: state.s3_wal_lsn,
|
backup_lsn: state.backup_lsn,
|
||||||
peer_horizon_lsn: state.peer_horizon_lsn,
|
peer_horizon_lsn: state.peer_horizon_lsn,
|
||||||
remote_consistent_lsn: state.remote_consistent_lsn,
|
remote_consistent_lsn: state.remote_consistent_lsn,
|
||||||
proposer_uuid: state.proposer_uuid,
|
proposer_uuid: state.proposer_uuid,
|
||||||
@@ -649,7 +650,6 @@ where
|
|||||||
self.state.persist(&state)?;
|
self.state.persist(&state)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// pass wal_seg_size to read WAL and find flush_lsn
|
|
||||||
self.wal_store.init_storage(&self.state)?;
|
self.wal_store.init_storage(&self.state)?;
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
@@ -731,24 +731,36 @@ where
|
|||||||
{
|
{
|
||||||
let mut state = self.state.clone();
|
let mut state = self.state.clone();
|
||||||
|
|
||||||
// Remeber point where WAL begins globally, if not yet.
|
// Here we learn initial LSN for the first time, set fields
|
||||||
|
// interested in that.
|
||||||
|
|
||||||
if state.timeline_start_lsn == Lsn(0) {
|
if state.timeline_start_lsn == Lsn(0) {
|
||||||
|
// Remember point where WAL begins globally.
|
||||||
state.timeline_start_lsn = msg.timeline_start_lsn;
|
state.timeline_start_lsn = msg.timeline_start_lsn;
|
||||||
info!(
|
info!(
|
||||||
"setting timeline_start_lsn to {:?}",
|
"setting timeline_start_lsn to {:?}",
|
||||||
state.timeline_start_lsn
|
state.timeline_start_lsn
|
||||||
);
|
);
|
||||||
}
|
|
||||||
|
|
||||||
// Remember point where WAL begins locally, if not yet. (I doubt the
|
|
||||||
// second condition is ever possible)
|
|
||||||
if state.local_start_lsn == Lsn(0) || state.local_start_lsn >= msg.start_streaming_at {
|
|
||||||
state.local_start_lsn = msg.start_streaming_at;
|
state.local_start_lsn = msg.start_streaming_at;
|
||||||
info!("setting local_start_lsn to {:?}", state.local_start_lsn);
|
info!("setting local_start_lsn to {:?}", state.local_start_lsn);
|
||||||
}
|
}
|
||||||
|
// Initializing commit_lsn before acking first flushed record is
|
||||||
|
// important to let find_end_of_wal skip the hole in the beginning
|
||||||
|
// of the first segment.
|
||||||
|
//
|
||||||
|
// NB: on new clusters, this happens at the same time as
|
||||||
|
// timeline_start_lsn initialization, it is taken outside to provide
|
||||||
|
// upgrade.
|
||||||
|
self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn);
|
||||||
|
self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn);
|
||||||
|
self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64);
|
||||||
|
|
||||||
|
// Initializing backup_lsn is useful to avoid making backup think it should upload segment 0.
|
||||||
|
self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);
|
||||||
|
|
||||||
state.acceptor_state.term_history = msg.term_history.clone();
|
state.acceptor_state.term_history = msg.term_history.clone();
|
||||||
self.state.persist(&state)?;
|
self.persist_control_file(state)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("start receiving WAL since {:?}", msg.start_streaming_at);
|
info!("start receiving WAL since {:?}", msg.start_streaming_at);
|
||||||
@@ -772,25 +784,16 @@ where
|
|||||||
// that we receive new epoch_start_lsn, and we still need to sync
|
// that we receive new epoch_start_lsn, and we still need to sync
|
||||||
// control file in this case.
|
// control file in this case.
|
||||||
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
|
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
|
||||||
self.persist_control_file()?;
|
self.persist_control_file(self.state.clone())?;
|
||||||
}
|
|
||||||
|
|
||||||
// We got our first commit_lsn, which means we should sync
|
|
||||||
// everything to disk, to initialize the state.
|
|
||||||
if self.state.commit_lsn == Lsn(0) && commit_lsn > Lsn(0) {
|
|
||||||
self.wal_store.flush_wal()?;
|
|
||||||
self.persist_control_file()?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Persist in-memory state to the disk.
|
/// Persist in-memory state to the disk, taking other data from state.
|
||||||
fn persist_control_file(&mut self) -> Result<()> {
|
fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
|
||||||
let mut state = self.state.clone();
|
|
||||||
|
|
||||||
state.commit_lsn = self.inmem.commit_lsn;
|
state.commit_lsn = self.inmem.commit_lsn;
|
||||||
state.s3_wal_lsn = self.inmem.s3_wal_lsn;
|
state.backup_lsn = self.inmem.backup_lsn;
|
||||||
state.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
|
state.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
|
||||||
state.remote_consistent_lsn = self.inmem.remote_consistent_lsn;
|
state.remote_consistent_lsn = self.inmem.remote_consistent_lsn;
|
||||||
state.proposer_uuid = self.inmem.proposer_uuid;
|
state.proposer_uuid = self.inmem.proposer_uuid;
|
||||||
@@ -823,13 +826,6 @@ where
|
|||||||
// do the job
|
// do the job
|
||||||
if !msg.wal_data.is_empty() {
|
if !msg.wal_data.is_empty() {
|
||||||
self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?;
|
self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?;
|
||||||
|
|
||||||
// If this was the first record we ever received, initialize
|
|
||||||
// commit_lsn to help find_end_of_wal skip the hole in the
|
|
||||||
// beginning.
|
|
||||||
if self.global_commit_lsn == Lsn(0) {
|
|
||||||
self.global_commit_lsn = msg.h.begin_lsn;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// flush wal to the disk, if required
|
// flush wal to the disk, if required
|
||||||
@@ -852,7 +848,7 @@ where
|
|||||||
if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
|
if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
|
||||||
< self.inmem.peer_horizon_lsn
|
< self.inmem.peer_horizon_lsn
|
||||||
{
|
{
|
||||||
self.persist_control_file()?;
|
self.persist_control_file(self.state.clone())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
trace!(
|
trace!(
|
||||||
@@ -898,11 +894,11 @@ where
|
|||||||
self.update_commit_lsn()?;
|
self.update_commit_lsn()?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if let Some(s3_wal_lsn) = sk_info.s3_wal_lsn {
|
if let Some(backup_lsn) = sk_info.backup_lsn {
|
||||||
let new_s3_wal_lsn = max(s3_wal_lsn, self.inmem.s3_wal_lsn);
|
let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn);
|
||||||
sync_control_file |=
|
sync_control_file |=
|
||||||
self.state.s3_wal_lsn + (self.state.server.wal_seg_size as u64) < new_s3_wal_lsn;
|
self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn;
|
||||||
self.inmem.s3_wal_lsn = new_s3_wal_lsn;
|
self.inmem.backup_lsn = new_backup_lsn;
|
||||||
}
|
}
|
||||||
if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn {
|
if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn {
|
||||||
let new_remote_consistent_lsn =
|
let new_remote_consistent_lsn =
|
||||||
@@ -920,7 +916,7 @@ where
|
|||||||
self.inmem.peer_horizon_lsn = new_peer_horizon_lsn;
|
self.inmem.peer_horizon_lsn = new_peer_horizon_lsn;
|
||||||
}
|
}
|
||||||
if sync_control_file {
|
if sync_control_file {
|
||||||
self.persist_control_file()?;
|
self.persist_control_file(self.state.clone())?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -930,29 +926,23 @@ where
|
|||||||
/// offloading.
|
/// offloading.
|
||||||
/// While it is safe to use inmem values for determining horizon,
|
/// While it is safe to use inmem values for determining horizon,
|
||||||
/// we use persistent to make possible normal states less surprising.
|
/// we use persistent to make possible normal states less surprising.
|
||||||
pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo {
|
pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo {
|
||||||
let s3_offload_horizon = if s3_offload_enabled {
|
let mut horizon_lsn = min(
|
||||||
self.state.s3_wal_lsn
|
self.state.remote_consistent_lsn,
|
||||||
} else {
|
self.state.peer_horizon_lsn,
|
||||||
Lsn(u64::MAX)
|
|
||||||
};
|
|
||||||
let horizon_lsn = min(
|
|
||||||
min(
|
|
||||||
self.state.remote_consistent_lsn,
|
|
||||||
self.state.peer_horizon_lsn,
|
|
||||||
),
|
|
||||||
s3_offload_horizon,
|
|
||||||
);
|
);
|
||||||
|
if wal_backup_enabled {
|
||||||
|
horizon_lsn = min(horizon_lsn, self.state.backup_lsn);
|
||||||
|
}
|
||||||
horizon_lsn.segment_number(self.state.server.wal_seg_size as usize)
|
horizon_lsn.segment_number(self.state.server.wal_seg_size as usize)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::wal_storage::Storage;
|
use crate::wal_storage::Storage;
|
||||||
|
use std::ops::Deref;
|
||||||
|
|
||||||
// fake storage for tests
|
// fake storage for tests
|
||||||
struct InMemoryState {
|
struct InMemoryState {
|
||||||
@@ -1013,7 +1003,8 @@ mod tests {
|
|||||||
};
|
};
|
||||||
let wal_store = DummyWalStore { lsn: Lsn(0) };
|
let wal_store = DummyWalStore { lsn: Lsn(0) };
|
||||||
let ztli = ZTimelineId::from([0u8; 16]);
|
let ztli = ZTimelineId::from([0u8; 16]);
|
||||||
let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap();
|
|
||||||
|
let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap();
|
||||||
|
|
||||||
// check voting for 1 is ok
|
// check voting for 1 is ok
|
||||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
|
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
|
||||||
@@ -1028,7 +1019,8 @@ mod tests {
|
|||||||
let storage = InMemoryState {
|
let storage = InMemoryState {
|
||||||
persisted_state: state,
|
persisted_state: state,
|
||||||
};
|
};
|
||||||
sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap();
|
|
||||||
|
sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap();
|
||||||
|
|
||||||
// and ensure voting second time for 1 is not ok
|
// and ensure voting second time for 1 is not ok
|
||||||
vote_resp = sk.process_msg(&vote_request);
|
vote_resp = sk.process_msg(&vote_request);
|
||||||
@@ -1045,7 +1037,8 @@ mod tests {
|
|||||||
};
|
};
|
||||||
let wal_store = DummyWalStore { lsn: Lsn(0) };
|
let wal_store = DummyWalStore { lsn: Lsn(0) };
|
||||||
let ztli = ZTimelineId::from([0u8; 16]);
|
let ztli = ZTimelineId::from([0u8; 16]);
|
||||||
let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap();
|
|
||||||
|
let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap();
|
||||||
|
|
||||||
let mut ar_hdr = AppendRequestHeader {
|
let mut ar_hdr = AppendRequestHeader {
|
||||||
term: 1,
|
term: 1,
|
||||||
|
|||||||
@@ -315,7 +315,7 @@ impl ReplicationConn {
|
|||||||
} else {
|
} else {
|
||||||
// TODO: also check once in a while whether we are walsender
|
// TODO: also check once in a while whether we are walsender
|
||||||
// to right pageserver.
|
// to right pageserver.
|
||||||
if spg.timeline.get().check_deactivate(replica_id)? {
|
if spg.timeline.get().stop_walsender(replica_id)? {
|
||||||
// Shut down, timeline is suspended.
|
// Shut down, timeline is suspended.
|
||||||
// TODO create proper error type for this
|
// TODO create proper error type for this
|
||||||
bail!("end streaming to {:?}", spg.appname);
|
bail!("end streaming to {:?}", spg.appname);
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ use lazy_static::lazy_static;
|
|||||||
use postgres_ffi::xlog_utils::XLogSegNo;
|
use postgres_ffi::xlog_utils::XLogSegNo;
|
||||||
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
use tokio::sync::watch;
|
||||||
|
|
||||||
use std::cmp::{max, min};
|
use std::cmp::{max, min};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
@@ -15,23 +16,23 @@ use std::fs::{self};
|
|||||||
|
|
||||||
use std::sync::{Arc, Condvar, Mutex, MutexGuard};
|
use std::sync::{Arc, Condvar, Mutex, MutexGuard};
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tokio::sync::mpsc::UnboundedSender;
|
use tokio::sync::mpsc::{Sender, UnboundedSender};
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use utils::{
|
use utils::{
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
pq_proto::ZenithFeedback,
|
pq_proto::ZenithFeedback,
|
||||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
|
zid::{NodeId, ZTenantId, ZTenantTimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey};
|
use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey};
|
||||||
|
|
||||||
use crate::control_file;
|
use crate::control_file;
|
||||||
use crate::safekeeper::{
|
use crate::safekeeper::{
|
||||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
|
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
|
||||||
SafekeeperMemState,
|
SafekeeperMemState,
|
||||||
};
|
};
|
||||||
use crate::send_wal::HotStandbyFeedback;
|
use crate::send_wal::HotStandbyFeedback;
|
||||||
|
|
||||||
use crate::wal_storage;
|
use crate::wal_storage;
|
||||||
use crate::wal_storage::Storage as wal_storage_iface;
|
use crate::wal_storage::Storage as wal_storage_iface;
|
||||||
use crate::SafeKeeperConf;
|
use crate::SafeKeeperConf;
|
||||||
@@ -81,10 +82,14 @@ struct SharedState {
|
|||||||
notified_commit_lsn: Lsn,
|
notified_commit_lsn: Lsn,
|
||||||
/// State of replicas
|
/// State of replicas
|
||||||
replicas: Vec<Option<ReplicaState>>,
|
replicas: Vec<Option<ReplicaState>>,
|
||||||
/// Inactive clusters shouldn't occupy any resources, so timeline is
|
/// True when WAL backup launcher oversees the timeline, making sure WAL is
|
||||||
/// activated whenever there is a compute connection or pageserver is not
|
/// offloaded, allows to bother launcher less.
|
||||||
/// caughtup (it must have latest WAL for new compute start) and suspended
|
wal_backup_active: bool,
|
||||||
/// otherwise.
|
/// True whenever there is at least some pending activity on timeline: live
|
||||||
|
/// compute connection, pageserver is not caughtup (it must have latest WAL
|
||||||
|
/// for new compute start) or WAL backup is not finished. Practically it
|
||||||
|
/// means safekeepers broadcast info to peers about the timeline, old WAL is
|
||||||
|
/// trimmed.
|
||||||
///
|
///
|
||||||
/// TODO: it might be better to remove tli completely from GlobalTimelines
|
/// TODO: it might be better to remove tli completely from GlobalTimelines
|
||||||
/// when tli is inactive instead of having this flag.
|
/// when tli is inactive instead of having this flag.
|
||||||
@@ -99,10 +104,11 @@ impl SharedState {
|
|||||||
fn create(
|
fn create(
|
||||||
conf: &SafeKeeperConf,
|
conf: &SafeKeeperConf,
|
||||||
zttid: &ZTenantTimelineId,
|
zttid: &ZTenantTimelineId,
|
||||||
peer_ids: Vec<ZNodeId>,
|
peer_ids: Vec<NodeId>,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
let state = SafeKeeperState::new(zttid, peer_ids);
|
let state = SafeKeeperState::new(zttid, peer_ids);
|
||||||
let control_store = control_file::FileStorage::create_new(zttid, conf, state)?;
|
let control_store = control_file::FileStorage::create_new(zttid, conf, state)?;
|
||||||
|
|
||||||
let wal_store = wal_storage::PhysicalStorage::new(zttid, conf);
|
let wal_store = wal_storage::PhysicalStorage::new(zttid, conf);
|
||||||
let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?;
|
let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?;
|
||||||
|
|
||||||
@@ -110,6 +116,7 @@ impl SharedState {
|
|||||||
notified_commit_lsn: Lsn(0),
|
notified_commit_lsn: Lsn(0),
|
||||||
sk,
|
sk,
|
||||||
replicas: Vec::new(),
|
replicas: Vec::new(),
|
||||||
|
wal_backup_active: false,
|
||||||
active: false,
|
active: false,
|
||||||
num_computes: 0,
|
num_computes: 0,
|
||||||
pageserver_connstr: None,
|
pageserver_connstr: None,
|
||||||
@@ -129,15 +136,62 @@ impl SharedState {
|
|||||||
notified_commit_lsn: Lsn(0),
|
notified_commit_lsn: Lsn(0),
|
||||||
sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?,
|
sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?,
|
||||||
replicas: Vec::new(),
|
replicas: Vec::new(),
|
||||||
|
wal_backup_active: false,
|
||||||
active: false,
|
active: false,
|
||||||
num_computes: 0,
|
num_computes: 0,
|
||||||
pageserver_connstr: None,
|
pageserver_connstr: None,
|
||||||
last_removed_segno: 0,
|
last_removed_segno: 0,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
fn is_active(&self) -> bool {
|
||||||
|
self.is_wal_backup_required()
|
||||||
|
// FIXME: add tracking of relevant pageservers and check them here individually,
|
||||||
|
// otherwise migration won't work (we suspend too early).
|
||||||
|
|| self.sk.inmem.remote_consistent_lsn <= self.sk.inmem.commit_lsn
|
||||||
|
}
|
||||||
|
|
||||||
/// Activate the timeline: start/change walsender (via callmemaybe).
|
/// Mark timeline active/inactive and return whether s3 offloading requires
|
||||||
fn activate(
|
/// start/stop action.
|
||||||
|
fn update_status(&mut self) -> bool {
|
||||||
|
self.active = self.is_active();
|
||||||
|
self.is_wal_backup_action_pending()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Should we run s3 offloading in current state?
|
||||||
|
fn is_wal_backup_required(&self) -> bool {
|
||||||
|
let seg_size = self.get_wal_seg_size();
|
||||||
|
self.num_computes > 0 ||
|
||||||
|
// Currently only the whole segment is offloaded, so compare segment numbers.
|
||||||
|
(self.sk.inmem.commit_lsn.segment_number(seg_size) >
|
||||||
|
self.sk.inmem.backup_lsn.segment_number(seg_size))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Is the current state of s3 offloading not what it ought to be?
|
||||||
|
fn is_wal_backup_action_pending(&self) -> bool {
|
||||||
|
let res = self.wal_backup_active != self.is_wal_backup_required();
|
||||||
|
if res {
|
||||||
|
let action_pending = if self.is_wal_backup_required() {
|
||||||
|
"start"
|
||||||
|
} else {
|
||||||
|
"stop"
|
||||||
|
};
|
||||||
|
trace!(
|
||||||
|
"timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}",
|
||||||
|
self.sk.state.timeline_id, action_pending, self.num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn
|
||||||
|
);
|
||||||
|
}
|
||||||
|
res
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns whether s3 offloading is required and sets current status as
|
||||||
|
/// matching.
|
||||||
|
fn wal_backup_attend(&mut self) -> bool {
|
||||||
|
self.wal_backup_active = self.is_wal_backup_required();
|
||||||
|
self.wal_backup_active
|
||||||
|
}
|
||||||
|
|
||||||
|
/// start/change walsender (via callmemaybe).
|
||||||
|
fn callmemaybe_sub(
|
||||||
&mut self,
|
&mut self,
|
||||||
zttid: &ZTenantTimelineId,
|
zttid: &ZTenantTimelineId,
|
||||||
pageserver_connstr: Option<&String>,
|
pageserver_connstr: Option<&String>,
|
||||||
@@ -179,42 +233,42 @@ impl SharedState {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned());
|
self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned());
|
||||||
self.active = true;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Deactivate the timeline: stop callmemaybe.
|
/// Deactivate the timeline: stop callmemaybe.
|
||||||
fn deactivate(
|
fn callmemaybe_unsub(
|
||||||
&mut self,
|
&mut self,
|
||||||
zttid: &ZTenantTimelineId,
|
zttid: &ZTenantTimelineId,
|
||||||
callmemaybe_tx: &UnboundedSender<CallmeEvent>,
|
callmemaybe_tx: &UnboundedSender<CallmeEvent>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if self.active {
|
if let Some(ref pageserver_connstr) = self.pageserver_connstr {
|
||||||
if let Some(ref pageserver_connstr) = self.pageserver_connstr {
|
let subscription_key = SubscriptionStateKey::new(
|
||||||
let subscription_key = SubscriptionStateKey::new(
|
zttid.tenant_id,
|
||||||
zttid.tenant_id,
|
zttid.timeline_id,
|
||||||
zttid.timeline_id,
|
pageserver_connstr.to_owned(),
|
||||||
pageserver_connstr.to_owned(),
|
);
|
||||||
);
|
callmemaybe_tx
|
||||||
callmemaybe_tx
|
.send(CallmeEvent::Unsubscribe(subscription_key))
|
||||||
.send(CallmeEvent::Unsubscribe(subscription_key))
|
.unwrap_or_else(|e| {
|
||||||
.unwrap_or_else(|e| {
|
error!(
|
||||||
error!(
|
"failed to send Unsubscribe request to callmemaybe thread {}",
|
||||||
"failed to send Unsubscribe request to callmemaybe thread {}",
|
e
|
||||||
e
|
);
|
||||||
);
|
});
|
||||||
});
|
info!(
|
||||||
info!(
|
"timeline {} is unsubscribed from callmemaybe to {}",
|
||||||
"timeline {} is unsubscribed from callmemaybe to {}",
|
zttid.timeline_id,
|
||||||
zttid.timeline_id,
|
self.pageserver_connstr.as_ref().unwrap()
|
||||||
self.pageserver_connstr.as_ref().unwrap()
|
);
|
||||||
);
|
|
||||||
}
|
|
||||||
self.active = false;
|
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_wal_seg_size(&self) -> usize {
|
||||||
|
self.sk.state.server.wal_seg_size as usize
|
||||||
|
}
|
||||||
|
|
||||||
/// Get combined state of all alive replicas
|
/// Get combined state of all alive replicas
|
||||||
pub fn get_replicas_state(&self) -> ReplicaState {
|
pub fn get_replicas_state(&self) -> ReplicaState {
|
||||||
let mut acc = ReplicaState::new();
|
let mut acc = ReplicaState::new();
|
||||||
@@ -278,6 +332,13 @@ impl SharedState {
|
|||||||
pub struct Timeline {
|
pub struct Timeline {
|
||||||
pub zttid: ZTenantTimelineId,
|
pub zttid: ZTenantTimelineId,
|
||||||
pub callmemaybe_tx: UnboundedSender<CallmeEvent>,
|
pub callmemaybe_tx: UnboundedSender<CallmeEvent>,
|
||||||
|
/// Sending here asks for wal backup launcher attention (start/stop
|
||||||
|
/// offloading). Sending zttid instead of concrete command allows to do
|
||||||
|
/// sending without timeline lock.
|
||||||
|
wal_backup_launcher_tx: Sender<ZTenantTimelineId>,
|
||||||
|
commit_lsn_watch_tx: watch::Sender<Lsn>,
|
||||||
|
/// For breeding receivers.
|
||||||
|
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||||
mutex: Mutex<SharedState>,
|
mutex: Mutex<SharedState>,
|
||||||
/// conditional variable used to notify wal senders
|
/// conditional variable used to notify wal senders
|
||||||
cond: Condvar,
|
cond: Condvar,
|
||||||
@@ -287,11 +348,17 @@ impl Timeline {
|
|||||||
fn new(
|
fn new(
|
||||||
zttid: ZTenantTimelineId,
|
zttid: ZTenantTimelineId,
|
||||||
callmemaybe_tx: UnboundedSender<CallmeEvent>,
|
callmemaybe_tx: UnboundedSender<CallmeEvent>,
|
||||||
|
wal_backup_launcher_tx: Sender<ZTenantTimelineId>,
|
||||||
shared_state: SharedState,
|
shared_state: SharedState,
|
||||||
) -> Timeline {
|
) -> Timeline {
|
||||||
|
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
|
||||||
|
watch::channel(shared_state.sk.inmem.commit_lsn);
|
||||||
Timeline {
|
Timeline {
|
||||||
zttid,
|
zttid,
|
||||||
callmemaybe_tx,
|
callmemaybe_tx,
|
||||||
|
wal_backup_launcher_tx,
|
||||||
|
commit_lsn_watch_tx,
|
||||||
|
commit_lsn_watch_rx,
|
||||||
mutex: Mutex::new(shared_state),
|
mutex: Mutex::new(shared_state),
|
||||||
cond: Condvar::new(),
|
cond: Condvar::new(),
|
||||||
}
|
}
|
||||||
@@ -301,13 +368,21 @@ impl Timeline {
|
|||||||
/// not running yet.
|
/// not running yet.
|
||||||
/// Can fail only if channel to a static thread got closed, which is not normal at all.
|
/// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||||
pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> {
|
pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> {
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
let is_wal_backup_action_pending: bool;
|
||||||
shared_state.num_computes += 1;
|
{
|
||||||
// FIXME: currently we always adopt latest pageserver connstr, but we
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
// should have kind of generations assigned by compute to distinguish
|
shared_state.num_computes += 1;
|
||||||
// the latest one or even pass it through consensus to reliably deliver
|
is_wal_backup_action_pending = shared_state.update_status();
|
||||||
// to all safekeepers.
|
// FIXME: currently we always adopt latest pageserver connstr, but we
|
||||||
shared_state.activate(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?;
|
// should have kind of generations assigned by compute to distinguish
|
||||||
|
// the latest one or even pass it through consensus to reliably deliver
|
||||||
|
// to all safekeepers.
|
||||||
|
shared_state.callmemaybe_sub(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?;
|
||||||
|
}
|
||||||
|
// Wake up wal backup launcher, if offloading not started yet.
|
||||||
|
if is_wal_backup_action_pending {
|
||||||
|
self.wal_backup_launcher_tx.blocking_send(self.zttid)?;
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -315,38 +390,43 @@ impl Timeline {
|
|||||||
/// pageserver doesn't need catchup.
|
/// pageserver doesn't need catchup.
|
||||||
/// Can fail only if channel to a static thread got closed, which is not normal at all.
|
/// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||||
pub fn on_compute_disconnect(&self) -> Result<()> {
|
pub fn on_compute_disconnect(&self) -> Result<()> {
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
let is_wal_backup_action_pending: bool;
|
||||||
shared_state.num_computes -= 1;
|
{
|
||||||
// If there is no pageserver, can suspend right away; otherwise let
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
// walsender do that.
|
shared_state.num_computes -= 1;
|
||||||
if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() {
|
is_wal_backup_action_pending = shared_state.update_status();
|
||||||
shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?;
|
}
|
||||||
|
// Wake up wal backup launcher, if it is time to stop the offloading.
|
||||||
|
if is_wal_backup_action_pending {
|
||||||
|
self.wal_backup_launcher_tx.blocking_send(self.zttid)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Deactivate tenant if there is no computes and pageserver is caughtup,
|
/// Whether we still need this walsender running?
|
||||||
/// assuming the pageserver status is in replica_id.
|
/// TODO: check this pageserver is actually interested in this timeline.
|
||||||
/// Returns true if deactivated.
|
pub fn stop_walsender(&self, replica_id: usize) -> Result<bool> {
|
||||||
pub fn check_deactivate(&self, replica_id: usize) -> Result<bool> {
|
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
if !shared_state.active {
|
|
||||||
// already suspended
|
|
||||||
return Ok(true);
|
|
||||||
}
|
|
||||||
if shared_state.num_computes == 0 {
|
if shared_state.num_computes == 0 {
|
||||||
let replica_state = shared_state.replicas[replica_id].unwrap();
|
let replica_state = shared_state.replicas[replica_id].unwrap();
|
||||||
let deactivate = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet
|
let stop = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet
|
||||||
(replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
|
(replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
|
||||||
replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn);
|
replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);
|
||||||
if deactivate {
|
if stop {
|
||||||
shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?;
|
shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?;
|
||||||
return Ok(true);
|
return Ok(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(false)
|
Ok(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns whether s3 offloading is required and sets current status as
|
||||||
|
/// matching it.
|
||||||
|
pub fn wal_backup_attend(&self) -> bool {
|
||||||
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
|
shared_state.wal_backup_attend()
|
||||||
|
}
|
||||||
|
|
||||||
/// Deactivates the timeline, assuming it is being deleted.
|
/// Deactivates the timeline, assuming it is being deleted.
|
||||||
/// Returns whether the timeline was already active.
|
/// Returns whether the timeline was already active.
|
||||||
///
|
///
|
||||||
@@ -354,10 +434,14 @@ impl Timeline {
|
|||||||
/// will stop by themselves eventually (possibly with errors, but no panics). There should be no
|
/// will stop by themselves eventually (possibly with errors, but no panics). There should be no
|
||||||
/// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but
|
/// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but
|
||||||
/// we're deleting the timeline anyway.
|
/// we're deleting the timeline anyway.
|
||||||
pub fn deactivate_for_delete(&self) -> Result<bool> {
|
pub async fn deactivate_for_delete(&self) -> Result<bool> {
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
let was_active: bool;
|
||||||
let was_active = shared_state.active;
|
{
|
||||||
shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?;
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
|
was_active = shared_state.active;
|
||||||
|
shared_state.callmemaybe_unsub(&self.zttid, &self.callmemaybe_tx)?;
|
||||||
|
}
|
||||||
|
self.wal_backup_launcher_tx.send(self.zttid).await?;
|
||||||
Ok(was_active)
|
Ok(was_active)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -391,6 +475,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Notify caught-up WAL senders about new WAL data received
|
// Notify caught-up WAL senders about new WAL data received
|
||||||
|
// TODO: replace-unify it with commit_lsn_watch.
|
||||||
fn notify_wal_senders(&self, shared_state: &mut MutexGuard<SharedState>) {
|
fn notify_wal_senders(&self, shared_state: &mut MutexGuard<SharedState>) {
|
||||||
if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn {
|
if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn {
|
||||||
shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn;
|
shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn;
|
||||||
@@ -398,12 +483,17 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
|
||||||
|
self.commit_lsn_watch_rx.clone()
|
||||||
|
}
|
||||||
|
|
||||||
/// Pass arrived message to the safekeeper.
|
/// Pass arrived message to the safekeeper.
|
||||||
pub fn process_msg(
|
pub fn process_msg(
|
||||||
&self,
|
&self,
|
||||||
msg: &ProposerAcceptorMessage,
|
msg: &ProposerAcceptorMessage,
|
||||||
) -> Result<Option<AcceptorProposerMessage>> {
|
) -> Result<Option<AcceptorProposerMessage>> {
|
||||||
let mut rmsg: Option<AcceptorProposerMessage>;
|
let mut rmsg: Option<AcceptorProposerMessage>;
|
||||||
|
let commit_lsn: Lsn;
|
||||||
{
|
{
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
rmsg = shared_state.sk.process_msg(msg)?;
|
rmsg = shared_state.sk.process_msg(msg)?;
|
||||||
@@ -419,15 +509,31 @@ impl Timeline {
|
|||||||
|
|
||||||
// Ping wal sender that new data might be available.
|
// Ping wal sender that new data might be available.
|
||||||
self.notify_wal_senders(&mut shared_state);
|
self.notify_wal_senders(&mut shared_state);
|
||||||
|
commit_lsn = shared_state.sk.inmem.commit_lsn;
|
||||||
}
|
}
|
||||||
|
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||||
Ok(rmsg)
|
Ok(rmsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_wal_seg_size(&self) -> usize {
|
||||||
|
self.mutex.lock().unwrap().get_wal_seg_size()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
|
pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
|
||||||
let shared_state = self.mutex.lock().unwrap();
|
let shared_state = self.mutex.lock().unwrap();
|
||||||
(shared_state.sk.inmem.clone(), shared_state.sk.state.clone())
|
(shared_state.sk.inmem.clone(), shared_state.sk.state.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_wal_backup_lsn(&self) -> Lsn {
|
||||||
|
self.mutex.lock().unwrap().sk.inmem.backup_lsn
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) {
|
||||||
|
self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn;
|
||||||
|
// we should check whether to shut down offloader, but this will be done
|
||||||
|
// soon by peer communication anyway.
|
||||||
|
}
|
||||||
|
|
||||||
/// Prepare public safekeeper info for reporting.
|
/// Prepare public safekeeper info for reporting.
|
||||||
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result<SkTimelineInfo> {
|
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result<SkTimelineInfo> {
|
||||||
let shared_state = self.mutex.lock().unwrap();
|
let shared_state = self.mutex.lock().unwrap();
|
||||||
@@ -436,7 +542,6 @@ impl Timeline {
|
|||||||
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
|
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
|
||||||
// note: this value is not flushed to control file yet and can be lost
|
// note: this value is not flushed to control file yet and can be lost
|
||||||
commit_lsn: Some(shared_state.sk.inmem.commit_lsn),
|
commit_lsn: Some(shared_state.sk.inmem.commit_lsn),
|
||||||
s3_wal_lsn: Some(shared_state.sk.inmem.s3_wal_lsn),
|
|
||||||
// TODO: rework feedbacks to avoid max here
|
// TODO: rework feedbacks to avoid max here
|
||||||
remote_consistent_lsn: Some(max(
|
remote_consistent_lsn: Some(max(
|
||||||
shared_state.get_replicas_state().remote_consistent_lsn,
|
shared_state.get_replicas_state().remote_consistent_lsn,
|
||||||
@@ -444,14 +549,35 @@ impl Timeline {
|
|||||||
)),
|
)),
|
||||||
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
|
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
|
||||||
safekeeper_connection_string: Some(conf.listen_pg_addr.clone()),
|
safekeeper_connection_string: Some(conf.listen_pg_addr.clone()),
|
||||||
|
backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update timeline state with peer safekeeper data.
|
/// Update timeline state with peer safekeeper data.
|
||||||
pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> {
|
pub async fn record_safekeeper_info(
|
||||||
let mut shared_state = self.mutex.lock().unwrap();
|
&self,
|
||||||
shared_state.sk.record_safekeeper_info(sk_info)?;
|
sk_info: &SkTimelineInfo,
|
||||||
self.notify_wal_senders(&mut shared_state);
|
_sk_id: NodeId,
|
||||||
|
) -> Result<()> {
|
||||||
|
let is_wal_backup_action_pending: bool;
|
||||||
|
let commit_lsn: Lsn;
|
||||||
|
{
|
||||||
|
let mut shared_state = self.mutex.lock().unwrap();
|
||||||
|
// WAL seg size not initialized yet (no message from compute ever
|
||||||
|
// received), can't do much without it.
|
||||||
|
if shared_state.get_wal_seg_size() == 0 {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
shared_state.sk.record_safekeeper_info(sk_info)?;
|
||||||
|
self.notify_wal_senders(&mut shared_state);
|
||||||
|
is_wal_backup_action_pending = shared_state.update_status();
|
||||||
|
commit_lsn = shared_state.sk.inmem.commit_lsn;
|
||||||
|
}
|
||||||
|
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||||
|
// Wake up wal backup launcher, if it is time to stop the offloading.
|
||||||
|
if is_wal_backup_action_pending {
|
||||||
|
self.wal_backup_launcher_tx.send(self.zttid).await?;
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -476,16 +602,16 @@ impl Timeline {
|
|||||||
shared_state.sk.wal_store.flush_lsn()
|
shared_state.sk.wal_store.flush_lsn()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> {
|
pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
|
||||||
let horizon_segno: XLogSegNo;
|
let horizon_segno: XLogSegNo;
|
||||||
let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
|
let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
|
||||||
{
|
{
|
||||||
let shared_state = self.mutex.lock().unwrap();
|
let shared_state = self.mutex.lock().unwrap();
|
||||||
// WAL seg size not initialized yet, no WAL exists.
|
// WAL seg size not initialized yet, no WAL exists.
|
||||||
if shared_state.sk.state.server.wal_seg_size == 0 {
|
if shared_state.get_wal_seg_size() == 0 {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled);
|
horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled);
|
||||||
remover = shared_state.sk.wal_store.remove_up_to();
|
remover = shared_state.sk.wal_store.remove_up_to();
|
||||||
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
|
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
@@ -522,12 +648,14 @@ impl TimelineTools for Option<Arc<Timeline>> {
|
|||||||
struct GlobalTimelinesState {
|
struct GlobalTimelinesState {
|
||||||
timelines: HashMap<ZTenantTimelineId, Arc<Timeline>>,
|
timelines: HashMap<ZTenantTimelineId, Arc<Timeline>>,
|
||||||
callmemaybe_tx: Option<UnboundedSender<CallmeEvent>>,
|
callmemaybe_tx: Option<UnboundedSender<CallmeEvent>>,
|
||||||
|
wal_backup_launcher_tx: Option<Sender<ZTenantTimelineId>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref TIMELINES_STATE: Mutex<GlobalTimelinesState> = Mutex::new(GlobalTimelinesState {
|
static ref TIMELINES_STATE: Mutex<GlobalTimelinesState> = Mutex::new(GlobalTimelinesState {
|
||||||
timelines: HashMap::new(),
|
timelines: HashMap::new(),
|
||||||
callmemaybe_tx: None
|
callmemaybe_tx: None,
|
||||||
|
wal_backup_launcher_tx: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -541,17 +669,22 @@ pub struct TimelineDeleteForceResult {
|
|||||||
pub struct GlobalTimelines;
|
pub struct GlobalTimelines;
|
||||||
|
|
||||||
impl GlobalTimelines {
|
impl GlobalTimelines {
|
||||||
pub fn set_callmemaybe_tx(callmemaybe_tx: UnboundedSender<CallmeEvent>) {
|
pub fn init(
|
||||||
|
callmemaybe_tx: UnboundedSender<CallmeEvent>,
|
||||||
|
wal_backup_launcher_tx: Sender<ZTenantTimelineId>,
|
||||||
|
) {
|
||||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||||
assert!(state.callmemaybe_tx.is_none());
|
assert!(state.callmemaybe_tx.is_none());
|
||||||
state.callmemaybe_tx = Some(callmemaybe_tx);
|
state.callmemaybe_tx = Some(callmemaybe_tx);
|
||||||
|
assert!(state.wal_backup_launcher_tx.is_none());
|
||||||
|
state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn create_internal(
|
fn create_internal(
|
||||||
mut state: MutexGuard<GlobalTimelinesState>,
|
mut state: MutexGuard<GlobalTimelinesState>,
|
||||||
conf: &SafeKeeperConf,
|
conf: &SafeKeeperConf,
|
||||||
zttid: ZTenantTimelineId,
|
zttid: ZTenantTimelineId,
|
||||||
peer_ids: Vec<ZNodeId>,
|
peer_ids: Vec<NodeId>,
|
||||||
) -> Result<Arc<Timeline>> {
|
) -> Result<Arc<Timeline>> {
|
||||||
match state.timelines.get(&zttid) {
|
match state.timelines.get(&zttid) {
|
||||||
Some(_) => bail!("timeline {} already exists", zttid),
|
Some(_) => bail!("timeline {} already exists", zttid),
|
||||||
@@ -559,12 +692,14 @@ impl GlobalTimelines {
|
|||||||
// TODO: check directory existence
|
// TODO: check directory existence
|
||||||
let dir = conf.timeline_dir(&zttid);
|
let dir = conf.timeline_dir(&zttid);
|
||||||
fs::create_dir_all(dir)?;
|
fs::create_dir_all(dir)?;
|
||||||
|
|
||||||
let shared_state = SharedState::create(conf, &zttid, peer_ids)
|
let shared_state = SharedState::create(conf, &zttid, peer_ids)
|
||||||
.context("failed to create shared state")?;
|
.context("failed to create shared state")?;
|
||||||
|
|
||||||
let new_tli = Arc::new(Timeline::new(
|
let new_tli = Arc::new(Timeline::new(
|
||||||
zttid,
|
zttid,
|
||||||
state.callmemaybe_tx.as_ref().unwrap().clone(),
|
state.callmemaybe_tx.as_ref().unwrap().clone(),
|
||||||
|
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
|
||||||
shared_state,
|
shared_state,
|
||||||
));
|
));
|
||||||
state.timelines.insert(zttid, Arc::clone(&new_tli));
|
state.timelines.insert(zttid, Arc::clone(&new_tli));
|
||||||
@@ -576,7 +711,7 @@ impl GlobalTimelines {
|
|||||||
pub fn create(
|
pub fn create(
|
||||||
conf: &SafeKeeperConf,
|
conf: &SafeKeeperConf,
|
||||||
zttid: ZTenantTimelineId,
|
zttid: ZTenantTimelineId,
|
||||||
peer_ids: Vec<ZNodeId>,
|
peer_ids: Vec<NodeId>,
|
||||||
) -> Result<Arc<Timeline>> {
|
) -> Result<Arc<Timeline>> {
|
||||||
let state = TIMELINES_STATE.lock().unwrap();
|
let state = TIMELINES_STATE.lock().unwrap();
|
||||||
GlobalTimelines::create_internal(state, conf, zttid, peer_ids)
|
GlobalTimelines::create_internal(state, conf, zttid, peer_ids)
|
||||||
@@ -594,8 +729,7 @@ impl GlobalTimelines {
|
|||||||
match state.timelines.get(&zttid) {
|
match state.timelines.get(&zttid) {
|
||||||
Some(result) => Ok(Arc::clone(result)),
|
Some(result) => Ok(Arc::clone(result)),
|
||||||
None => {
|
None => {
|
||||||
let shared_state =
|
let shared_state = SharedState::restore(conf, &zttid);
|
||||||
SharedState::restore(conf, &zttid).context("failed to restore shared state");
|
|
||||||
|
|
||||||
let shared_state = match shared_state {
|
let shared_state = match shared_state {
|
||||||
Ok(shared_state) => shared_state,
|
Ok(shared_state) => shared_state,
|
||||||
@@ -617,6 +751,7 @@ impl GlobalTimelines {
|
|||||||
let new_tli = Arc::new(Timeline::new(
|
let new_tli = Arc::new(Timeline::new(
|
||||||
zttid,
|
zttid,
|
||||||
state.callmemaybe_tx.as_ref().unwrap().clone(),
|
state.callmemaybe_tx.as_ref().unwrap().clone(),
|
||||||
|
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
|
||||||
shared_state,
|
shared_state,
|
||||||
));
|
));
|
||||||
state.timelines.insert(zttid, Arc::clone(&new_tli));
|
state.timelines.insert(zttid, Arc::clone(&new_tli));
|
||||||
@@ -625,6 +760,12 @@ impl GlobalTimelines {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get loaded timeline, if it exists.
|
||||||
|
pub fn get_loaded(zttid: ZTenantTimelineId) -> Option<Arc<Timeline>> {
|
||||||
|
let state = TIMELINES_STATE.lock().unwrap();
|
||||||
|
state.timelines.get(&zttid).map(Arc::clone)
|
||||||
|
}
|
||||||
|
|
||||||
/// Get ZTenantTimelineIDs of all active timelines.
|
/// Get ZTenantTimelineIDs of all active timelines.
|
||||||
pub fn get_active_timelines() -> Vec<ZTenantTimelineId> {
|
pub fn get_active_timelines() -> Vec<ZTenantTimelineId> {
|
||||||
let state = TIMELINES_STATE.lock().unwrap();
|
let state = TIMELINES_STATE.lock().unwrap();
|
||||||
@@ -665,22 +806,23 @@ impl GlobalTimelines {
|
|||||||
/// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or
|
/// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or
|
||||||
/// c) an HTTP POST request for timeline creation is made after the timeline is already deleted.
|
/// c) an HTTP POST request for timeline creation is made after the timeline is already deleted.
|
||||||
/// TODO: ensure all of the above never happens.
|
/// TODO: ensure all of the above never happens.
|
||||||
pub fn delete_force(
|
pub async fn delete_force(
|
||||||
conf: &SafeKeeperConf,
|
conf: &SafeKeeperConf,
|
||||||
zttid: &ZTenantTimelineId,
|
zttid: &ZTenantTimelineId,
|
||||||
) -> Result<TimelineDeleteForceResult> {
|
) -> Result<TimelineDeleteForceResult> {
|
||||||
info!("deleting timeline {}", zttid);
|
info!("deleting timeline {}", zttid);
|
||||||
let was_active = match TIMELINES_STATE.lock().unwrap().timelines.remove(zttid) {
|
let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid);
|
||||||
None => false,
|
let mut was_active = false;
|
||||||
Some(tli) => tli.deactivate_for_delete()?,
|
if let Some(tli) = timeline {
|
||||||
};
|
was_active = tli.deactivate_for_delete().await?;
|
||||||
|
}
|
||||||
GlobalTimelines::delete_force_internal(conf, zttid, was_active)
|
GlobalTimelines::delete_force_internal(conf, zttid, was_active)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Deactivates and deletes all timelines for the tenant, see `delete()`.
|
/// Deactivates and deletes all timelines for the tenant, see `delete()`.
|
||||||
/// Returns map of all timelines which the tenant had, `true` if a timeline was active.
|
/// Returns map of all timelines which the tenant had, `true` if a timeline was active.
|
||||||
/// There may be a race if new timelines are created simultaneously.
|
/// There may be a race if new timelines are created simultaneously.
|
||||||
pub fn delete_force_all_for_tenant(
|
pub async fn delete_force_all_for_tenant(
|
||||||
conf: &SafeKeeperConf,
|
conf: &SafeKeeperConf,
|
||||||
tenant_id: &ZTenantId,
|
tenant_id: &ZTenantId,
|
||||||
) -> Result<HashMap<ZTenantTimelineId, TimelineDeleteForceResult>> {
|
) -> Result<HashMap<ZTenantTimelineId, TimelineDeleteForceResult>> {
|
||||||
@@ -691,14 +833,15 @@ impl GlobalTimelines {
|
|||||||
let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines;
|
let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines;
|
||||||
for (&zttid, tli) in timelines.iter() {
|
for (&zttid, tli) in timelines.iter() {
|
||||||
if zttid.tenant_id == *tenant_id {
|
if zttid.tenant_id == *tenant_id {
|
||||||
to_delete.insert(zttid, tli.deactivate_for_delete()?);
|
to_delete.insert(zttid, tli.clone());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently.
|
// TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently.
|
||||||
timelines.retain(|zttid, _| !to_delete.contains_key(zttid));
|
timelines.retain(|zttid, _| !to_delete.contains_key(zttid));
|
||||||
}
|
}
|
||||||
let mut deleted = HashMap::new();
|
let mut deleted = HashMap::new();
|
||||||
for (zttid, was_active) in to_delete {
|
for (zttid, timeline) in to_delete {
|
||||||
|
let was_active = timeline.deactivate_for_delete().await?;
|
||||||
deleted.insert(
|
deleted.insert(
|
||||||
zttid,
|
zttid,
|
||||||
GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?,
|
GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?,
|
||||||
|
|||||||
417
safekeeper/src/wal_backup.rs
Normal file
417
safekeeper/src/wal_backup.rs
Normal file
@@ -0,0 +1,417 @@
|
|||||||
|
use anyhow::{Context, Result};
|
||||||
|
use tokio::task::JoinHandle;
|
||||||
|
|
||||||
|
use std::cmp::min;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI};
|
||||||
|
use remote_storage::{GenericRemoteStorage, RemoteStorage};
|
||||||
|
use tokio::fs::File;
|
||||||
|
use tokio::runtime::Builder;
|
||||||
|
|
||||||
|
use tokio::select;
|
||||||
|
use tokio::sync::mpsc::{self, Receiver, Sender};
|
||||||
|
use tokio::sync::watch;
|
||||||
|
use tokio::time::sleep;
|
||||||
|
use tracing::*;
|
||||||
|
|
||||||
|
use utils::{lsn::Lsn, zid::ZTenantTimelineId};
|
||||||
|
|
||||||
|
use crate::broker::{Election, ElectionLeader};
|
||||||
|
use crate::timeline::{GlobalTimelines, Timeline};
|
||||||
|
use crate::{broker, SafeKeeperConf};
|
||||||
|
|
||||||
|
use once_cell::sync::OnceCell;
|
||||||
|
|
||||||
|
const BACKUP_ELECTION_NAME: &str = "WAL_BACKUP";
|
||||||
|
|
||||||
|
const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000;
|
||||||
|
|
||||||
|
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
|
||||||
|
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
|
||||||
|
|
||||||
|
pub fn wal_backup_launcher_thread_main(
|
||||||
|
conf: SafeKeeperConf,
|
||||||
|
wal_backup_launcher_rx: Receiver<ZTenantTimelineId>,
|
||||||
|
) {
|
||||||
|
let rt = Builder::new_multi_thread()
|
||||||
|
.worker_threads(conf.backup_runtime_threads)
|
||||||
|
.enable_all()
|
||||||
|
.build()
|
||||||
|
.expect("failed to create wal backup runtime");
|
||||||
|
|
||||||
|
rt.block_on(async {
|
||||||
|
wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check whether wal backup is required for timeline and mark that launcher is
|
||||||
|
/// aware of current status (if timeline exists).
|
||||||
|
fn is_wal_backup_required(zttid: ZTenantTimelineId) -> bool {
|
||||||
|
if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
|
||||||
|
tli.wal_backup_attend()
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct WalBackupTaskHandle {
|
||||||
|
shutdown_tx: Sender<()>,
|
||||||
|
handle: JoinHandle<()>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
||||||
|
/// tasks. Having this in separate task simplifies locking, allows to reap
|
||||||
|
/// panics and separate elections from offloading itself.
|
||||||
|
async fn wal_backup_launcher_main_loop(
|
||||||
|
conf: SafeKeeperConf,
|
||||||
|
mut wal_backup_launcher_rx: Receiver<ZTenantTimelineId>,
|
||||||
|
) {
|
||||||
|
info!(
|
||||||
|
"WAL backup launcher: started, remote config {:?}",
|
||||||
|
conf.remote_storage
|
||||||
|
);
|
||||||
|
|
||||||
|
let conf_ = conf.clone();
|
||||||
|
REMOTE_STORAGE.get_or_init(|| {
|
||||||
|
conf_.remote_storage.as_ref().map(|c| {
|
||||||
|
GenericRemoteStorage::new(conf_.workdir, c).expect("failed to create remote storage")
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut tasks: HashMap<ZTenantTimelineId, WalBackupTaskHandle> = HashMap::new();
|
||||||
|
|
||||||
|
loop {
|
||||||
|
// channel is never expected to get closed
|
||||||
|
let zttid = wal_backup_launcher_rx.recv().await.unwrap();
|
||||||
|
let is_wal_backup_required = is_wal_backup_required(zttid);
|
||||||
|
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
|
||||||
|
continue; /* just drain the channel and do nothing */
|
||||||
|
}
|
||||||
|
// do we need to do anything at all?
|
||||||
|
if is_wal_backup_required != tasks.contains_key(&zttid) {
|
||||||
|
if is_wal_backup_required {
|
||||||
|
// need to start the task
|
||||||
|
info!("starting WAL backup task for {}", zttid);
|
||||||
|
|
||||||
|
// TODO: decide who should offload in launcher itself by simply checking current state
|
||||||
|
let election_name = broker::get_campaign_name(
|
||||||
|
BACKUP_ELECTION_NAME.to_string(),
|
||||||
|
conf.broker_etcd_prefix.clone(),
|
||||||
|
&zttid,
|
||||||
|
);
|
||||||
|
let my_candidate_name = broker::get_candiate_name(conf.my_id);
|
||||||
|
let election = broker::Election::new(
|
||||||
|
election_name,
|
||||||
|
my_candidate_name,
|
||||||
|
conf.broker_endpoints.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
||||||
|
let timeline_dir = conf.timeline_dir(&zttid);
|
||||||
|
|
||||||
|
let handle = tokio::spawn(
|
||||||
|
backup_task_main(zttid, timeline_dir, shutdown_rx, election)
|
||||||
|
.instrument(info_span!("WAL backup task", zttid = %zttid)),
|
||||||
|
);
|
||||||
|
|
||||||
|
tasks.insert(
|
||||||
|
zttid,
|
||||||
|
WalBackupTaskHandle {
|
||||||
|
shutdown_tx,
|
||||||
|
handle,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
// need to stop the task
|
||||||
|
info!("stopping WAL backup task for {}", zttid);
|
||||||
|
|
||||||
|
let wb_handle = tasks.remove(&zttid).unwrap();
|
||||||
|
// Tell the task to shutdown. Error means task exited earlier, that's ok.
|
||||||
|
let _ = wb_handle.shutdown_tx.send(()).await;
|
||||||
|
// Await the task itself. TODO: restart panicked tasks earlier.
|
||||||
|
// Hm, why can't I await on a reference to the handle?
|
||||||
|
if let Err(e) = wb_handle.handle.await {
|
||||||
|
warn!("WAL backup task for {} panicked: {}", zttid, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct WalBackupTask {
|
||||||
|
timeline: Arc<Timeline>,
|
||||||
|
timeline_dir: PathBuf,
|
||||||
|
wal_seg_size: usize,
|
||||||
|
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||||
|
leader: Option<ElectionLeader>,
|
||||||
|
election: Election,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Offload single timeline.
|
||||||
|
async fn backup_task_main(
|
||||||
|
zttid: ZTenantTimelineId,
|
||||||
|
timeline_dir: PathBuf,
|
||||||
|
mut shutdown_rx: Receiver<()>,
|
||||||
|
election: Election,
|
||||||
|
) {
|
||||||
|
info!("started");
|
||||||
|
let timeline: Arc<Timeline> = if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
|
||||||
|
tli
|
||||||
|
} else {
|
||||||
|
/* Timeline could get deleted while task was starting, just exit then. */
|
||||||
|
info!("no timeline, exiting");
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut wb = WalBackupTask {
|
||||||
|
wal_seg_size: timeline.get_wal_seg_size(),
|
||||||
|
commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(),
|
||||||
|
timeline,
|
||||||
|
timeline_dir,
|
||||||
|
leader: None,
|
||||||
|
election,
|
||||||
|
};
|
||||||
|
|
||||||
|
// task is spun up only when wal_seg_size is already initialized
|
||||||
|
assert!(wb.wal_seg_size > 0);
|
||||||
|
|
||||||
|
let mut canceled = false;
|
||||||
|
select! {
|
||||||
|
_ = wb.run() => {}
|
||||||
|
_ = shutdown_rx.recv() => {
|
||||||
|
canceled = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(l) = wb.leader {
|
||||||
|
l.give_up().await;
|
||||||
|
}
|
||||||
|
info!("task {}", if canceled { "canceled" } else { "terminated" });
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WalBackupTask {
|
||||||
|
async fn run(&mut self) {
|
||||||
|
let mut backup_lsn = Lsn(0);
|
||||||
|
|
||||||
|
// election loop
|
||||||
|
loop {
|
||||||
|
let mut retry_attempt = 0u32;
|
||||||
|
|
||||||
|
if let Some(l) = self.leader.take() {
|
||||||
|
l.give_up().await;
|
||||||
|
}
|
||||||
|
|
||||||
|
match broker::get_leader(&self.election).await {
|
||||||
|
Ok(l) => {
|
||||||
|
self.leader = Some(l);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("error during leader election {:?}", e);
|
||||||
|
sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// offload loop
|
||||||
|
loop {
|
||||||
|
if retry_attempt == 0 {
|
||||||
|
// wait for new WAL to arrive
|
||||||
|
if let Err(e) = self.commit_lsn_watch_rx.changed().await {
|
||||||
|
// should never happen, as we hold Arc to timeline.
|
||||||
|
error!("commit_lsn watch shut down: {:?}", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// or just sleep if we errored previously
|
||||||
|
let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS;
|
||||||
|
if let Some(backoff_delay) =
|
||||||
|
UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt)
|
||||||
|
{
|
||||||
|
retry_delay = min(retry_delay, backoff_delay);
|
||||||
|
}
|
||||||
|
sleep(Duration::from_millis(retry_delay)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
let commit_lsn = *self.commit_lsn_watch_rx.borrow();
|
||||||
|
|
||||||
|
// Note that backup_lsn can be higher than commit_lsn if we
|
||||||
|
// don't have much local WAL and others already uploaded
|
||||||
|
// segments we don't even have.
|
||||||
|
if backup_lsn.segment_number(self.wal_seg_size)
|
||||||
|
>= commit_lsn.segment_number(self.wal_seg_size)
|
||||||
|
{
|
||||||
|
continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
|
||||||
|
}
|
||||||
|
// Perhaps peers advanced the position, check shmem value.
|
||||||
|
backup_lsn = self.timeline.get_wal_backup_lsn();
|
||||||
|
if backup_lsn.segment_number(self.wal_seg_size)
|
||||||
|
>= commit_lsn.segment_number(self.wal_seg_size)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(l) = self.leader.as_mut() {
|
||||||
|
// Optimization idea for later:
|
||||||
|
// Avoid checking election leader every time by returning current lease grant expiration time
|
||||||
|
// Re-check leadership only after expiration time,
|
||||||
|
// such an approach would reduce overhead on write-intensive workloads
|
||||||
|
|
||||||
|
match l
|
||||||
|
.check_am_i(
|
||||||
|
self.election.election_name.clone(),
|
||||||
|
self.election.candidate_name.clone(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(leader) => {
|
||||||
|
if !leader {
|
||||||
|
info!("leader has changed");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!("error validating leader, {:?}", e);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match backup_lsn_range(
|
||||||
|
backup_lsn,
|
||||||
|
commit_lsn,
|
||||||
|
self.wal_seg_size,
|
||||||
|
&self.timeline_dir,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(backup_lsn_result) => {
|
||||||
|
backup_lsn = backup_lsn_result;
|
||||||
|
self.timeline.set_wal_backup_lsn(backup_lsn_result);
|
||||||
|
retry_attempt = 0;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!(
|
||||||
|
"failed while offloading range {}-{}: {:?}",
|
||||||
|
backup_lsn, commit_lsn, e
|
||||||
|
);
|
||||||
|
|
||||||
|
retry_attempt = min(retry_attempt + 1, u32::MAX);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn backup_lsn_range(
|
||||||
|
start_lsn: Lsn,
|
||||||
|
end_lsn: Lsn,
|
||||||
|
wal_seg_size: usize,
|
||||||
|
timeline_dir: &Path,
|
||||||
|
) -> Result<Lsn> {
|
||||||
|
let mut res = start_lsn;
|
||||||
|
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
|
||||||
|
for s in &segments {
|
||||||
|
backup_single_segment(s, timeline_dir)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("offloading segno {}", s.seg_no))?;
|
||||||
|
|
||||||
|
res = s.end_lsn;
|
||||||
|
}
|
||||||
|
info!(
|
||||||
|
"offloaded segnos {:?} up to {}, previous backup_lsn {}",
|
||||||
|
segments.iter().map(|&s| s.seg_no).collect::<Vec<_>>(),
|
||||||
|
end_lsn,
|
||||||
|
start_lsn,
|
||||||
|
);
|
||||||
|
Ok(res)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> {
|
||||||
|
let segment_file_name = seg.file_path(timeline_dir)?;
|
||||||
|
|
||||||
|
backup_object(&segment_file_name, seg.size()).await?;
|
||||||
|
debug!("Backup of {} done", segment_file_name.display());
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Copy, Clone)]
|
||||||
|
pub struct Segment {
|
||||||
|
seg_no: XLogSegNo,
|
||||||
|
start_lsn: Lsn,
|
||||||
|
end_lsn: Lsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Segment {
|
||||||
|
pub fn new(seg_no: u64, start_lsn: Lsn, end_lsn: Lsn) -> Self {
|
||||||
|
Self {
|
||||||
|
seg_no,
|
||||||
|
start_lsn,
|
||||||
|
end_lsn,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn object_name(self) -> String {
|
||||||
|
XLogFileName(PG_TLI, self.seg_no, self.size())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn file_path(self, timeline_dir: &Path) -> Result<PathBuf> {
|
||||||
|
Ok(timeline_dir.join(self.object_name()))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn size(self) -> usize {
|
||||||
|
(u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
|
||||||
|
let first_seg = start.segment_number(seg_size);
|
||||||
|
let last_seg = end.segment_number(seg_size);
|
||||||
|
|
||||||
|
let res: Vec<Segment> = (first_seg..last_seg)
|
||||||
|
.map(|s| {
|
||||||
|
let start_lsn = XLogSegNoOffsetToRecPtr(s, 0, seg_size);
|
||||||
|
let end_lsn = XLogSegNoOffsetToRecPtr(s + 1, 0, seg_size);
|
||||||
|
Segment::new(s, Lsn::from(start_lsn), Lsn::from(end_lsn))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
res
|
||||||
|
}
|
||||||
|
|
||||||
|
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
|
||||||
|
|
||||||
|
async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
|
||||||
|
let storage = REMOTE_STORAGE.get().expect("failed to get remote storage");
|
||||||
|
|
||||||
|
let file = File::open(&source_file).await?;
|
||||||
|
|
||||||
|
// Storage is initialized by launcher at this point.
|
||||||
|
match storage.as_ref().unwrap() {
|
||||||
|
GenericRemoteStorage::Local(local_storage) => {
|
||||||
|
let destination = local_storage.remote_object_id(source_file)?;
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"local upload about to start from {} to {}",
|
||||||
|
source_file.display(),
|
||||||
|
destination.display()
|
||||||
|
);
|
||||||
|
local_storage.upload(file, size, &destination, None).await
|
||||||
|
}
|
||||||
|
GenericRemoteStorage::S3(s3_storage) => {
|
||||||
|
let s3key = s3_storage.remote_object_id(source_file)?;
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"S3 upload about to start from {} to {:?}",
|
||||||
|
source_file.display(),
|
||||||
|
s3key
|
||||||
|
);
|
||||||
|
s3_storage.upload(file, size, &s3key, None).await
|
||||||
|
}
|
||||||
|
}?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
20
test_runner/batch_others/test_basebackup_error.py
Normal file
20
test_runner/batch_others/test_basebackup_error.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
import pytest
|
||||||
|
from contextlib import closing
|
||||||
|
|
||||||
|
from fixtures.zenith_fixtures import ZenithEnv
|
||||||
|
from fixtures.log_helper import log
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Test error handling, if the 'basebackup' command fails in the middle
|
||||||
|
# of building the tar archive.
|
||||||
|
#
|
||||||
|
def test_basebackup_error(zenith_simple_env: ZenithEnv):
|
||||||
|
env = zenith_simple_env
|
||||||
|
env.zenith_cli.create_branch("test_basebackup_error", "empty")
|
||||||
|
|
||||||
|
# Introduce failpoint
|
||||||
|
env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return")
|
||||||
|
|
||||||
|
with pytest.raises(Exception, match="basebackup-before-control-file"):
|
||||||
|
pg = env.postgres.create_start('test_basebackup_error')
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
import concurrent.futures
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
import os
|
import os
|
||||||
|
|
||||||
@@ -78,3 +79,37 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder):
|
|||||||
with pytest.raises(Exception, match="Cannot load local timeline") as err:
|
with pytest.raises(Exception, match="Cannot load local timeline") as err:
|
||||||
pg.start()
|
pg.start()
|
||||||
log.info(f'compute startup failed as expected: {err}')
|
log.info(f'compute startup failed as expected: {err}')
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_multiple_timelines_parallel(zenith_simple_env: ZenithEnv):
|
||||||
|
env = zenith_simple_env
|
||||||
|
|
||||||
|
tenant_id, _ = env.zenith_cli.create_tenant()
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
|
||||||
|
futures = [
|
||||||
|
executor.submit(env.zenith_cli.create_timeline,
|
||||||
|
f"test-create-multiple-timelines-{i}",
|
||||||
|
tenant_id) for i in range(4)
|
||||||
|
]
|
||||||
|
for future in futures:
|
||||||
|
future.result()
|
||||||
|
|
||||||
|
|
||||||
|
def test_fix_broken_timelines_on_startup(zenith_simple_env: ZenithEnv):
|
||||||
|
env = zenith_simple_env
|
||||||
|
|
||||||
|
tenant_id, _ = env.zenith_cli.create_tenant()
|
||||||
|
|
||||||
|
# Introduce failpoint when creating a new timeline
|
||||||
|
env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return")
|
||||||
|
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
|
||||||
|
_ = env.zenith_cli.create_timeline("test_fix_broken_timelines", tenant_id)
|
||||||
|
|
||||||
|
# Restart the page server
|
||||||
|
env.zenith_cli.pageserver_stop(immediate=True)
|
||||||
|
env.zenith_cli.pageserver_start()
|
||||||
|
|
||||||
|
# Check that the "broken" timeline is not loaded
|
||||||
|
timelines = env.zenith_cli.list_timelines(tenant_id)
|
||||||
|
assert len(timelines) == 1
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from contextlib import closing
|
|||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from multiprocessing import Process, Value
|
from multiprocessing import Process, Value
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol
|
from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol
|
||||||
from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex
|
from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from typing import List, Optional, Any
|
from typing import List, Optional, Any
|
||||||
@@ -401,7 +401,7 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder):
|
|||||||
|
|
||||||
http_cli = env.safekeepers[0].http_client()
|
http_cli = env.safekeepers[0].http_client()
|
||||||
# Pretend WAL is offloaded to s3.
|
# Pretend WAL is offloaded to s3.
|
||||||
http_cli.record_safekeeper_info(tenant_id, timeline_id, {'s3_wal_lsn': 'FFFFFFFF/FEFFFFFF'})
|
http_cli.record_safekeeper_info(tenant_id, timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'})
|
||||||
|
|
||||||
# wait till first segment is removed on all safekeepers
|
# wait till first segment is removed on all safekeepers
|
||||||
started_at = time.time()
|
started_at = time.time()
|
||||||
@@ -414,6 +414,56 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder):
|
|||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs'])
|
||||||
|
def test_wal_backup(zenith_env_builder: ZenithEnvBuilder, storage_type: str):
|
||||||
|
zenith_env_builder.num_safekeepers = 3
|
||||||
|
if storage_type == 'local_fs':
|
||||||
|
zenith_env_builder.enable_local_fs_remote_storage()
|
||||||
|
elif storage_type == 'mock_s3':
|
||||||
|
zenith_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup')
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f'Unknown storage type: {storage_type}')
|
||||||
|
zenith_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
|
||||||
|
|
||||||
|
env = zenith_env_builder.init_start()
|
||||||
|
|
||||||
|
env.zenith_cli.create_branch('test_safekeepers_wal_backup')
|
||||||
|
pg = env.postgres.create_start('test_safekeepers_wal_backup')
|
||||||
|
|
||||||
|
# learn zenith timeline from compute
|
||||||
|
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
|
||||||
|
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
|
||||||
|
|
||||||
|
pg_conn = pg.connect()
|
||||||
|
cur = pg_conn.cursor()
|
||||||
|
cur.execute('create table t(key int, value text)')
|
||||||
|
|
||||||
|
# Shut down each of the safekeepers in turn and fill a segment while sk is
|
||||||
|
# down; ensure segment gets offloaded by others.
|
||||||
|
offloaded_seg_end = ['0/2000000', '0/3000000', '0/4000000']
|
||||||
|
for victim, seg_end in zip(env.safekeepers, offloaded_seg_end):
|
||||||
|
victim.stop()
|
||||||
|
# roughly fills one segment
|
||||||
|
cur.execute("insert into t select generate_series(1,250000), 'payload'")
|
||||||
|
live_sk = [sk for sk in env.safekeepers if sk != victim][0]
|
||||||
|
http_cli = live_sk.http_client()
|
||||||
|
|
||||||
|
started_at = time.time()
|
||||||
|
while True:
|
||||||
|
tli_status = http_cli.timeline_status(tenant_id, timeline_id)
|
||||||
|
log.info(f"live sk status is {tli_status}")
|
||||||
|
|
||||||
|
if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end):
|
||||||
|
break
|
||||||
|
elapsed = time.time() - started_at
|
||||||
|
if elapsed > 20:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"timed out waiting {elapsed:.0f}s segment ending at {seg_end} get offloaded")
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
victim.start()
|
||||||
|
|
||||||
|
|
||||||
class ProposerPostgres(PgProtocol):
|
class ProposerPostgres(PgProtocol):
|
||||||
"""Object for running postgres without ZenithEnv"""
|
"""Object for running postgres without ZenithEnv"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import field
|
from dataclasses import field
|
||||||
|
from enum import Flag, auto
|
||||||
import textwrap
|
import textwrap
|
||||||
from cached_property import cached_property
|
from cached_property import cached_property
|
||||||
import asyncpg
|
import asyncpg
|
||||||
@@ -421,10 +422,51 @@ class MockS3Server:
|
|||||||
def secret_key(self) -> str:
|
def secret_key(self) -> str:
|
||||||
return 'test'
|
return 'test'
|
||||||
|
|
||||||
|
def access_env_vars(self) -> Dict[Any, Any]:
|
||||||
|
return {
|
||||||
|
'AWS_ACCESS_KEY_ID': self.access_key(),
|
||||||
|
'AWS_SECRET_ACCESS_KEY': self.secret_key(),
|
||||||
|
}
|
||||||
|
|
||||||
def kill(self):
|
def kill(self):
|
||||||
self.subprocess.kill()
|
self.subprocess.kill()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class LocalFsStorage:
|
||||||
|
local_path: Path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class S3Storage:
|
||||||
|
bucket_name: str
|
||||||
|
bucket_region: str
|
||||||
|
endpoint: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
RemoteStorage = Union[LocalFsStorage, S3Storage]
|
||||||
|
|
||||||
|
|
||||||
|
# serialize as toml inline table
|
||||||
|
def remote_storage_to_toml_inline_table(remote_storage):
|
||||||
|
if isinstance(remote_storage, LocalFsStorage):
|
||||||
|
res = f"local_path='{remote_storage.local_path}'"
|
||||||
|
elif isinstance(remote_storage, S3Storage):
|
||||||
|
res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'"
|
||||||
|
if remote_storage.endpoint is not None:
|
||||||
|
res += f", endpoint='{remote_storage.endpoint}'"
|
||||||
|
else:
|
||||||
|
raise Exception(f'Unknown storage configuration {remote_storage}')
|
||||||
|
else:
|
||||||
|
raise Exception("invalid remote storage type")
|
||||||
|
return f"{{{res}}}"
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteStorageUsers(Flag):
|
||||||
|
PAGESERVER = auto()
|
||||||
|
SAFEKEEPER = auto()
|
||||||
|
|
||||||
|
|
||||||
class ZenithEnvBuilder:
|
class ZenithEnvBuilder:
|
||||||
"""
|
"""
|
||||||
Builder object to create a Zenith runtime environment
|
Builder object to create a Zenith runtime environment
|
||||||
@@ -440,6 +482,7 @@ class ZenithEnvBuilder:
|
|||||||
broker: Etcd,
|
broker: Etcd,
|
||||||
mock_s3_server: MockS3Server,
|
mock_s3_server: MockS3Server,
|
||||||
remote_storage: Optional[RemoteStorage] = None,
|
remote_storage: Optional[RemoteStorage] = None,
|
||||||
|
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
|
||||||
pageserver_config_override: Optional[str] = None,
|
pageserver_config_override: Optional[str] = None,
|
||||||
num_safekeepers: int = 1,
|
num_safekeepers: int = 1,
|
||||||
pageserver_auth_enabled: bool = False,
|
pageserver_auth_enabled: bool = False,
|
||||||
@@ -449,6 +492,7 @@ class ZenithEnvBuilder:
|
|||||||
self.rust_log_override = rust_log_override
|
self.rust_log_override = rust_log_override
|
||||||
self.port_distributor = port_distributor
|
self.port_distributor = port_distributor
|
||||||
self.remote_storage = remote_storage
|
self.remote_storage = remote_storage
|
||||||
|
self.remote_storage_users = remote_storage_users
|
||||||
self.broker = broker
|
self.broker = broker
|
||||||
self.mock_s3_server = mock_s3_server
|
self.mock_s3_server = mock_s3_server
|
||||||
self.pageserver_config_override = pageserver_config_override
|
self.pageserver_config_override = pageserver_config_override
|
||||||
@@ -497,9 +541,9 @@ class ZenithEnvBuilder:
|
|||||||
aws_access_key_id=self.mock_s3_server.access_key(),
|
aws_access_key_id=self.mock_s3_server.access_key(),
|
||||||
aws_secret_access_key=self.mock_s3_server.secret_key(),
|
aws_secret_access_key=self.mock_s3_server.secret_key(),
|
||||||
).create_bucket(Bucket=bucket_name)
|
).create_bucket(Bucket=bucket_name)
|
||||||
self.remote_storage = S3Storage(bucket=bucket_name,
|
self.remote_storage = S3Storage(bucket_name=bucket_name,
|
||||||
endpoint=mock_endpoint,
|
endpoint=mock_endpoint,
|
||||||
region=mock_region)
|
bucket_region=mock_region)
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
return self
|
return self
|
||||||
@@ -557,6 +601,7 @@ class ZenithEnv:
|
|||||||
self.safekeepers: List[Safekeeper] = []
|
self.safekeepers: List[Safekeeper] = []
|
||||||
self.broker = config.broker
|
self.broker = config.broker
|
||||||
self.remote_storage = config.remote_storage
|
self.remote_storage = config.remote_storage
|
||||||
|
self.remote_storage_users = config.remote_storage_users
|
||||||
|
|
||||||
# generate initial tenant ID here instead of letting 'zenith init' generate it,
|
# generate initial tenant ID here instead of letting 'zenith init' generate it,
|
||||||
# so that we don't need to dig it out of the config file afterwards.
|
# so that we don't need to dig it out of the config file afterwards.
|
||||||
@@ -605,8 +650,12 @@ class ZenithEnv:
|
|||||||
id = {id}
|
id = {id}
|
||||||
pg_port = {port.pg}
|
pg_port = {port.pg}
|
||||||
http_port = {port.http}
|
http_port = {port.http}
|
||||||
sync = false # Disable fsyncs to make the tests go faster
|
sync = false # Disable fsyncs to make the tests go faster""")
|
||||||
""")
|
if bool(self.remote_storage_users
|
||||||
|
& RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None:
|
||||||
|
toml += textwrap.dedent(f"""
|
||||||
|
remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}"
|
||||||
|
""")
|
||||||
safekeeper = Safekeeper(env=self, id=id, port=port)
|
safekeeper = Safekeeper(env=self, id=id, port=port)
|
||||||
self.safekeepers.append(safekeeper)
|
self.safekeepers.append(safekeeper)
|
||||||
|
|
||||||
@@ -638,7 +687,7 @@ def _shared_simple_env(request: Any,
|
|||||||
mock_s3_server: MockS3Server,
|
mock_s3_server: MockS3Server,
|
||||||
default_broker: Etcd) -> Iterator[ZenithEnv]:
|
default_broker: Etcd) -> Iterator[ZenithEnv]:
|
||||||
"""
|
"""
|
||||||
Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES
|
# Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES
|
||||||
is set, this is shared by all tests using `zenith_simple_env`.
|
is set, this is shared by all tests using `zenith_simple_env`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -822,20 +871,6 @@ class PageserverPort:
|
|||||||
http: int
|
http: int
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class LocalFsStorage:
|
|
||||||
root: Path
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class S3Storage:
|
|
||||||
bucket: str
|
|
||||||
region: str
|
|
||||||
endpoint: Optional[str]
|
|
||||||
|
|
||||||
|
|
||||||
RemoteStorage = Union[LocalFsStorage, S3Storage]
|
|
||||||
|
|
||||||
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
|
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
|
||||||
re.MULTILINE)
|
re.MULTILINE)
|
||||||
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
|
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
|
||||||
@@ -998,6 +1033,7 @@ class ZenithCli:
|
|||||||
append_pageserver_param_overrides(
|
append_pageserver_param_overrides(
|
||||||
params_to_update=cmd,
|
params_to_update=cmd,
|
||||||
remote_storage=self.env.remote_storage,
|
remote_storage=self.env.remote_storage,
|
||||||
|
remote_storage_users=self.env.remote_storage_users,
|
||||||
pageserver_config_override=self.env.pageserver.config_override)
|
pageserver_config_override=self.env.pageserver.config_override)
|
||||||
|
|
||||||
res = self.raw_cli(cmd)
|
res = self.raw_cli(cmd)
|
||||||
@@ -1022,14 +1058,10 @@ class ZenithCli:
|
|||||||
append_pageserver_param_overrides(
|
append_pageserver_param_overrides(
|
||||||
params_to_update=start_args,
|
params_to_update=start_args,
|
||||||
remote_storage=self.env.remote_storage,
|
remote_storage=self.env.remote_storage,
|
||||||
|
remote_storage_users=self.env.remote_storage_users,
|
||||||
pageserver_config_override=self.env.pageserver.config_override)
|
pageserver_config_override=self.env.pageserver.config_override)
|
||||||
|
|
||||||
s3_env_vars = None
|
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
|
||||||
if self.env.s3_mock_server:
|
|
||||||
s3_env_vars = {
|
|
||||||
'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(),
|
|
||||||
'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(),
|
|
||||||
}
|
|
||||||
return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
|
return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
|
||||||
|
|
||||||
def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]':
|
def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]':
|
||||||
@@ -1041,7 +1073,8 @@ class ZenithCli:
|
|||||||
return self.raw_cli(cmd)
|
return self.raw_cli(cmd)
|
||||||
|
|
||||||
def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]':
|
def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]':
|
||||||
return self.raw_cli(['safekeeper', 'start', str(id)])
|
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
|
||||||
|
return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars)
|
||||||
|
|
||||||
def safekeeper_stop(self,
|
def safekeeper_stop(self,
|
||||||
id: Optional[int] = None,
|
id: Optional[int] = None,
|
||||||
@@ -1237,22 +1270,13 @@ class ZenithPageserver(PgProtocol):
|
|||||||
def append_pageserver_param_overrides(
|
def append_pageserver_param_overrides(
|
||||||
params_to_update: List[str],
|
params_to_update: List[str],
|
||||||
remote_storage: Optional[RemoteStorage],
|
remote_storage: Optional[RemoteStorage],
|
||||||
|
remote_storage_users: RemoteStorageUsers,
|
||||||
pageserver_config_override: Optional[str] = None,
|
pageserver_config_override: Optional[str] = None,
|
||||||
):
|
):
|
||||||
if remote_storage is not None:
|
if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None:
|
||||||
if isinstance(remote_storage, LocalFsStorage):
|
remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage)
|
||||||
pageserver_storage_override = f"local_path='{remote_storage.root}'"
|
|
||||||
elif isinstance(remote_storage, S3Storage):
|
|
||||||
pageserver_storage_override = f"bucket_name='{remote_storage.bucket}',\
|
|
||||||
bucket_region='{remote_storage.region}'"
|
|
||||||
|
|
||||||
if remote_storage.endpoint is not None:
|
|
||||||
pageserver_storage_override += f",endpoint='{remote_storage.endpoint}'"
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise Exception(f'Unknown storage configuration {remote_storage}')
|
|
||||||
params_to_update.append(
|
params_to_update.append(
|
||||||
f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}')
|
f'--pageserver-config-override=remote_storage={remote_storage_toml_table}')
|
||||||
|
|
||||||
env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES')
|
env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES')
|
||||||
if env_overrides is not None:
|
if env_overrides is not None:
|
||||||
@@ -1786,8 +1810,9 @@ class Safekeeper:
|
|||||||
class SafekeeperTimelineStatus:
|
class SafekeeperTimelineStatus:
|
||||||
acceptor_epoch: int
|
acceptor_epoch: int
|
||||||
flush_lsn: str
|
flush_lsn: str
|
||||||
remote_consistent_lsn: str
|
|
||||||
timeline_start_lsn: str
|
timeline_start_lsn: str
|
||||||
|
backup_lsn: str
|
||||||
|
remote_consistent_lsn: str
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -1812,8 +1837,9 @@ class SafekeeperHttpClient(requests.Session):
|
|||||||
resj = res.json()
|
resj = res.json()
|
||||||
return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'],
|
return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'],
|
||||||
flush_lsn=resj['flush_lsn'],
|
flush_lsn=resj['flush_lsn'],
|
||||||
remote_consistent_lsn=resj['remote_consistent_lsn'],
|
timeline_start_lsn=resj['timeline_start_lsn'],
|
||||||
timeline_start_lsn=resj['timeline_start_lsn'])
|
backup_lsn=resj['backup_lsn'],
|
||||||
|
remote_consistent_lsn=resj['remote_consistent_lsn'])
|
||||||
|
|
||||||
def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body):
|
def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body):
|
||||||
res = self.post(
|
res = self.post(
|
||||||
@@ -1893,7 +1919,11 @@ class Etcd:
|
|||||||
f"--data-dir={self.datadir}",
|
f"--data-dir={self.datadir}",
|
||||||
f"--listen-client-urls={client_url}",
|
f"--listen-client-urls={client_url}",
|
||||||
f"--advertise-client-urls={client_url}",
|
f"--advertise-client-urls={client_url}",
|
||||||
f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}"
|
f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}",
|
||||||
|
# Set --quota-backend-bytes to keep the etcd virtual memory
|
||||||
|
# size smaller. Our test etcd clusters are very small.
|
||||||
|
# See https://github.com/etcd-io/etcd/issues/7910
|
||||||
|
f"--quota-backend-bytes=100000000"
|
||||||
]
|
]
|
||||||
self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file)
|
self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file)
|
||||||
|
|
||||||
|
|||||||
24
test_runner/performance/test_compression.py
Normal file
24
test_runner/performance/test_compression.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# Test sequential scan speed
|
||||||
|
#
|
||||||
|
from contextlib import closing
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from fixtures.zenith_fixtures import ZenithEnv
|
||||||
|
from fixtures.log_helper import log
|
||||||
|
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
|
||||||
|
from fixtures.compare_fixtures import PgCompare
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('rows', [pytest.param(10000000)])
|
||||||
|
def test_compression(zenith_with_baseline: PgCompare, rows: int):
|
||||||
|
env = zenith_with_baseline
|
||||||
|
|
||||||
|
with closing(env.pg.connect()) as conn:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
with env.record_duration('insert'):
|
||||||
|
cur.execute(
|
||||||
|
f'create table t as select generate_series(1,{rows}) as pk,(random()*10)::bigint as r10,(random()*100)::bigint as r100,(random()*1000)::bigint as r1000,(random()*10000)::bigint as r10000'
|
||||||
|
)
|
||||||
|
cur.execute("vacuum t")
|
||||||
|
with env.record_duration('select'):
|
||||||
|
cur.execute('select sum(r100) from t')
|
||||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: 79af2faf08...038b2b98e5
Reference in New Issue
Block a user