Compare commits


3 Commits

Author           SHA1        Message                       Date
Bojan Serafimov  688f68ecba  Undo whitespace               2022-08-04 09:43:27 +02:00
Bojan Serafimov  fb2ffac8b9  Ignore metrics static         2022-08-04 09:42:27 +02:00
Bojan Serafimov  8173e36a1b  Find all problematic statics  2022-08-04 09:30:22 +02:00
89 changed files with 833 additions and 2223 deletions


@@ -27,26 +27,6 @@ inputs:
description: 'Whether to upload the performance report'
required: false
default: 'false'
run_with_real_s3:
description: 'Whether to pass real s3 credentials to the test suite'
required: false
default: 'false'
real_s3_bucket:
description: 'Bucket name for real s3 tests'
required: false
default: ''
real_s3_region:
description: 'Region name for real s3 tests'
required: false
default: ''
real_s3_access_key_id:
description: 'Access key id'
required: false
default: ''
real_s3_secret_access_key:
description: 'Secret access key'
required: false
default: ''
runs:
using: "composite"
@@ -83,9 +63,6 @@ runs:
# this variable will be embedded in perf test report
# and is needed to distinguish different environments
PLATFORM: github-actions-selfhosted
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
shell: bash -euxo pipefail {0}
run: |
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
@@ -100,14 +77,6 @@ runs:
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
echo "REAL S3 ENABLED"
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }}
export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }}
fi
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
mkdir -p "$PERF_REPORT_DIR"


@@ -35,16 +35,6 @@ jobs:
GIT_VERSION: ${{ github.sha }}
steps:
- name: Fix git ownerwhip
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Checkout
uses: actions/checkout@v3
with:
@@ -219,11 +209,7 @@ jobs:
build_type: ${{ matrix.build_type }}
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: batch_others
run_with_real_s3: true
real_s3_bucket: ci-tests-s3
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data


@@ -11,15 +11,17 @@ than it was before.
## Submitting changes
1. Get at least one +1 on your PR before you push.
1. Make a PR for every change.
Even seemingly trivial patches can break things in surprising ways.
Use of common sense is OK. If you're only fixing a typo in a comment,
it's probably fine to just push it. But if in doubt, open a PR.
2. Get at least one +1 on your PR before you push.
For simple patches, it will only take a minute for someone to review
it.
2. Don't force push small changes after making the PR ready for review.
Doing so will force readers to re-read your entire PR, which will delay
the review process.
3. Always keep the CI green.
Do not push, if the CI failed on your PR. Even if you think it's not

Cargo.lock (generated)

@@ -154,9 +154,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "axum"
version = "0.5.13"
version = "0.5.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b9496f0c1d1afb7a2af4338bbe1d969cddfead41d87a9fb3aaa6d0bbc7af648"
checksum = "d16705af05732b7d3258ec0f7b73c03a658a28925e050d8852d5b568ee8bcf4e"
dependencies = [
"async-trait",
"axum-core",
@@ -317,6 +317,15 @@ dependencies = [
"serde",
]
[[package]]
name = "cast"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a"
dependencies = [
"rustc_version",
]
[[package]]
name = "cast"
version = "0.3.0"
@@ -495,8 +504,8 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"lazy_static",
"nix",
"once_cell",
"pageserver",
"postgres",
"regex",
@@ -570,7 +579,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f"
dependencies = [
"atty",
"cast",
"cast 0.3.0",
"clap 2.34.0",
"criterion-plot",
"csv",
@@ -591,11 +600,11 @@ dependencies = [
[[package]]
name = "criterion-plot"
version = "0.4.5"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876"
checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57"
dependencies = [
"cast",
"cast 0.2.7",
"itertools",
]
@@ -671,9 +680,9 @@ dependencies = [
[[package]]
name = "crypto-common"
version = "0.1.6"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
checksum = "2ccfd8c0ee4cce11e45b3fd6f9d5e69e0cc62912aa6a0cb1bf4617b0eba5a12f"
dependencies = [
"generic-array",
"typenum",
@@ -1107,9 +1116,9 @@ dependencies = [
[[package]]
name = "gimli"
version = "0.26.2"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d"
checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4"
[[package]]
name = "git-version"
@@ -1175,9 +1184,9 @@ dependencies = [
[[package]]
name = "hashbrown"
version = "0.12.3"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
checksum = "607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022"
[[package]]
name = "heck"
@@ -1379,7 +1388,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
dependencies = [
"autocfg",
"hashbrown 0.12.3",
"hashbrown 0.12.2",
]
[[package]]
@@ -1591,8 +1600,8 @@ dependencies = [
name = "metrics"
version = "0.1.0"
dependencies = [
"lazy_static",
"libc",
"once_cell",
"prometheus",
"workspace_hack",
]
@@ -1842,9 +1851,9 @@ dependencies = [
[[package]]
name = "os_str_bytes"
version = "6.2.0"
version = "6.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4"
checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa"
[[package]]
name = "pageserver"
@@ -1870,6 +1879,7 @@ dependencies = [
"humantime-serde",
"hyper",
"itertools",
"lazy_static",
"metrics",
"nix",
"once_cell",
@@ -2115,9 +2125,9 @@ dependencies = [
"crc32c",
"env_logger",
"hex",
"lazy_static",
"log",
"memoffset",
"once_cell",
"postgres",
"rand",
"regex",
@@ -2277,9 +2287,9 @@ dependencies = [
"hex",
"hmac 0.12.1",
"hyper",
"lazy_static",
"md5",
"metrics",
"once_cell",
"parking_lot 0.12.1",
"pin-project-lite",
"rand",
@@ -2725,9 +2735,9 @@ dependencies = [
[[package]]
name = "rustversion"
version = "1.0.8"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8"
checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf"
[[package]]
name = "ryu"
@@ -2753,6 +2763,7 @@ dependencies = [
"hex",
"humantime",
"hyper",
"lazy_static",
"metrics",
"once_cell",
"postgres",
@@ -3606,9 +3617,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"
[[package]]
name = "unicode-ident"
version = "1.0.2"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7"
checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c"
[[package]]
name = "unicode-normalization"
@@ -3669,9 +3680,9 @@ dependencies = [
"hex-literal",
"hyper",
"jsonwebtoken",
"lazy_static",
"metrics",
"nix",
"once_cell",
"pin-project-lite",
"postgres",
"postgres-protocol",


@@ -9,7 +9,7 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8
serde = { version = "1.0", features = ["derive"] }
serde_with = "1.12.0"
toml = "0.5"
once_cell = "1.13.0"
lazy_static = "1.4"
regex = "1"
anyhow = "1.0"
thiserror = "1"


@@ -30,14 +30,14 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_stdout_file =
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
format!(
"Failed to create etcd stout file in directory {}",
"Failed to create ectd stout file in directory {}",
etcd_data_dir.display()
)
})?;
let etcd_stderr_file =
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
format!(
"Failed to create etcd stderr file in directory {}",
"Failed to create ectd stderr file in directory {}",
etcd_data_dir.display()
)
})?;


@@ -51,11 +51,7 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}


@@ -5,7 +5,7 @@
/// enough to extract a few settings we need in Zenith, assuming you don't do
/// funny stuff like include-directives or funny escaping.
use anyhow::{bail, Context, Result};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
@@ -19,7 +19,9 @@ pub struct PostgresConf {
hash: HashMap<String, String>,
}
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
lazy_static! {
static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
}
impl PostgresConf {
pub fn new() -> PostgresConf {
@@ -137,10 +139,10 @@ fn escape_str(s: &str) -> String {
//
// This regex is a bit more conservative than the rules in guc-file.l, so we quote some
// strings that PostgreSQL would accept without quoting, but that's OK.
static UNQUOTED_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap());
lazy_static! {
static ref UNQUOTED_RE: Regex =
Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
}
if UNQUOTED_RE.is_match(s) {
s.to_string()
} else {

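Both spellings of `CONF_LINE_RE` above compile the same pattern. As a minimal, self-contained illustration (the `once_cell` spelling, with a hypothetical `parse_conf_line` helper that is not part of the diff), this is how one `postgresql.conf` line is split into key and value:

```rust
use once_cell::sync::Lazy;
use regex::Regex;

// Same pattern as CONF_LINE_RE above: `key = value`, where the key may contain dots.
static CONF_LINE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());

/// Split one configuration line into (key, value); lines that don't match are skipped.
fn parse_conf_line(line: &str) -> Option<(&str, &str)> {
    let caps = CONF_LINE_RE.captures(line)?;
    Some((caps.get(1)?.as_str(), caps.get(2)?.as_str()))
}

fn main() {
    assert_eq!(
        parse_conf_line("shared_buffers=128MB"),
        Some(("shared_buffers", "128MB"))
    );
    assert_eq!(parse_conf_line("# just a comment"), None);
}
```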

@@ -247,7 +247,7 @@ impl SafekeeperNode {
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
let mut tcp_stopped = false;
for i in 0..600 {
for _ in 0..100 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
@@ -272,11 +272,9 @@ impl SafekeeperNode {
}
}
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1));
}
bail!("Failed to stop safekeeper with pid {}", pid);


@@ -318,7 +318,7 @@ impl PageServerNode {
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
let mut tcp_stopped = false;
for i in 0..600 {
for _ in 0..100 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
@@ -344,11 +344,9 @@ impl PageServerNode {
}
}
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1));
}
bail!("Failed to stop pageserver with pid {}", pid);
@@ -401,7 +399,6 @@ impl PageServerNode {
.get("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()?,
checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.get("compaction_target_size")
.map(|x| x.parse::<u64>())
@@ -456,7 +453,6 @@ impl PageServerNode {
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.get("compaction_target_size")
.map(|x| x.parse::<u64>())

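Both stop loops above (safekeeper and pageserver) poll the daemon's TCP port until shutdown completes; the two sides of the hunks differ only in granularity, 600 iterations of 100 ms printing a dot every second versus 100 iterations of one second. A stripped-down sketch of the finer-grained variant, with the real TCP and process checks replaced by a stub:

```rust
use std::io::{self, Write};
use std::thread;
use std::time::Duration;

/// Stand-in for the real check (TcpStream::connect plus a liveness probe in the diff).
fn still_running() -> bool {
    false
}

fn wait_for_shutdown() -> bool {
    for i in 0..600 {
        if !still_running() {
            return true;
        }
        // One progress dot per second, i.e. every 10th 100 ms tick.
        if i % 10 == 0 {
            print!(".");
            io::stdout().flush().unwrap();
        }
        thread::sleep(Duration::from_millis(100));
    }
    false
}

fn main() {
    if wait_for_shutdown() {
        println!("stopped");
    } else {
        eprintln!("timed out after 60 s");
    }
}
```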

@@ -1,8 +1,6 @@
#!/bin/sh
set -eux
pageserver_id_param="${NODE_ID:-10}"
broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
if [ "$broker_endpoints_param" != "absent" ]; then
broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
@@ -10,12 +8,10 @@ else
broker_endpoints_param=''
fi
remote_storage_param="${REMOTE_STORAGE:-}"
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/tenants" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
fi
echo "Staring pageserver at 0.0.0.0:6400"
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data


@@ -52,8 +52,10 @@
- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [settings.md](./settings.md)
#FIXME: move these under sourcetree.md
#- [pageserver/README.md](/pageserver/README.md)
#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md)
#- [test_runner/README.md](/test_runner/README.md)
#- [safekeeper/README.md](/safekeeper/README.md)
# RFCs


@@ -75,7 +75,7 @@ layer's Segment and range of LSNs.
There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more.
are immutable. See pageserver/src/layered_repository/README.md for more.
### Layer file (on-disk layer)
@@ -111,7 +111,7 @@ PostgreSQL LSNs and functions to monitor them:
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information.
Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
* `RestartLSN`: position in WAL confirmed by all safekeepers.
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.


@@ -68,6 +68,8 @@ There are the following implementations present:
* local filesystem — to use in tests mainly
* AWS S3 - to use in production
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md).
The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples:
@@ -116,7 +118,7 @@ implemented by the LayeredRepository object in
`layered_repository.rs`. There is only that one implementation of the
Repository trait, but it's still a useful abstraction that keeps the
interface for the low-level storage functionality clean. The layered
storage format is described in [pageserver-storage.md](./pageserver-storage.md).
storage format is described in layered_repository/README.md.
Each repository consists of multiple Timelines. Timeline is a
workhorse that accepts page changes from the WAL, and serves


@@ -15,7 +15,7 @@ listen_pg_addr = '127.0.0.1:64000'
listen_http_addr = '127.0.0.1:9898'
checkpoint_distance = '268435456' # in bytes
checkpoint_timeout = '10m'
checkpoint_period = '1 s'
gc_period = '100 s'
gc_horizon = '67108864'
@@ -46,7 +46,7 @@ Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#ta
All values can be passed as an argument to the pageserver binary, using the `-c` parameter and specified as a valid TOML string. All tables should be passed in the inline form.
Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage={local_path='/some/local/path/'}"`
Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage={local_path='/some/local/path/'}"`
Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
@@ -82,14 +82,6 @@ S3.
The unit is # of bytes.
#### checkpoint_timeout
Apart from `checkpoint_distance`, open layer flushing is also triggered
`checkpoint_timeout` after the last flush. This makes WAL eventually uploaded to
s3 when activity is stopped.
The default is 10m.
#### compaction_period
Every `compaction_period` seconds, the page server checks if


@@ -28,7 +28,7 @@ The pageserver has a few different duties:
- Receive WAL from the WAL service and decode it.
- Replay WAL that's applicable to the chunks that the Page Server maintains
For more detailed info, see [pageserver-services.md](./pageserver-services.md)
For more detailed info, see [/pageserver/README](/pageserver/README.md)
`/proxy`:
@@ -57,7 +57,7 @@ PostgreSQL extension that contains functions needed for testing and debugging.
The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
It acts as a holding area and redistribution center for recently generated WAL.
For more detailed info, see [walservice.md](./walservice.md)
For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
`/workspace_hack`:
The workspace_hack crate exists only to pin down some dependencies.


@@ -75,8 +75,8 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only
one primary node can be actively streaming WAL to the quorum of
safekeepers.
See [this section](safekeeper-protocol.md) for a more detailed description of
the consensus protocol. spec/ contains TLA+ specification of it.
See README_PROTO.md for a more detailed description of the consensus
protocol. spec/ contains TLA+ specification of it.
# Q&A


@@ -9,7 +9,7 @@
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "1.12.0"
once_cell = "1.13.0"
once_cell = "1.8.0"
utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }


@@ -6,5 +6,5 @@ edition = "2021"
[dependencies]
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
libc = "0.2"
once_cell = "1.13.0"
lazy_static = "1.4"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }


@@ -2,7 +2,7 @@
//! make sure that we use the same dep version everywhere.
//! Otherwise, we might not see all metrics registered via
//! a default registry.
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;
pub use prometheus::register;
@@ -41,22 +41,19 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
prometheus::gather()
}
static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
lazy_static! {
static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
"libmetrics_disk_io_bytes_total",
"Bytes written and read from disk, grouped by the operation (read|write)",
&["io_operation"]
)
.expect("Failed to register disk i/o bytes int gauge vec")
});
static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
.expect("Failed to register disk i/o bytes int gauge vec");
static ref MAXRSS_KB: IntGauge = register_int_gauge!(
"libmetrics_maxrss_kb",
"Memory usage (Maximum Resident Set Size)"
)
.expect("Failed to register maxrss_kb int gauge")
});
.expect("Failed to register maxrss_kb int gauge");
}
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,

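Most of this compare is the mechanical conversion shown in the hunk above. A minimal, self-contained sketch of the two equivalent spellings, using a hypothetical counter rather than the gauges registered in `metrics`:

```rust
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use prometheus::{register_int_counter, IntCounter};

// Spelling 1: `lazy_static!` generates a hidden type that derefs to IntCounter.
lazy_static! {
    static ref REQUESTS_A: IntCounter =
        register_int_counter!("requests_a_total", "Requests, lazy_static spelling")
            .expect("failed to define a metric");
}

// Spelling 2: `once_cell::sync::Lazy` is an ordinary generic type; the closure
// runs on first access, just like the lazy_static version.
static REQUESTS_B: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!("requests_b_total", "Requests, once_cell spelling")
        .expect("failed to define a metric")
});

fn main() {
    REQUESTS_A.inc();
    REQUESTS_B.inc();
    println!("{} {}", REQUESTS_A.get(), REQUESTS_B.get());
}
```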

@@ -10,13 +10,13 @@ use std::io::{Read, Result, Write};
/// # use std::io::{Result, Read};
/// # use metrics::{register_int_counter, IntCounter};
/// # use metrics::CountedReader;
/// # use once_cell::sync::Lazy;
/// #
/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
/// # lazy_static::lazy_static! {
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
/// # "int_counter",
/// # "let's count something!"
/// # ).unwrap()
/// # });
/// # ).unwrap();
/// # }
/// #
/// fn do_some_reads(stream: impl Read, count: usize) -> Result<Vec<u8>> {
/// let mut reader = CountedReader::new(stream, |cnt| {
@@ -85,13 +85,13 @@ impl<T: Read> Read for CountedReader<'_, T> {
/// # use std::io::{Result, Write};
/// # use metrics::{register_int_counter, IntCounter};
/// # use metrics::CountedWriter;
/// # use once_cell::sync::Lazy;
/// #
/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
/// # lazy_static::lazy_static! {
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
/// # "int_counter",
/// # "let's count something!"
/// # ).unwrap()
/// # });
/// # ).unwrap();
/// # }
/// #
/// fn do_some_writes(stream: impl Write, payload: &[u8]) -> Result<()> {
/// let mut writer = CountedWriter::new(stream, |cnt| {


@@ -12,7 +12,7 @@ byteorder = "1.4.3"
anyhow = "1.0"
crc32c = "0.6.0"
hex = "0.4.3"
once_cell = "1.13.0"
lazy_static = "1.4"
log = "0.4.14"
memoffset = "0.6.2"
thiserror = "1.0"


@@ -2,7 +2,7 @@
//! Common utilities for dealing with PostgreSQL relation files.
//!
use crate::pg_constants;
use once_cell::sync::OnceCell;
use lazy_static::lazy_static;
use regex::Regex;
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
@@ -54,14 +54,11 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
///
pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
static RELFILE_RE: OnceCell<Regex> = OnceCell::new();
RELFILE_RE.get_or_init(|| {
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap()
});
lazy_static! {
static ref RELFILE_RE: Regex =
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
}
let caps = RELFILE_RE
.get()
.unwrap()
.captures(fname)
.ok_or(FilePathError::InvalidFileName)?;


@@ -13,30 +13,24 @@ use super::xlog_utils::*;
use super::XLogLongPageHeaderData;
use super::XLogPageHeaderData;
use super::XLogRecord;
use super::XLOG_PAGE_MAGIC;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::*;
use log::*;
use std::cmp::min;
use std::num::NonZeroU32;
use thiserror::Error;
use utils::lsn::Lsn;
enum State {
WaitingForRecord,
ReassemblingRecord {
recordbuf: BytesMut,
contlen: NonZeroU32,
},
SkippingEverything {
skip_until_lsn: Lsn,
},
}
pub struct WalStreamDecoder {
lsn: Lsn,
startlsn: Lsn, // LSN where this record starts
contlen: u32,
padlen: u32,
inputbuf: BytesMut,
state: State,
/// buffer used to reassemble records that cross page boundaries.
recordbuf: BytesMut,
}
#[derive(Error, Debug, Clone)]
@@ -54,8 +48,13 @@ impl WalStreamDecoder {
pub fn new(lsn: Lsn) -> WalStreamDecoder {
WalStreamDecoder {
lsn,
startlsn: Lsn(0),
contlen: 0,
padlen: 0,
inputbuf: BytesMut::new(),
state: State::WaitingForRecord,
recordbuf: BytesMut::new(),
}
}
@@ -68,58 +67,6 @@ impl WalStreamDecoder {
self.inputbuf.extend_from_slice(buf);
}
fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> {
let validate_impl = || {
if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 {
return Err(format!(
"invalid xlog page header: xlp_magic={}, expected {}",
hdr.xlp_magic, XLOG_PAGE_MAGIC
));
}
if hdr.xlp_pageaddr != self.lsn.0 {
return Err(format!(
"invalid xlog page header: xlp_pageaddr={}, expected {}",
hdr.xlp_pageaddr, self.lsn
));
}
match self.state {
State::WaitingForRecord => {
if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 {
return Err(
"invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(),
);
}
if hdr.xlp_rem_len != 0 {
return Err(format!(
"invalid xlog page header: xlp_rem_len={}, but it's not a contrecord",
hdr.xlp_rem_len
));
}
}
State::ReassemblingRecord { contlen, .. } => {
if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 {
return Err(
"invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found"
.into(),
);
}
if hdr.xlp_rem_len != contlen.get() {
return Err(format!(
"invalid xlog page header: xlp_rem_len={}, expected {}",
hdr.xlp_rem_len,
contlen.get()
));
}
}
State::SkippingEverything { .. } => {
panic!("Should not be validating page header in the SkippingEverything state");
}
};
Ok(())
};
validate_impl().map_err(|msg| WalDecodeError { msg, lsn: self.lsn })
}
/// Attempt to decode another WAL record from the input that has been fed to the
/// decoder so far.
///
@@ -129,121 +76,128 @@ impl WalStreamDecoder {
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
///
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
let recordbuf;
// Run state machine that validates page headers, and reassembles records
// that cross page boundaries.
loop {
// parse and verify page boundaries as we go
// However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason.
match self.state {
State::WaitingForRecord | State::ReassemblingRecord { .. } => {
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
// parse long header
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
return Ok(None);
}
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(
|e| WalDecodeError {
msg: format!("long header deserialization failed {}", e),
lsn: self.lsn,
},
)?;
self.validate_page_header(&hdr.std)?;
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
} else if self.lsn.block_offset() == 0 {
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
return Ok(None);
}
let hdr =
XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
WalDecodeError {
msg: format!("header deserialization failed {}", e),
lsn: self.lsn,
}
})?;
self.validate_page_header(&hdr)?;
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
}
if self.padlen > 0 {
// We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record.
if self.inputbuf.remaining() < self.padlen as usize {
return Ok(None);
}
State::SkippingEverything { .. } => {}
}
match &mut self.state {
State::WaitingForRecord => {
// need to have at least the xl_tot_len field
if self.inputbuf.remaining() < 4 {
return Ok(None);
}
// peek xl_tot_len at the beginning of the record.
// FIXME: assumes little-endian
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
return Err(WalDecodeError {
msg: format!("invalid xl_tot_len {}", xl_tot_len),
lsn: self.lsn,
});
}
// Fast path for the common case that the whole record fits on the page.
let pageleft = self.lsn.remaining_in_block() as u32;
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
self.lsn += xl_tot_len as u64;
let recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
return Ok(Some(self.complete_record(recordbuf)?));
} else {
// Need to assemble the record from pieces. Remember the size of the
// record, and loop back. On next iteration, we will reach the 'else'
// branch below, and copy the part of the record that was on this page
// to 'recordbuf'. Subsequent iterations will skip page headers, and
// append the continuations from the next pages to 'recordbuf'.
self.state = State::ReassemblingRecord {
recordbuf: BytesMut::with_capacity(xl_tot_len as usize),
contlen: NonZeroU32::new(xl_tot_len).unwrap(),
}
}
// skip padding
self.inputbuf.advance(self.padlen as usize);
self.lsn += self.padlen as u64;
self.padlen = 0;
} else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
// parse long header
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
return Ok(None);
}
State::ReassemblingRecord { recordbuf, contlen } => {
// we're continuing a record, possibly from previous page.
let pageleft = self.lsn.remaining_in_block() as u32;
// read the rest of the record, or as much as fits on this page.
let n = min(contlen.get(), pageleft) as usize;
if self.inputbuf.remaining() < n {
return Ok(None);
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
WalDecodeError {
msg: format!("long header deserialization failed {}", e),
lsn: self.lsn,
}
})?;
recordbuf.put(self.inputbuf.split_to(n));
self.lsn += n as u64;
*contlen = match NonZeroU32::new(contlen.get() - n as u32) {
Some(x) => x,
None => {
// The record is now complete.
let recordbuf = std::mem::replace(recordbuf, BytesMut::new()).freeze();
return Ok(Some(self.complete_record(recordbuf)?));
}
}
if hdr.std.xlp_pageaddr != self.lsn.0 {
return Err(WalDecodeError {
msg: "invalid xlog segment header".into(),
lsn: self.lsn,
});
}
State::SkippingEverything { skip_until_lsn } => {
assert!(*skip_until_lsn >= self.lsn);
let n = skip_until_lsn.0 - self.lsn.0;
if self.inputbuf.remaining() < n as usize {
return Ok(None);
}
self.inputbuf.advance(n as usize);
self.lsn += n;
self.state = State::WaitingForRecord;
// TODO: verify the remaining fields in the header
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
continue;
} else if self.lsn.block_offset() == 0 {
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
return Ok(None);
}
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
WalDecodeError {
msg: format!("header deserialization failed {}", e),
lsn: self.lsn,
}
})?;
if hdr.xlp_pageaddr != self.lsn.0 {
return Err(WalDecodeError {
msg: "invalid xlog page header".into(),
lsn: self.lsn,
});
}
// TODO: verify the remaining fields in the header
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
continue;
} else if self.contlen == 0 {
assert!(self.recordbuf.is_empty());
// need to have at least the xl_tot_len field
if self.inputbuf.remaining() < 4 {
return Ok(None);
}
// peek xl_tot_len at the beginning of the record.
// FIXME: assumes little-endian
self.startlsn = self.lsn;
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
return Err(WalDecodeError {
msg: format!("invalid xl_tot_len {}", xl_tot_len),
lsn: self.lsn,
});
}
// Fast path for the common case that the whole record fits on the page.
let pageleft = self.lsn.remaining_in_block() as u32;
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
// Take the record from the 'inputbuf', and validate it.
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
self.lsn += xl_tot_len as u64;
break;
} else {
// Need to assemble the record from pieces. Remember the size of the
// record, and loop back. On next iteration, we will reach the 'else'
// branch below, and copy the part of the record that was on this page
// to 'recordbuf'. Subsequent iterations will skip page headers, and
// append the continuations from the next pages to 'recordbuf'.
self.recordbuf.reserve(xl_tot_len as usize);
self.contlen = xl_tot_len;
continue;
}
} else {
// we're continuing a record, possibly from previous page.
let pageleft = self.lsn.remaining_in_block() as u32;
// read the rest of the record, or as much as fits on this page.
let n = min(self.contlen, pageleft) as usize;
if self.inputbuf.remaining() < n {
return Ok(None);
}
self.recordbuf.put(self.inputbuf.split_to(n));
self.lsn += n as u64;
self.contlen -= n as u32;
if self.contlen == 0 {
// The record is now complete.
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
break;
}
continue;
}
}
}
fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError> {
// We now have a record in the 'recordbuf' local variable.
let xlogrec =
XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
@@ -265,20 +219,18 @@ impl WalStreamDecoder {
// XLOG_SWITCH records are special. If we see one, we need to skip
// to the next WAL segment.
let next_lsn = if xlogrec.is_xlog_switch_record() {
if xlogrec.is_xlog_switch_record() {
trace!("saw xlog switch record at {}", self.lsn);
self.lsn + self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64)
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
} else {
// Pad to an 8-byte boundary
self.lsn.align()
};
self.state = State::SkippingEverything {
skip_until_lsn: next_lsn,
};
self.padlen = self.lsn.calc_padding(8u32) as u32;
}
// We should return LSN of the next record, not the last byte of this record or
// the byte immediately after. Note that this handles both XLOG_SWITCH and usual
// records, the former "spans" until the next WAL segment (see test_xlog_switch).
Ok((next_lsn, recordbuf))
let result = (self.lsn + self.padlen as u64, recordbuf);
Ok(Some(result))
}
}

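The enum-based side of this hunk models the decoder as an explicit state machine (`WaitingForRecord` → `ReassemblingRecord` → back, with `SkippingEverything` for padding and XLOG_SWITCH); the other side keeps `contlen`/`padlen`/`recordbuf` fields. A heavily reduced sketch of the enum shape, with page-header parsing, validation, and LSN tracking elided:

```rust
use bytes::{Buf, BufMut, Bytes, BytesMut};
use std::num::NonZeroU32;

enum State {
    WaitingForRecord,
    ReassemblingRecord { recordbuf: BytesMut, contlen: NonZeroU32 },
    // In the real decoder this carries a skip_until_lsn for padding / XLOG_SWITCH.
    SkippingEverything,
}

struct Decoder {
    state: State,
    inputbuf: BytesMut,
}

impl Decoder {
    /// Try to pull one complete length-prefixed record out of `inputbuf`;
    /// returns None if more input is needed.
    fn poll(&mut self) -> Option<Bytes> {
        loop {
            match &mut self.state {
                State::WaitingForRecord => {
                    if self.inputbuf.remaining() < 4 {
                        return None;
                    }
                    // Peek the total record length (xl_tot_len in the real decoder);
                    // the length prefix is part of the record and is not consumed here.
                    let tot_len = (&self.inputbuf[0..4]).get_u32_le();
                    let contlen = NonZeroU32::new(tot_len)?; // real code rejects bad lengths
                    self.state = State::ReassemblingRecord {
                        recordbuf: BytesMut::with_capacity(tot_len as usize),
                        contlen,
                    };
                }
                State::ReassemblingRecord { recordbuf, contlen } => {
                    let n = (contlen.get() as usize).min(self.inputbuf.remaining());
                    if n == 0 {
                        return None;
                    }
                    recordbuf.put(self.inputbuf.split_to(n));
                    match NonZeroU32::new(contlen.get() - n as u32) {
                        Some(rest) => *contlen = rest,
                        None => {
                            // Record complete: hand it out and go back to waiting.
                            let rec = std::mem::take(recordbuf).freeze();
                            self.state = State::WaitingForRecord;
                            return Some(rec);
                        }
                    }
                }
                State::SkippingEverything => {
                    // The real decoder advances its LSN up to skip_until_lsn here.
                    self.state = State::WaitingForRecord;
                }
            }
        }
    }
}

fn main() {
    let mut d = Decoder { state: State::WaitingForRecord, inputbuf: BytesMut::new() };
    // An 8-byte "record": 4-byte little-endian length prefix plus 4 payload bytes, fed in two chunks.
    d.inputbuf.extend_from_slice(&8u32.to_le_bytes());
    assert!(d.poll().is_none()); // only half of the record has arrived
    d.inputbuf.extend_from_slice(b"data");
    let rec = d.poll().expect("complete record");
    assert_eq!(rec.len(), 8);
}
```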

@@ -16,7 +16,7 @@ use crate::XLogRecord;
use crate::XLOG_PAGE_MAGIC;
use crate::pg_constants::WAL_SEGMENT_SIZE;
use anyhow::{anyhow, bail, ensure};
use anyhow::{bail, ensure};
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use bytes::{Buf, Bytes};
@@ -159,7 +159,7 @@ fn find_end_of_wal_segment(
let mut buf = [0u8; XLOG_BLCKSZ];
let file_name = XLogFileName(tli, segno, wal_seg_size);
let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?;
let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
file.seek(SeekFrom::Start(offs as u64))?;
// xl_crc is the last field in XLogRecord, will not be read into rec_hdr
const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
@@ -396,13 +396,10 @@ pub fn find_end_of_wal(
let mut high_tli: TimeLineID = 0;
let mut high_ispartial = false;
for entry in fs::read_dir(data_dir)?.flatten() {
for entry in fs::read_dir(data_dir).unwrap().flatten() {
let ispartial: bool;
let entry_name = entry.file_name();
let fname = entry_name
.to_str()
.ok_or_else(|| anyhow!("Invalid file name"))?;
let fname = entry_name.to_str().unwrap();
/*
* Check if the filename looks like an xlog file, or a .partial file.
*/
@@ -414,7 +411,7 @@ pub fn find_end_of_wal(
continue;
}
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 {
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
continue;
}
if segno > high_segno

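One side of this hunk unwraps I/O and UTF-8 failures, the other propagates them through `anyhow`. A tiny, self-contained sketch of the propagating style, using a hypothetical `wal_file_names` helper rather than the real `find_end_of_wal`:

```rust
use anyhow::anyhow;
use std::fs;
use std::path::Path;

/// List file names in a directory, returning an error instead of panicking
/// on unreadable directories or non-UTF-8 names.
fn wal_file_names(dir: &Path) -> anyhow::Result<Vec<String>> {
    let mut names = Vec::new();
    for entry in fs::read_dir(dir)?.flatten() {
        let entry_name = entry.file_name();
        let fname = entry_name
            .to_str()
            .ok_or_else(|| anyhow!("Invalid file name"))?;
        names.push(fname.to_string());
    }
    Ok(names)
}

fn main() -> anyhow::Result<()> {
    for name in wal_file_names(Path::new("."))? {
        println!("{name}");
    }
    Ok(())
}
```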

@@ -10,7 +10,7 @@ anyhow = "1.0"
clap = "3.0"
env_logger = "0.9"
log = "0.4"
once_cell = "1.13.0"
once_cell = "1.8.0"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres_ffi = { path = "../" }
tempfile = "3.2"


@@ -7,7 +7,7 @@ edition = "2021"
anyhow = { version = "1.0", features = ["backtrace"] }
async-trait = "0.1"
metrics = { version = "0.1", path = "../metrics" }
once_cell = "1.13.0"
once_cell = "1.8.0"
rusoto_core = "0.48"
rusoto_s3 = "0.48"
serde = { version = "1.0", features = ["derive"] }


@@ -66,9 +66,6 @@ pub trait RemoteStorage: Send + Sync {
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
/// so this method doesnt need to.
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,


@@ -116,7 +116,7 @@ impl RemoteStorage for LocalFs {
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
let path = match prefix {
Some(prefix) => Cow::Owned(prefix),
Some(prefix) => Cow::Owned(self.storage_root.join(prefix)),
None => Cow::Borrowed(&self.storage_root),
};
get_all_files(path.as_ref(), false).await


@@ -171,25 +171,17 @@ impl S3Bucket {
let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
// session token is used when authorizing through sso
// which is typically the case when testing locally on developer machine
let session_token = std::env::var("AWS_SESSION_TOKEN").ok();
let client = if access_key_id.is_none() && secret_access_key.is_none() {
debug!("Using IAM-based AWS access");
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
} else {
debug!(
"Using credentials-based AWS access. Session token is set: {}",
session_token.is_some()
);
debug!("Using credentials-based AWS access");
S3Client::new_with(
request_dispatcher,
StaticProvider::new(
StaticProvider::new_minimal(
access_key_id.unwrap_or_default(),
secret_access_key.unwrap_or_default(),
session_token,
None,
),
region,
)
@@ -312,24 +304,32 @@ impl RemoteStorage for S3Bucket {
Ok(document_keys)
}
/// See the doc for `RemoteStorage::list_prefixes`
/// Note: it wont include empty "directories"
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| p.0)
.or_else(|| self.prefix_in_bucket.clone())
.map(|mut p| {
let list_prefix = match prefix {
Some(prefix) => {
let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default();
// if there is no trailing / in default prefix and
// supplied prefix does not start with "/" insert it
if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR)
|| prefix.0.starts_with(S3_PREFIX_SEPARATOR))
{
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
}
prefix_in_bucket.push_str(&prefix.0);
// required to end with a separator
// otherwise request will return only the entry of a prefix
if !p.ends_with(S3_PREFIX_SEPARATOR) {
p.push(S3_PREFIX_SEPARATOR);
if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) {
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
}
p
});
Some(prefix_in_bucket)
}
None => self.prefix_in_bucket.clone(),
};
let mut document_keys = Vec::new();

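The fiddly part of the `list_prefixes` hunk is how the bucket-level prefix and the caller-supplied prefix are joined with `/` and forced to end with a separator; without the trailing separator, S3 would return only the entry for the prefix itself. A standalone sketch of that joining rule as a pure function with hypothetical names:

```rust
const S3_PREFIX_SEPARATOR: char = '/';

/// Join an optional bucket-level prefix with an optional caller prefix,
/// ensuring a trailing separator so the listing returns the prefix's
/// children rather than the prefix entry itself.
fn list_prefix(prefix_in_bucket: Option<&str>, prefix: Option<&str>) -> Option<String> {
    match prefix {
        Some(p) => {
            let mut joined = prefix_in_bucket.unwrap_or_default().to_string();
            // Insert a separator unless one side already provides it.
            if !(joined.ends_with(S3_PREFIX_SEPARATOR) || p.starts_with(S3_PREFIX_SEPARATOR)) {
                joined.push(S3_PREFIX_SEPARATOR);
            }
            joined.push_str(p);
            if !joined.ends_with(S3_PREFIX_SEPARATOR) {
                joined.push(S3_PREFIX_SEPARATOR);
            }
            Some(joined)
        }
        None => prefix_in_bucket.map(str::to_string),
    }
}

fn main() {
    assert_eq!(
        list_prefix(Some("pageserver"), Some("tenants")),
        Some("pageserver/tenants/".to_string())
    );
    assert_eq!(list_prefix(None, Some("/tenants")), Some("/tenants/".to_string()));
    assert_eq!(list_prefix(Some("pageserver/"), None), Some("pageserver/".to_string()));
    assert_eq!(list_prefix(None, None), None);
}
```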

@@ -8,6 +8,7 @@ anyhow = "1.0"
bincode = "1.3"
bytes = "1.0.1"
hyper = { version = "0.14.7", features = ["full"] }
lazy_static = "1.4.0"
pin-project-lite = "0.2.7"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -27,8 +28,6 @@ rustls = "0.20.2"
rustls-split = "0.3.0"
git-version = "0.3.5"
serde_with = "1.12.0"
once_cell = "1.13.0"
metrics = { path = "../metrics" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }


@@ -4,8 +4,8 @@ use crate::zid::ZTenantId;
use anyhow::anyhow;
use hyper::header::AUTHORIZATION;
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use lazy_static::lazy_static;
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::RequestInfo;
use routerify::{Middleware, Router, RouterBuilder, RouterService};
@@ -16,13 +16,13 @@ use std::net::TcpListener;
use super::error::ApiError;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
lazy_static! {
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
"libmetrics_metric_handler_requests_total",
"Number of metric requests made"
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);


@@ -7,7 +7,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
@@ -19,15 +19,16 @@ fn make_tcp_pair() -> (TcpStream, TcpStream) {
(server_stream, client_stream)
}
static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
let mut cursor = Cursor::new(include_bytes!("key.pem"));
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
});
static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
});
lazy_static! {
static ref KEY: rustls::PrivateKey = {
let mut cursor = Cursor::new(include_bytes!("key.pem"));
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
};
static ref CERT: rustls::Certificate = {
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
};
}
#[test]
fn ssl() {


@@ -884,7 +884,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
match sub_match.subcommand() {
Some(("start", start_match)) => {
if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
eprintln!("pageserver start failed: {e}");
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
@@ -906,19 +906,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
eprintln!("pageserver start failed: {e}");
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
exit(1);
}
},
Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
None => bail!("no pageserver subcommand provided"),
}


@@ -21,6 +21,7 @@ futures = "0.3.13"
hex = "0.4.3"
hyper = "0.14"
itertools = "0.10.3"
lazy_static = "1.4.0"
clap = "3.0"
daemonize = "0.4.1"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
@@ -47,7 +48,7 @@ tracing = "0.1.27"
signal-hook = "0.3.10"
url = "2"
nix = "0.23"
once_cell = "1.13.0"
once_cell = "1.8.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
git-version = "0.3.5"


@@ -59,7 +59,6 @@ pub mod defaults {
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
@@ -453,13 +452,6 @@ impl PageServerConf {
Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
}
if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
t_conf.checkpoint_timeout = Some(parse_toml_duration(
"checkpoint_timeout",
checkpoint_timeout,
)?);
}
if let Some(compaction_target_size) = item.get("compaction_target_size") {
t_conf.compaction_target_size = Some(parse_toml_u64(
"compaction_target_size",


@@ -32,7 +32,6 @@ pub struct TenantCreateRequest {
#[serde_as(as = "Option<DisplayFromStr>")]
pub new_tenant_id: Option<ZTenantId>,
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
@@ -71,7 +70,6 @@ pub struct TenantConfigRequest {
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
@@ -89,7 +87,6 @@ impl TenantConfigRequest {
TenantConfigRequest {
tenant_id,
checkpoint_distance: None,
checkpoint_timeout: None,
compaction_target_size: None,
compaction_period: None,
compaction_threshold: None,


@@ -560,8 +560,6 @@ components:
type: string
checkpoint_distance:
type: integer
checkpoint_timeout:
type: string
compaction_period:
type: string
compaction_threshold:
@@ -580,8 +578,6 @@ components:
type: string
checkpoint_distance:
type: integer
checkpoint_timeout:
type: string
compaction_period:
type: string
compaction_threshold:


@@ -623,11 +623,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
tenant_conf.checkpoint_timeout =
Some(humantime::parse_duration(&checkpoint_timeout).map_err(ApiError::from_err)?);
}
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;
@@ -688,10 +683,6 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
tenant_conf.checkpoint_timeout =
Some(humantime::parse_duration(&checkpoint_timeout).map_err(ApiError::from_err)?);
}
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;

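In the hunk above, `checkpoint_timeout` travels through the HTTP API as a human-readable string and is turned into a `Duration` with `humantime::parse_duration`. A one-off sketch of that conversion:

```rust
use std::time::Duration;

fn main() {
    // "10m" is the same format the settings documentation above uses for checkpoint_timeout.
    let timeout = humantime::parse_duration("10m").expect("invalid duration string");
    assert_eq!(timeout, Duration::from_secs(600));
}
```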

@@ -37,7 +37,7 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
// TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
// Then fishing out pg_control would be unnecessary
let mut modification = tline.begin_modification(lsn);
let mut modification = tline.begin_modification();
modification.init_empty()?;
// Import all but pg_wal
@@ -56,12 +56,12 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
pg_control = Some(control_file);
}
modification.flush()?;
modification.flush(lsn)?;
}
}
// We're done importing all the data files.
modification.commit()?;
modification.commit(lsn)?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -267,7 +267,7 @@ fn import_wal<T: DatadirTimeline>(
waldecoder.feed_bytes(&buf);
let mut nrecords = 0;
let mut modification = tline.begin_modification(endpoint);
let mut modification = tline.begin_modification();
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
@@ -301,7 +301,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
base_lsn: Lsn,
) -> Result<()> {
info!("importing base at {}", base_lsn);
let mut modification = tline.begin_modification(base_lsn);
let mut modification = tline.begin_modification();
modification.init_empty()?;
let mut pg_control: Option<ControlFileData> = None;
@@ -319,7 +319,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush()?;
modification.flush(base_lsn)?;
}
tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
@@ -333,7 +333,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
// sanity check: ensure that pg_control is loaded
let _pg_control = pg_control.context("pg_control file not found")?;
modification.commit()?;
modification.commit(base_lsn)?;
Ok(())
}
@@ -385,7 +385,7 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
waldecoder.feed_bytes(&bytes[offset..]);
let mut modification = tline.begin_modification(end_lsn);
let mut modification = tline.begin_modification();
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {


@@ -5,7 +5,7 @@
//! get/put call, walking back the timeline branching history as needed.
//!
//! The files are stored in the .neon/tenants/<tenantid>/timelines/<timelineid>
//! directory. See docs/pageserver-storage.md for how the files are managed.
//! directory. See layered_repository/README for how the files are managed.
//! In addition to the layer files, there is a metadata file in the same
//! directory that contains information about the timeline, in particular its
//! parent timeline, and the last LSN that has been written to disk.
@@ -433,13 +433,6 @@ impl LayeredRepository {
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
pub fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
pub fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf


@@ -5,7 +5,7 @@
use crate::page_cache;
use crate::page_cache::{ReadBufResult, PAGE_SZ};
use bytes::Bytes;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
use std::sync::atomic::AtomicU64;
@@ -117,7 +117,9 @@ where
}
}
static NEXT_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
lazy_static! {
static ref NEXT_ID: AtomicU64 = AtomicU64::new(1);
}
/// An adapter for reading a (virtual) file using the page cache.
///


@@ -8,7 +8,7 @@ use crate::page_cache;
use crate::page_cache::PAGE_SZ;
use crate::page_cache::{ReadBufResult, WriteBufResult};
use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use std::cmp::min;
use std::collections::HashMap;
use std::fs::OpenOptions;
@@ -21,15 +21,15 @@ use utils::zid::{ZTenantId, ZTimelineId};
use std::os::unix::fs::FileExt;
///
/// This is the global cache of file descriptors (File objects).
///
static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
RwLock::new(EphemeralFiles {
lazy_static! {
///
/// This is the global cache of file descriptors (File objects).
///
static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
next_file_id: 1,
files: HashMap::new(),
})
});
});
}
pub struct EphemeralFiles {
next_file_id: u64,


@@ -15,18 +15,19 @@ use crate::layered_repository::storage_layer::Layer;
use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
use crate::repository::Key;
use anyhow::Result;
use lazy_static::lazy_static;
use metrics::{register_int_gauge, IntGauge};
use once_cell::sync::Lazy;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use tracing::*;
use utils::lsn::Lsn;
static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
.expect("failed to define a metric")
});
lazy_static! {
static ref NUM_ONDISK_LAYERS: IntGauge =
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
.expect("failed to define a metric");
}
///
/// LayerMap tracks what layers exist on a timeline.


@@ -4,11 +4,11 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use fail::fail_point;
use itertools::Itertools;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use tracing::*;
use std::cmp::{max, min, Ordering};
use std::collections::{hash_map::Entry, HashMap, HashSet};
use std::collections::HashSet;
use std::fs;
use std::fs::{File, OpenOptions};
use std::io::Write;
@@ -16,7 +16,7 @@ use std::ops::{Deref, Range};
use std::path::PathBuf;
use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError};
use std::time::{Duration, Instant, SystemTime};
use std::time::{Duration, SystemTime};
use metrics::{
register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec,
@@ -38,9 +38,7 @@ use crate::layered_repository::{
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::pgdatadir_mapping::BlockNumber;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::reltag::RelTag;
use crate::tenant_config::TenantConfOpt;
use crate::DatadirTimeline;
@@ -60,102 +58,76 @@ use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{page_cache, storage_sync};
/// Prometheus histogram buckets (in seconds) that capture the majority of
/// latencies in the microsecond range but also extend far enough up to distinguish
/// "bad" from "really bad".
fn get_buckets_for_critical_operations() -> Vec<f64> {
let buckets_per_digit = 5;
let min_exponent = -6;
let max_exponent = 2;
let mut buckets = vec![];
// Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
// because it's more numerically stable and doesn't result in numbers like 9.999999
for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
}
buckets
// Metrics collected on operations on the storage repository.
lazy_static! {
pub static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
"pageserver_storage_operations_seconds",
"Time spent on storage operations",
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric");
}
// Metrics collected on operations on the storage repository.
pub static STORAGE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_storage_operations_seconds",
"Time spent on storage operations",
&["operation", "tenant_id", "timeline_id"],
get_buckets_for_critical_operations(),
)
.expect("failed to define a metric")
});
// Metrics collected on operations on the storage repository.
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value",
&["tenant_id", "timeline_id"],
get_buckets_for_critical_operations(),
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
lazy_static! {
static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!(
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
.expect("failed to define a metric");
static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!(
"pageserver_wait_lsn_seconds",
"Time spent waiting for WAL to arrive",
&["tenant_id", "timeline_id"],
get_buckets_for_critical_operations(),
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
lazy_static! {
static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!(
"pageserver_last_record_lsn",
"Last record LSN grouped by timeline",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
// Metrics for determining timeline's physical size.
// A layered timeline's physical is defined as the total size of
// (delta/image) layer files on disk.
static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
lazy_static! {
static ref CURRENT_PHYSICAL_SIZE: UIntGaugeVec = register_uint_gauge_vec!(
"pageserver_current_physical_size",
"Current physical size grouped by timeline",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
lazy_static! {
static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!(
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
)
.expect("failed to define a metric")
});
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
.expect("failed to define a metric");
static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!(
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
#[derive(Clone)]
pub enum LayeredTimelineEntry {
@@ -233,8 +205,6 @@ pub struct LayeredTimeline {
pub layers: RwLock<LayerMap>,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
last_freeze_ts: RwLock<Instant>,
// WAL redo manager
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
@@ -325,9 +295,6 @@ pub struct LayeredTimeline {
/// or None if WAL receiver has not received anything for this timeline
/// yet.
pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
/// Relation size cache
rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
}
pub struct WalReceiverInfo {
@@ -339,42 +306,7 @@ pub struct WalReceiverInfo {
/// Inherit all the functions from DatadirTimeline, to provide the
/// functionality to store PostgreSQL relations, SLRUs, etc. in a
/// LayeredTimeline.
impl DatadirTimeline for LayeredTimeline {
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
if lsn >= *cached_lsn {
return Some(*nblocks);
}
}
None
}
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
match rel_size_cache.entry(tag) {
Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
*cached_lsn = (lsn, nblocks);
}
}
Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
}
}
}
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.insert(tag, (lsn, nblocks));
}
fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.remove(tag);
}
}
impl DatadirTimeline for LayeredTimeline {}
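The fuller impl above (one side of this diff) gates the relation-size cache on LSN: a cached (lsn, nblocks) entry only answers lookups at or after the LSN it was recorded at, because the relation may have had a different size earlier in history, and updates never overwrite a newer entry with an older one. A compact stand-alone sketch of that rule, with u32 standing in for RelTag and u64 for Lsn (assumed simplifications, illustrative only):

use std::collections::HashMap;

struct RelSizeCache {
    // relation -> (lsn the size was recorded at, size in blocks)
    entries: HashMap<u32, (u64, u32)>,
}

impl RelSizeCache {
    // Serve a lookup only if the request is at or after the cached LSN.
    fn get(&self, rel: u32, lsn: u64) -> Option<u32> {
        match self.entries.get(&rel) {
            Some((cached_lsn, nblocks)) if lsn >= *cached_lsn => Some(*nblocks),
            _ => None,
        }
    }

    // Keep the newest known size: ignore updates older than what is cached.
    fn update(&mut self, rel: u32, lsn: u64, nblocks: u32) {
        let entry = self.entries.entry(rel).or_insert((lsn, nblocks));
        if lsn >= entry.0 {
            *entry = (lsn, nblocks);
        }
    }
}

fn main() {
    let mut cache = RelSizeCache { entries: HashMap::new() };
    cache.update(1, 100, 8);
    assert_eq!(cache.get(1, 120), Some(8)); // at or after the recorded LSN: hit
    assert_eq!(cache.get(1, 90), None);     // earlier LSN: the size may have differed, miss
}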
///
/// Information about how much history needs to be retained, needed by
@@ -445,6 +377,8 @@ impl Timeline for LayeredTimeline {
/// Look up the value with the given a key
fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes> {
debug_assert!(lsn <= self.get_last_record_lsn());
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
// The cached image can be returned directly if there is no WAL between the cached image
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
@@ -562,13 +496,6 @@ impl LayeredTimeline {
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
@@ -658,7 +585,6 @@ impl LayeredTimeline {
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),
last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0),
last_freeze_ts: RwLock::new(Instant::now()),
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
@@ -692,7 +618,6 @@ impl LayeredTimeline {
repartition_threshold: 0,
last_received_wal: Mutex::new(None),
rel_size_cache: RwLock::new(HashMap::new()),
};
result.repartition_threshold = result.get_checkpoint_distance() / 10;
result
@@ -1104,11 +1029,8 @@ impl LayeredTimeline {
}
///
/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
/// the in-memory layer, and initiate flushing it if so.
///
/// Also flush after a period of time without new data -- it helps
/// safekeepers to regard pageserver as caught up and suspend activity.
/// Check if more than 'checkpoint_distance' of WAL has been accumulated
/// in the in-memory layer, and initiate flushing it if so.
///
pub fn check_checkpoint_distance(self: &Arc<LayeredTimeline>) -> Result<()> {
let last_lsn = self.get_last_record_lsn();
@@ -1116,27 +1038,21 @@ impl LayeredTimeline {
if let Some(open_layer) = &layers.open_layer {
let open_layer_size = open_layer.size()?;
drop(layers);
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
let distance = last_lsn.widening_sub(last_freeze_at);
let distance = last_lsn.widening_sub(self.last_freeze_at.load());
// Checkpointing the open layer can be triggered by layer size or LSN range.
// S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
// we want to stay below that with a big margin. The LSN distance determines how
// much WAL the safekeepers need to store.
if distance >= self.get_checkpoint_distance().into()
|| open_layer_size > self.get_checkpoint_distance()
|| (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
{
info!(
"check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
distance,
open_layer_size,
last_freeze_ts.elapsed()
"check_checkpoint_distance {}, layer size {}",
distance, open_layer_size
);
self.freeze_inmem_layer(true);
self.last_freeze_at.store(last_lsn);
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
// Launch a thread to flush the frozen layer to disk, unless
// a thread was already running. (If the thread was running
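For orientation, the freeze decision built up in the hunk above reduces to a single predicate; the checkpoint_timeout clause is the part present on only one side of this diff. A hedged stand-alone restatement (the signed wide result of widening_sub and the exact parameter types are assumptions):

use std::time::Duration;

fn should_freeze(
    distance: i128,           // last_lsn - last_freeze_at, may be negative
    open_layer_size: u64,     // bytes currently in the open in-memory layer
    elapsed: Duration,        // time since the last freeze
    checkpoint_distance: u64,
    checkpoint_timeout: Duration,
) -> bool {
    distance >= checkpoint_distance as i128
        || open_layer_size > checkpoint_distance
        // Time-based clause (one side of this diff only): flush quiet timelines
        // eventually so safekeepers can regard the pageserver as caught up.
        || (distance > 0 && elapsed >= checkpoint_timeout)
}

fn main() {
    let quarter_gib: u64 = 256 * 1024 * 1024;
    assert!(should_freeze(quarter_gib as i128, 0, Duration::ZERO, quarter_gib, Duration::from_secs(600)));
    assert!(!should_freeze(1, 1, Duration::from_secs(1), quarter_gib, Duration::from_secs(600)));
}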

View File

@@ -22,7 +22,7 @@ pub mod walreceiver;
pub mod walrecord;
pub mod walredo;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use tracing::info;
use crate::thread_mgr::ThreadKind;
@@ -42,14 +42,14 @@ pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
lazy_static! {
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
"pageserver_live_connections",
"Number of live network connections",
&["pageserver_connection_kind"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
pub const LOG_FILE_NAME: &str = "pageserver.log";
@@ -93,56 +93,3 @@ pub fn shutdown_pageserver(exit_code: i32) {
info!("Shut down successfully completed");
std::process::exit(exit_code);
}
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
let backoff_duration_seconds =
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
if backoff_duration_seconds > 0.0 {
info!(
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
);
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
}
}
fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
if n == 0 {
0.0
} else {
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
}
}
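With the defaults above (base increment 0.1, cap 3.0 seconds) the helper produces a gently growing wait curve. A tiny runnable illustration — the formula is copied verbatim so the sketch stands alone:

fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    if n == 0 {
        0.0
    } else {
        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
    }
}

fn main() {
    // attempt 0 -> no wait, attempt 1 -> 1.1 s, attempt 2 -> 1.21 s, ...,
    // attempt 11 -> ~2.85 s, attempt 12 and later -> capped at 3.0 s.
    for n in 0..=13 {
        println!("attempt {n}: wait {:.3} s", exponential_backoff_duration_seconds(n, 0.1, 3.0));
    }
}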
#[cfg(test)]
mod backoff_defaults_tests {
use super::*;
#[test]
fn backoff_defaults_produce_growing_backoff_sequence() {
let mut current_backoff_value = None;
for i in 0..10_000 {
let new_backoff_value = exponential_backoff_duration_seconds(
i,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
);
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
assert!(
old_backoff_value <= new_backoff_value,
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
)
}
}
assert_eq!(
current_backoff_value.expect("Should have produced backoff values to compare"),
DEFAULT_MAX_BACKOFF_SECONDS,
"Given big enough of retries, backoff should reach its allowed max value"
);
}
}

View File

@@ -55,6 +55,7 @@ use utils::{
use crate::layered_repository::writeback_ephemeral_file;
use crate::repository::Key;
// TODO move ownership into a new PageserverState struct
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;

View File

@@ -11,7 +11,7 @@
use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use regex::Regex;
use std::io::{self, Read};
use std::net::TcpListener;
@@ -434,15 +434,15 @@ const TIME_BUCKETS: &[f64] = &[
0.1, // 1/10 s
];
static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling",
&["smgr_query_type", "tenant_id", "timeline_id"],
TIME_BUCKETS.into()
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
impl PageServerHandler {
pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
@@ -1044,7 +1044,6 @@ impl postgres_backend::Handler for PageServerHandler {
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),
RowDescriptor::int8_col(b"compaction_target_size"),
RowDescriptor::int8_col(b"compaction_period"),
RowDescriptor::int8_col(b"compaction_threshold"),
@@ -1055,12 +1054,6 @@ impl postgres_backend::Handler for PageServerHandler {
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(repo.get_checkpoint_distance().to_string().as_bytes()),
Some(
repo.get_checkpoint_timeout()
.as_secs()
.to_string()
.as_bytes(),
),
Some(repo.get_compaction_target_size().to_string().as_bytes()),
Some(
repo.get_compaction_period()

View File

@@ -56,16 +56,13 @@ pub trait DatadirTimeline: Timeline {
/// This provides a transaction-like interface to perform a bunch
/// of modifications atomically.
///
/// To ingest a WAL record, call begin_modification(lsn) to get a
/// To ingest a WAL record, call begin_modification() to get a
/// DatadirModification object. Use the functions in the object to
/// modify the repository state, updating all the pages and metadata
/// that the WAL record affects. When you're done, call commit() to
/// commit the changes.
/// that the WAL record affects. When you're done, call commit(lsn) to
/// commit the changes. All the changes will be stamped with the specified LSN.
///
/// Lsn stored in modification is advanced by `ingest_record` and
/// is used by `commit()` to update `last_record_lsn`.
///
/// Calling commit() will flush all the changes and reset the state,
/// Calling commit(lsn) will flush all the changes and reset the state,
/// so the `DatadirModification` struct can be reused to perform the next modification.
///
/// Note that any pending modifications you make through the
@@ -73,7 +70,7 @@ pub trait DatadirTimeline: Timeline {
/// functions of the timeline until you finish! And if you update the
/// same page twice, the last update wins.
///
fn begin_modification(&self, lsn: Lsn) -> DatadirModification<Self>
fn begin_modification(&self) -> DatadirModification<Self>
where
Self: Sized,
{
@@ -82,7 +79,6 @@ pub trait DatadirTimeline: Timeline {
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
lsn,
}
}
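A hypothetical usage sketch of the interface described above, written against the begin_modification()/commit(lsn) variant that appears in this hunk. RelTag, Bytes, Lsn and the put_* methods come from the surrounding code; the helper itself is illustrative and not part of the diff, and the exact sequence of put_* calls needed for a well-formed relation is more involved (see handle_rel_extend further down):

// Stage changes in memory, then stamp the whole batch with a single LSN.
fn ingest_one_page<T: DatadirTimeline>(
    tline: &T,
    rel: RelTag,
    img: Bytes,
    lsn: Lsn,
) -> anyhow::Result<()> {
    let mut modification = tline.begin_modification(); // nothing is written yet
    modification.put_rel_creation(rel, 0)?;            // queued in pending_updates
    modification.put_rel_page_image(rel, 0, img)?;     // still only queued
    modification.commit(lsn)?;                         // flushed and stamped with `lsn`
    Ok(())
}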
@@ -124,10 +120,6 @@ pub trait DatadirTimeline: Timeline {
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
ensure!(tag.relnode != 0, "invalid relnode");
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(nblocks);
}
if (tag.forknum == pg_constants::FSM_FORKNUM
|| tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, lsn)?
@@ -141,21 +133,13 @@ pub trait DatadirTimeline: Timeline {
let key = rel_size_to_key(tag);
let mut buf = self.get(key, lsn)?;
let nblocks = buf.get_u32_le();
// Update relation size cache
self.update_cached_rel_size(tag, lsn, nblocks);
Ok(nblocks)
Ok(buf.get_u32_le())
}
/// Does relation exist?
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
ensure!(tag.relnode != 0, "invalid relnode");
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(true);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = self.get(key, lsn)?;
@@ -461,18 +445,6 @@ pub trait DatadirTimeline: Timeline {
Ok(result.to_keyspace())
}
/// Get cached size of relation if it not updated after specified LSN
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber>;
/// Update cached relation size if there is no more recent update
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
/// Store cached relation size
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
/// Remove cached relation size
fn remove_cached_rel_size(&self, tag: &RelTag);
}
/// DatadirModification represents an operation to ingest an atomic set of
@@ -485,9 +457,6 @@ pub struct DatadirModification<'a, T: DatadirTimeline> {
/// in the state in 'tline' yet.
pub tline: &'a T,
/// Lsn assigned by begin_modification
pub lsn: Lsn,
// The modifications are not applied directly to the underlying key-value store.
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
@@ -697,36 +666,26 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
self.pending_nblocks += nblocks as isize;
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Even if nblocks > 0, we don't insert any actual blocks here. That's up to the
// caller.
Ok(())
}
/// Truncate relation
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn)? {
let size_key = rel_size_to_key(rel);
// Fetch the old size first
let old_size = self.get(size_key)?.get_u32_le();
let size_key = rel_size_to_key(rel);
// Update the entry with the new size.
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
// Fetch the old size first
let old_size = self.get(size_key)?.get_u32_le();
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Update the entry with the new size.
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Update logical database size.
self.pending_nblocks -= old_size as isize - nblocks as isize;
}
// Update logical database size.
self.pending_nblocks -= old_size as isize - nblocks as isize;
Ok(())
}
@@ -744,9 +703,6 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
self.pending_nblocks += nblocks as isize - old_size as isize;
}
Ok(())
@@ -772,9 +728,6 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
let old_size = self.get(size_key)?.get_u32_le();
self.pending_nblocks -= old_size as isize;
// Remove enty from relation size cache
self.tline.remove_cached_rel_size(&rel);
// Delete size entry, as well as all blocks
self.delete(rel_key_range(rel));
@@ -889,7 +842,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub fn flush(&mut self) -> Result<()> {
pub fn flush(&mut self, lsn: Lsn) -> Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -903,7 +856,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
let mut result: Result<()> = Ok(());
self.pending_updates.retain(|&key, value| {
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
result = writer.put(key, self.lsn, value);
result = writer.put(key, lsn, value);
false
} else {
true
@@ -924,9 +877,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
/// underlying timeline.
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub fn commit(&mut self) -> Result<()> {
pub fn commit(&mut self, lsn: Lsn) -> Result<()> {
let writer = self.tline.writer();
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
@@ -966,8 +919,8 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
bail!("unexpected pending WAL record");
}
} else {
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn)
let last_lsn = self.tline.get_last_record_lsn();
self.tline.get(key, last_lsn)
}
}
@@ -1371,9 +1324,9 @@ pub fn create_test_timeline<R: Repository>(
timeline_id: utils::zid::ZTimelineId,
) -> Result<std::sync::Arc<R::Timeline>> {
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
let mut m = tline.begin_modification(Lsn(8));
let mut m = tline.begin_modification();
m.init_empty()?;
m.commit()?;
m.commit(Lsn(8))?;
Ok(tline)
}

View File

@@ -408,7 +408,7 @@ pub trait TimelineWriter<'a> {
#[cfg(test)]
pub mod repo_harness {
use bytes::BytesMut;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
use std::{fs, path::PathBuf};
@@ -439,13 +439,14 @@ pub mod repo_harness {
buf.freeze()
}
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
lazy_static! {
static ref LOCK: RwLock<()> = RwLock::new(());
}
impl From<TenantConf> for TenantConfOpt {
fn from(tenant_conf: TenantConf) -> Self {
Self {
checkpoint_distance: Some(tenant_conf.checkpoint_distance),
checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
compaction_target_size: Some(tenant_conf.compaction_target_size),
compaction_period: Some(tenant_conf.compaction_period),
compaction_threshold: Some(tenant_conf.compaction_threshold),
@@ -588,10 +589,11 @@ mod tests {
//use std::sync::Arc;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
lazy_static! {
static ref TEST_KEY: Key = Key::from_slice(&hex!("112222222233333333444444445500000001"));
}
#[test]
fn test_basic() -> Result<()> {

View File

@@ -155,7 +155,8 @@ use std::{
use anyhow::{anyhow, bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use once_cell::sync::{Lazy, OnceCell};
use lazy_static::lazy_static;
use once_cell::sync::OnceCell;
use remote_storage::{GenericRemoteStorage, RemoteStorage};
use tokio::{
fs,
@@ -172,7 +173,6 @@ use self::{
};
use crate::{
config::PageServerConf,
exponential_backoff,
layered_repository::{
ephemeral_file::is_ephemeral_file,
metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
@@ -184,8 +184,8 @@ use crate::{
};
use metrics::{
register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec,
IntCounterVec, IntGauge,
register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge,
HistogramVec, IntCounter, IntCounterVec, IntGauge,
};
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
@@ -193,34 +193,34 @@ use self::download::download_index_parts;
pub use self::download::gather_tenant_timelines_index_parts;
pub use self::download::TEMP_DOWNLOAD_EXTENSION;
static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
lazy_static! {
static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!(
"pageserver_remote_storage_remaining_sync_items",
"Number of storage sync items left in the queue"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge")
});
static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!(
"pageserver_remote_storage_fatal_task_failures_total",
"Number of critically failed tasks"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
"pageserver_remote_storage_image_sync_seconds",
"Time took to synchronize (download or upload) a whole pageserver image. \
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
&["tenant_id", "timeline_id", "operation_kind", "status"],
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
)
.expect("failed to register pageserver image sync time histogram vec")
});
static REMOTE_INDEX_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
.expect("failed to register pageserver image sync time histogram vec");
static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!(
"pageserver_remote_storage_remote_index_uploads_total",
"Number of remote index uploads",
&["tenant_id", "timeline_id"],
)
.expect("failed to register pageserver remote index upload vec")
});
.expect("failed to register pageserver remote index upload vec");
}
// TODO move ownership into a new PageserverState struct
static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
/// A timeline status to share with pageserver's sync counterpart,
@@ -970,19 +970,14 @@ fn storage_sync_loop<P, S>(
}
}
// needed to check whether the download happened
// more informative than just a bool
#[derive(Debug)]
enum DownloadStatus {
enum DownloadMarker {
Downloaded,
Nothing,
}
#[derive(Debug)]
enum UploadStatus {
Uploaded,
Failed,
Nothing,
}
async fn process_batches<P, S>(
conf: &'static PageServerConf,
max_sync_errors: NonZeroU32,
@@ -1022,7 +1017,7 @@ where
"Finished storage sync task for sync id {sync_id} download marker {:?}",
download_marker
);
if matches!(download_marker, DownloadStatus::Downloaded) {
if matches!(download_marker, DownloadMarker::Downloaded) {
downloaded_timelines.insert(sync_id.tenant_id);
}
}
@@ -1036,7 +1031,7 @@ async fn process_sync_task_batch<P, S>(
max_sync_errors: NonZeroU32,
sync_id: ZTenantTimelineId,
batch: SyncTaskBatch,
) -> DownloadStatus
) -> DownloadMarker
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1053,7 +1048,7 @@ where
// When operating in a system without tasks failing over the error threshold,
// current batching and task processing systems aim to update the layer set and metadata files (remote and local),
// without "losing" such layer files.
let (upload_status, download_status) = tokio::join!(
let (upload_result, status_update) = tokio::join!(
async {
if let Some(upload_data) = upload_data {
match validate_task_retries(upload_data, max_sync_errors)
@@ -1071,7 +1066,7 @@ where
"upload",
)
.await;
UploadStatus::Uploaded
return Some(());
}
ControlFlow::Break(failed_upload_data) => {
if let Err(e) = update_remote_data(
@@ -1088,13 +1083,10 @@ where
{
error!("Failed to update remote timeline {sync_id}: {e:?}");
}
UploadStatus::Failed
}
}
} else {
UploadStatus::Nothing
}
None
}
.instrument(info_span!("upload_timeline_data")),
async {
@@ -1124,53 +1116,51 @@ where
}
}
}
DownloadStatus::Nothing
DownloadMarker::Nothing
}
.instrument(info_span!("download_timeline_data")),
);
if let Some(delete_data) = batch.delete {
match upload_status {
UploadStatus::Uploaded | UploadStatus::Nothing => {
match validate_task_retries(delete_data, max_sync_errors)
.instrument(info_span!("retries_validation"))
if let Some(mut delete_data) = batch.delete {
if upload_result.is_some() {
match validate_task_retries(delete_data, max_sync_errors)
.instrument(info_span!("retries_validation"))
.await
{
ControlFlow::Continue(new_delete_data) => {
delete_timeline_data(
conf,
(storage.as_ref(), &index, sync_queue),
sync_id,
new_delete_data,
sync_start,
"delete",
)
.instrument(info_span!("delete_timeline_data"))
.await;
}
ControlFlow::Break(failed_delete_data) => {
if let Err(e) = update_remote_data(
conf,
storage.as_ref(),
&index,
sync_id,
RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers),
)
.await
{
ControlFlow::Continue(new_delete_data) => {
delete_timeline_data(
conf,
(storage.as_ref(), &index, sync_queue),
sync_id,
new_delete_data,
sync_start,
"delete",
)
.instrument(info_span!("delete_timeline_data"))
.await;
}
ControlFlow::Break(failed_delete_data) => {
if let Err(e) = update_remote_data(
conf,
storage.as_ref(),
&index,
sync_id,
RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers),
)
.await
{
error!("Failed to update remote timeline {sync_id}: {e:?}");
}
{
error!("Failed to update remote timeline {sync_id}: {e:?}");
}
}
}
UploadStatus::Failed => {
warn!("Skipping delete task due to failed upload tasks, reenqueuing");
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
}
} else {
delete_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
warn!("Skipping delete task due to failed upload tasks, reenqueuing");
}
}
download_status
status_update
}
async fn download_timeline_data<P, S>(
@@ -1181,7 +1171,7 @@ async fn download_timeline_data<P, S>(
new_download_data: SyncData<LayersDownload>,
sync_start: Instant,
task_name: &str,
) -> DownloadStatus
) -> DownloadMarker
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1210,7 +1200,7 @@ where
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
Ok(()) => {
register_sync_status(sync_id, sync_start, task_name, Some(true));
return DownloadStatus::Downloaded;
return DownloadMarker::Downloaded;
}
Err(e) => {
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
@@ -1226,7 +1216,7 @@ where
}
}
DownloadStatus::Nothing
DownloadMarker::Nothing
}
async fn update_local_metadata(
@@ -1504,7 +1494,11 @@ async fn validate_task_retries<T>(
return ControlFlow::Break(sync_data);
}
exponential_backoff(current_attempt, 1.0, 30.0).await;
if current_attempt > 0 {
let seconds_to_wait = 2.0_f64.powf(current_attempt as f64 - 1.0).min(30.0);
info!("Waiting {seconds_to_wait} seconds before starting the task");
tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
}
ControlFlow::Continue(sync_data)
}
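The two sides of this hunk wait according to slightly different curves: the shared exponential_backoff helper, called with a base increment of 1.0, doubles starting from 2 seconds, while the inlined variant doubles starting from 1 second; both cap at 30. A small comparison with the formulas copied from the respective sides (illustrative, not part of the diff):

fn helper_wait(n: u32) -> f64 {
    // exponential_backoff(n, 1.0, 30.0): (1.0 + 1.0)^n, capped at 30, no wait for n == 0
    if n == 0 { 0.0 } else { 2.0_f64.powf(f64::from(n)).min(30.0) }
}

fn inline_wait(n: u32) -> f64 {
    // the inlined variant: 2^(n - 1), capped at 30, no wait for n == 0
    if n == 0 { 0.0 } else { 2.0_f64.powf(f64::from(n) - 1.0).min(30.0) }
}

fn main() {
    for n in 0..=6 {
        println!("attempt {n}: helper {:>4.1} s, inline {:>4.1} s", helper_wait(n), inline_wait(n));
    }
}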

View File

@@ -130,7 +130,6 @@ where
tenant_path.display()
)
})?;
let timelines = storage
.list_prefixes(Some(tenant_storage_path))
.await
@@ -141,13 +140,6 @@ where
)
})?;
if timelines.is_empty() {
anyhow::bail!(
"no timelines found on the remote storage for tenant {}",
tenant_id
)
}
let mut sync_ids = HashSet::new();
for timeline_remote_storage_key in timelines {

View File

@@ -4,7 +4,7 @@ use std::{fmt::Debug, path::PathBuf};
use anyhow::Context;
use futures::stream::{FuturesUnordered, StreamExt};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use remote_storage::RemoteStorage;
use tokio::fs;
use tracing::{debug, error, info, warn};
@@ -20,14 +20,14 @@ use crate::{
};
use metrics::{register_int_counter_vec, IntCounterVec};
static NO_LAYERS_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
lazy_static! {
static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!(
"pageserver_remote_storage_no_layers_uploads_total",
"Number of skipped uploads due to no layers",
&["tenant_id", "timeline_id"],
)
.expect("failed to register pageserver no layers upload vec")
});
.expect("failed to register pageserver no layers upload vec");
}
/// Serializes and uploads the given index part data to the remote storage.
pub(super) async fn upload_index_part<P, S>(

View File

@@ -23,7 +23,6 @@ pub mod defaults {
// which is good for now to trigger bugs.
// This parameter actually determines L0 layer file size.
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
@@ -49,9 +48,6 @@ pub struct TenantConf {
// page server crashes.
// This parameter actually determines L0 layer file size.
pub checkpoint_distance: u64,
// Inmemory layer is also flushed at least once in checkpoint_timeout to
// eventually upload WAL after activity is stopped.
pub checkpoint_timeout: Duration,
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
pub compaction_target_size: u64,
@@ -94,7 +90,6 @@ pub struct TenantConf {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct TenantConfOpt {
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<Duration>,
pub compaction_target_size: Option<u64>,
#[serde(with = "humantime_serde")]
pub compaction_period: Option<Duration>,
@@ -118,9 +113,6 @@ impl TenantConfOpt {
checkpoint_distance: self
.checkpoint_distance
.unwrap_or(global_conf.checkpoint_distance),
checkpoint_timeout: self
.checkpoint_timeout
.unwrap_or(global_conf.checkpoint_timeout),
compaction_target_size: self
.compaction_target_size
.unwrap_or(global_conf.compaction_target_size),
@@ -150,9 +142,6 @@ impl TenantConfOpt {
if let Some(checkpoint_distance) = other.checkpoint_distance {
self.checkpoint_distance = Some(checkpoint_distance);
}
if let Some(checkpoint_timeout) = other.checkpoint_timeout {
self.checkpoint_timeout = Some(checkpoint_timeout);
}
if let Some(compaction_target_size) = other.compaction_target_size {
self.compaction_target_size = Some(compaction_target_size);
}
@@ -192,8 +181,6 @@ impl TenantConf {
TenantConf {
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
.expect("cannot parse default checkpoint timeout"),
compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period"),
@@ -225,7 +212,6 @@ impl TenantConf {
pub fn dummy_conf() -> Self {
TenantConf {
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_timeout: Duration::from_secs(600),
compaction_target_size: 4 * 1024 * 1024,
compaction_period: Duration::from_secs(10),
compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,

View File

@@ -25,27 +25,26 @@ use utils::lsn::Lsn;
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
// TODO move ownership into a new PageserverState struct
mod tenants_state {
use anyhow::ensure;
use once_cell::sync::Lazy;
use std::{
collections::HashMap,
sync::{RwLock, RwLockReadGuard, RwLockWriteGuard},
};
use tokio::sync::mpsc;
use tracing::{debug, error};
use utils::zid::ZTenantId;
use crate::tenant_mgr::{LocalTimelineUpdate, Tenant};
static TENANTS: Lazy<RwLock<HashMap<ZTenantId, Tenant>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
/// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
/// so that it can enable/disable corresponding processes.
static TIMELINE_UPDATE_SENDER: Lazy<
RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>>,
> = Lazy::new(|| RwLock::new(None));
lazy_static::lazy_static! {
static ref TENANTS: RwLock<HashMap<ZTenantId, Tenant>> = RwLock::new(HashMap::new());
/// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
/// so that it can enable/disable corresponding processes.
static ref TIMELINE_UPDATE_SENDER: RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>> = RwLock::new(None);
}
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<ZTenantId, Tenant>> {
TENANTS

View File

@@ -87,6 +87,7 @@ async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
);
}
// TODO move ownership into a new PageserverState struct
static START_GC_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
static START_COMPACTION_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();

View File

@@ -45,20 +45,22 @@ use tokio::sync::watch;
use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use utils::zid::{ZTenantId, ZTimelineId};
use crate::shutdown_pageserver;
/// Each thread that we track is associated with a "thread ID". It's just
/// an increasing number that we assign, not related to any system thread
/// id.
static NEXT_THREAD_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
// TODO move ownership into a new PageserverState struct
lazy_static! {
/// Each thread that we track is associated with a "thread ID". It's just
/// an increasing number that we assign, not related to any system thread
/// id.
static ref NEXT_THREAD_ID: AtomicU64 = AtomicU64::new(1);
/// Global registry of threads
static THREADS: Lazy<Mutex<HashMap<u64, Arc<PageServerThread>>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
/// Global registry of threads
static ref THREADS: Mutex<HashMap<u64, Arc<PageServerThread>>> = Mutex::new(HashMap::new());
}
// There is a Tokio watch channel for each thread, which can be used to signal the
// thread that it needs to shut down. This thread local variable holds the receiving

View File

@@ -232,7 +232,7 @@ pub(crate) fn create_timeline(
return Ok(None);
}
match ancestor_timeline_id {
let _new_timeline = match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = repo
.get_timeline_load(ancestor_timeline_id)

View File

@@ -10,7 +10,7 @@
//! This is similar to PostgreSQL's virtual file descriptor facility in
//! src/backend/storage/file/fd.c
//!
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use once_cell::sync::OnceCell;
use std::fs::{File, OpenOptions};
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
@@ -32,24 +32,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
1.0, // 1 sec
];
static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation", "tenant_id", "timeline_id"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric")
});
static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
.expect("failed to define a metric");
}
lazy_static! {
static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally

View File

@@ -30,6 +30,8 @@ use anyhow::Result;
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use std::collections::HashMap;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::walrecord::*;
@@ -46,6 +48,8 @@ pub struct WalIngest<'a, T: DatadirTimeline> {
checkpoint: CheckPoint,
checkpoint_modified: bool,
relsize_cache: HashMap<RelTag, BlockNumber>,
}
impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
@@ -60,13 +64,13 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
timeline,
checkpoint,
checkpoint_modified: false,
relsize_cache: HashMap::new(),
})
}
///
/// Decode a PostgreSQL WAL record and store it in the repository, in the given timeline.
///
/// This function updates `lsn` field of `DatadirModification`
///
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.
@@ -78,7 +82,6 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
modification: &mut DatadirModification<T>,
decoded: &mut DecodedWALRecord,
) -> Result<()> {
modification.lsn = lsn;
decode_wal_record(recdata, decoded).context("failed decoding wal record")?;
let mut buf = decoded.record.clone();
@@ -257,7 +260,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit()?;
modification.commit(lsn)?;
Ok(())
}
@@ -405,7 +408,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = self.get_relsize(vm_rel, modification.lsn)?;
let vm_size = self.get_relsize(vm_rel)?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -877,6 +880,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
modification: &mut DatadirModification<T>,
rel: RelTag,
) -> Result<()> {
self.relsize_cache.insert(rel, 0);
modification.put_rel_creation(rel, 0)?;
Ok(())
}
@@ -912,6 +916,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
nblocks: BlockNumber,
) -> Result<()> {
modification.put_rel_truncation(rel, nblocks)?;
self.relsize_cache.insert(rel, nblocks);
Ok(())
}
@@ -921,16 +926,23 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
rel: RelTag,
) -> Result<()> {
modification.put_rel_drop(rel)?;
self.relsize_cache.remove(&rel);
Ok(())
}
fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result<BlockNumber> {
let nblocks = if !self.timeline.get_rel_exists(rel, lsn)? {
0
fn get_relsize(&mut self, rel: RelTag) -> Result<BlockNumber> {
if let Some(nblocks) = self.relsize_cache.get(&rel) {
Ok(*nblocks)
} else {
self.timeline.get_rel_size(rel, lsn)?
};
Ok(nblocks)
let last_lsn = self.timeline.get_last_record_lsn();
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
0
} else {
self.timeline.get_rel_size(rel, last_lsn)?
};
self.relsize_cache.insert(rel, nblocks);
Ok(nblocks)
}
}
fn handle_rel_extend(
@@ -940,16 +952,22 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
blknum: BlockNumber,
) -> Result<()> {
let new_nblocks = blknum + 1;
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice to be more explicit about it
let last_lsn = modification.lsn;
let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
// create it with 0 size initially, the logic below will extend it
modification.put_rel_creation(rel, 0)?;
0
let old_nblocks = if let Some(nblocks) = self.relsize_cache.get(&rel) {
*nblocks
} else {
self.timeline.get_rel_size(rel, last_lsn)?
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice to be more explicit about it
let last_lsn = self.timeline.get_last_record_lsn();
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
// create it with 0 size initially, the logic below will extend it
modification.put_rel_creation(rel, 0)?;
0
} else {
self.timeline.get_rel_size(rel, last_lsn)?
};
self.relsize_cache.insert(rel, nblocks);
nblocks
};
if new_nblocks > old_nblocks {
@@ -960,6 +978,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
for gap_blknum in old_nblocks..blknum {
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
}
self.relsize_cache.insert(rel, new_nblocks);
}
Ok(())
}
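The relsize_cache added to WalIngest above is a plain read-through cache keyed by relation: consult the map first, fall back to the (more expensive) timeline lookup only on a miss, and remember the answer; the diff also updates the entry on truncation and removes it on drop so stale sizes are never served within an ingest pass. Stripped of the surrounding types, the pattern looks like this (u32 standing in for RelTag/BlockNumber so the sketch compiles on its own):

use std::collections::HashMap;

struct RelSizes {
    cache: HashMap<u32, u32>,
}

impl RelSizes {
    fn get_or_load(&mut self, rel: u32, load: impl FnOnce() -> u32) -> u32 {
        if let Some(n) = self.cache.get(&rel) {
            return *n;      // hit: skip the timeline lookup entirely
        }
        let n = load();     // miss: ask the timeline (or storage) once
        self.cache.insert(rel, n);
        n
    }
}

fn main() {
    let mut sizes = RelSizes { cache: HashMap::new() };
    let first = sizes.get_or_load(7, || 42);  // loader runs, result cached
    let second = sizes.get_or_load(7, || 99); // served from cache, loader not called
    assert_eq!((first, second), (42, 42));
}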
@@ -1050,10 +1069,10 @@ mod tests {
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
fn init_walingest_test<T: DatadirTimeline>(tline: &T) -> Result<WalIngest<T>> {
let mut m = tline.begin_modification(Lsn(0x10));
let mut m = tline.begin_modification();
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
m.commit()?;
m.commit(Lsn(0x10))?;
let walingest = WalIngest::new(tline, Lsn(0x10))?;
Ok(walingest)
@@ -1065,19 +1084,19 @@ mod tests {
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut m = tline.begin_modification(Lsn(0x20));
let mut m = tline.begin_modification();
walingest.put_rel_creation(&mut m, TESTREL_A)?;
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x30));
m.commit(Lsn(0x20))?;
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x40));
m.commit(Lsn(0x30))?;
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x50));
m.commit(Lsn(0x40))?;
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
m.commit()?;
m.commit(Lsn(0x50))?;
assert_current_logical_size(&*tline, Lsn(0x50));
@@ -1123,9 +1142,9 @@ mod tests {
);
// Truncate last block
let mut m = tline.begin_modification(Lsn(0x60));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?;
m.commit()?;
m.commit(Lsn(0x60))?;
assert_current_logical_size(&*tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -1147,15 +1166,15 @@ mod tests {
);
// Truncate to zero length
let mut m = tline.begin_modification(Lsn(0x68));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?;
m.commit()?;
m.commit(Lsn(0x68))?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0);
// Extend from 0 to 2 blocks, leaving a gap
let mut m = tline.begin_modification(Lsn(0x70));
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?;
m.commit()?;
m.commit(Lsn(0x70))?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2);
assert_eq!(
tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?,
@@ -1167,9 +1186,9 @@ mod tests {
);
// Extend a lot more, leaving a big gap that spans across segments
let mut m = tline.begin_modification(Lsn(0x80));
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?;
m.commit()?;
m.commit(Lsn(0x80))?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501);
for blk in 2..1500 {
assert_eq!(
@@ -1193,18 +1212,18 @@ mod tests {
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut m = tline.begin_modification(Lsn(0x20));
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
m.commit()?;
m.commit(Lsn(0x20))?;
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1);
// Drop rel
let mut m = tline.begin_modification(Lsn(0x30));
let mut m = tline.begin_modification();
walingest.put_rel_drop(&mut m, TESTREL_A)?;
m.commit()?;
m.commit(Lsn(0x30))?;
// Check that rel is not visible anymore
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
@@ -1213,9 +1232,9 @@ mod tests {
//assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none());
// Re-create it
let mut m = tline.begin_modification(Lsn(0x40));
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?;
m.commit()?;
m.commit(Lsn(0x40))?;
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
@@ -1235,12 +1254,12 @@ mod tests {
// Create a 20 MB relation (the size is arbitrary)
let relsize = 20 * 1024 * 1024 / 8192;
let mut m = tline.begin_modification(Lsn(0x20));
let mut m = tline.begin_modification();
for blkno in 0..relsize {
let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
}
m.commit()?;
m.commit(Lsn(0x20))?;
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
@@ -1261,9 +1280,9 @@ mod tests {
// Truncate relation so that second segment was dropped
// - only leave one page
let mut m = tline.begin_modification(Lsn(0x60));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?;
m.commit()?;
m.commit(Lsn(0x60))?;
// Check reported size and contents after truncation
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1);
@@ -1291,12 +1310,12 @@ mod tests {
// Extend relation again.
// Add enough blocks to create second segment
let lsn = Lsn(0x80);
let mut m = tline.begin_modification(lsn);
let mut m = tline.begin_modification();
for blkno in 0..relsize {
let data = format!("foo blk {} at {}", blkno, lsn);
walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
}
m.commit()?;
m.commit(lsn)?;
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize);
@@ -1324,10 +1343,10 @@ mod tests {
let mut lsn = 0x10;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
lsn += 0x10;
let mut m = tline.begin_modification(Lsn(lsn));
let mut m = tline.begin_modification();
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?;
m.commit()?;
m.commit(Lsn(lsn))?;
}
assert_current_logical_size(&*tline, Lsn(lsn));
@@ -1339,9 +1358,9 @@ mod tests {
// Truncate one block
lsn += 0x10;
let mut m = tline.begin_modification(Lsn(lsn));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?;
m.commit()?;
m.commit(Lsn(lsn))?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE
@@ -1350,9 +1369,9 @@ mod tests {
// Truncate another block
lsn += 0x10;
let mut m = tline.begin_modification(Lsn(lsn));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?;
m.commit()?;
m.commit(Lsn(lsn))?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE - 1
@@ -1364,9 +1383,9 @@ mod tests {
let mut size: i32 = 3000;
while size >= 0 {
lsn += 0x10;
let mut m = tline.begin_modification(Lsn(lsn));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?;
m.commit()?;
m.commit(Lsn(lsn))?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
size as BlockNumber

View File

@@ -66,7 +66,7 @@ pub fn init_wal_receiver_main_thread(
);
let broker_prefix = &conf.broker_etcd_prefix;
info!(
"Starting wal receiver main thread, etcd endpoints: {}",
"Starting wal receiver main thread, etdc endpoints: {}",
etcd_endpoints.iter().map(Url::to_string).join(", ")
);

View File

@@ -25,11 +25,7 @@ use etcd_broker::{
use tokio::select;
use tracing::*;
use crate::{
exponential_backoff,
repository::{Repository, Timeline},
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use crate::repository::{Repository, Timeline};
use crate::{RepositoryImpl, TimelineImpl};
use utils::{
lsn::Lsn,
@@ -234,6 +230,18 @@ async fn subscribe_for_timeline_updates(
}
}
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) {
if n == 0 {
return;
}
let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds);
info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task");
tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
}
/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible.
struct WalreceiverState {
id: ZTenantTimelineId,

View File

@@ -154,7 +154,7 @@ pub async fn handle_walreceiver_connection(
{
let mut decoded = DecodedWALRecord::default();
let mut modification = timeline.begin_modification(endlsn);
let mut modification = timeline.begin_modification();
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// let _enter = info_span!("processing record", lsn = %lsn).entered();
@@ -178,6 +178,16 @@ pub async fn handle_walreceiver_connection(
caught_up = true;
}
let timeline_to_check = Arc::clone(&timeline);
tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance())
.await
.with_context(|| {
format!("Spawned checkpoint check task panicked for timeline {id}")
})?
.with_context(|| {
format!("Failed to check checkpoint distance for timeline {id}")
})?;
Some(endlsn)
}
@@ -198,12 +208,6 @@ pub async fn handle_walreceiver_connection(
_ => None,
};
let timeline_to_check = Arc::clone(&timeline);
tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance())
.await
.with_context(|| format!("Spawned checkpoint check task panicked for timeline {id}"))?
.with_context(|| format!("Failed to check checkpoint distance for timeline {id}"))?;
if let Some(last_lsn) = status_update {
let remote_index = repo.get_remote_index();
let timeline_remote_consistent_lsn = remote_index

View File

@@ -20,8 +20,8 @@
//!
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use nix::poll::*;
use once_cell::sync::Lazy;
use serde::Serialize;
use std::fs;
use std::fs::OpenOptions;
@@ -105,27 +105,21 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
// We collect the time spent in actual WAL redo ('redo'), and time waiting
// for access to the postgres process ('wait') since there is only one for
// each tenant.
static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
.expect("failed to define a metric")
});
static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
lazy_static! {
static ref WAL_REDO_TIME: Histogram =
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
.expect("failed to define a metric");
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the WAL redo process"
)
.expect("failed to define a metric")
});
static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
.expect("failed to define a metric");
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
"pageserver_replayed_wal_records_total",
"Number of WAL records replayed in WAL redo process"
)
.unwrap()
});
.unwrap();
}
///
/// This is the real implementation that uses a Postgres process to

View File

@@ -14,7 +14,7 @@ hashbrown = "0.11.2"
hex = "0.4.3"
hmac = "0.12.1"
hyper = "0.14"
once_cell = "1.13.0"
lazy_static = "1.4.0"
md5 = "0.7.0"
parking_lot = "0.12"
pin-project-lite = "0.2.7"

View File

@@ -12,12 +12,13 @@ use crate::{
stream::PqStream,
waiters::{self, Waiter, Waiters},
};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
static CPLANE_WAITERS: Lazy<Waiters<mgmt::ComputeReady>> = Lazy::new(Default::default);
lazy_static! {
static ref CPLANE_WAITERS: Waiters<mgmt::ComputeReady> = Default::default();
}
/// Give caller an opportunity to wait for the cloud's reply.
pub async fn with_waiter<R, T, E>(

View File

@@ -4,8 +4,8 @@ use crate::config::{ProxyConfig, TlsConfig};
use crate::stream::{MetricsStream, PqStream, Stream};
use anyhow::{bail, Context};
use futures::TryFutureExt;
use lazy_static::lazy_static;
use metrics::{register_int_counter, IntCounter};
use once_cell::sync::Lazy;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::{BeMessage as Be, *};
@@ -13,29 +13,23 @@ use utils::pq_proto::{BeMessage as Be, *};
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
lazy_static! {
static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!(
"proxy_accepted_connections_total",
"Number of TCP client connections accepted."
)
.unwrap()
});
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
.unwrap();
static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!(
"proxy_closed_connections_total",
"Number of TCP client connections closed."
)
.unwrap()
});
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
.unwrap();
static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!(
"proxy_io_bytes_total",
"Number of bytes sent/received between any client and backend."
)
.unwrap()
});
.unwrap();
}
/// A small combinator for pluggable error logging.
async fn log_error<R, F>(future: F) -> F::Output

View File

@@ -1,8 +1,4 @@
[pytest]
filterwarnings =
error::pytest.PytestUnhandledThreadExceptionWarning
error::UserWarning
ignore:record_property is incompatible with junit_family:pytest.PytestWarning
addopts =
-m 'not remote_cluster'
markers =

View File

@@ -9,6 +9,7 @@ bytes = "1.0.1"
byteorder = "1.4.3"
hyper = "0.14"
fs2 = "0.4.3"
lazy_static = "1.4.0"
serde_json = "1"
tracing = "0.1.27"
clap = "3.0"
@@ -28,7 +29,7 @@ const_format = "0.2.21"
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
git-version = "0.3.5"
async-trait = "0.1"
once_cell = "1.13.0"
once_cell = "1.10.0"
toml_edit = { version = "0.13", features = ["easy"] }
postgres_ffi = { path = "../libs/postgres_ffi" }

View File

@@ -2,7 +2,7 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use std::fs::{self, File, OpenOptions};
use std::io::{Read, Write};
@@ -26,15 +26,15 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control";
const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
static PERSIST_CONTROL_FILE_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_persist_control_file_seconds",
"Seconds to persist and sync control file, grouped by timeline",
&["tenant_id", "timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_persist_control_file_seconds histogram vec")
});
.expect("Failed to register safekeeper_persist_control_file_seconds histogram vec");
}
/// Storage should keep actual state inside of it. It should implement Deref
/// trait to access state fields and have persist method for updating that state.

View File

@@ -727,7 +727,7 @@ where
info!("setting local_start_lsn to {:?}", state.local_start_lsn);
}
// Initializing commit_lsn before acking first flushed record is
// important to let find_end_of_wal skip the hole in the beginning
// important to let find_end_of_wal skip the whole in the beginning
// of the first segment.
//
// NB: on new clusters, this happens at the same time as
@@ -738,10 +738,6 @@ where
// Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment.
self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);
// Initializing remote_consistent_lsn sets that we have nothing to
// stream to pageserver(s) immediately after creation.
self.inmem.remote_consistent_lsn =
max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn);
state.acceptor_state.term_history = msg.term_history.clone();
self.persist_control_file(state)?;

View File

@@ -4,7 +4,7 @@
use anyhow::{bail, Context, Result};
use etcd_broker::subscription_value::SkTimelineInfo;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use postgres_ffi::xlog_utils::XLogSegNo;
use serde::Serialize;
@@ -137,7 +137,7 @@ impl SharedState {
self.is_wal_backup_required()
// FIXME: add tracking of relevant pageservers and check them here individually,
// otherwise migration won't work (we suspend too early).
|| self.sk.inmem.remote_consistent_lsn < self.sk.inmem.commit_lsn
|| self.sk.inmem.remote_consistent_lsn <= self.sk.inmem.commit_lsn
}
/// Mark timeline active/inactive and return whether s3 offloading requires
@@ -559,12 +559,12 @@ struct GlobalTimelinesState {
wal_backup_launcher_tx: Option<Sender<ZTenantTimelineId>>,
}
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
Mutex::new(GlobalTimelinesState {
lazy_static! {
static ref TIMELINES_STATE: Mutex<GlobalTimelinesState> = Mutex::new(GlobalTimelinesState {
timelines: HashMap::new(),
wal_backup_launcher_tx: None,
})
});
});
}
#[derive(Clone, Copy, Serialize)]
pub struct TimelineDeleteForceResult {

View File

@@ -12,7 +12,7 @@ use std::io::{self, Seek, SeekFrom};
use std::pin::Pin;
use tokio::io::AsyncRead;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use postgres_ffi::xlog_utils::{
find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, PG_TLI,
};
@@ -38,44 +38,31 @@ use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECOND
use tokio::io::{AsyncReadExt, AsyncSeekExt};
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
// i64 is faster than f64, so update to u64 when available.
static WRITE_WAL_BYTES: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
// i64 is faster than f64, so update to u64 when available.
static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!(
"safekeeper_write_wal_bytes",
"Bytes written to WAL in a single request, grouped by timeline",
&["tenant_id", "timeline_id"],
vec![
1.0,
10.0,
100.0,
1024.0,
8192.0,
128.0 * 1024.0,
1024.0 * 1024.0,
10.0 * 1024.0 * 1024.0
]
vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0]
)
.expect("Failed to register safekeeper_write_wal_bytes histogram vec")
});
static WRITE_WAL_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
.expect("Failed to register safekeeper_write_wal_bytes histogram vec");
static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_write_wal_seconds",
"Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline",
&["tenant_id", "timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_write_wal_seconds histogram vec")
});
static FLUSH_WAL_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
.expect("Failed to register safekeeper_write_wal_seconds histogram vec");
static ref FLUSH_WAL_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_flush_wal_seconds",
"Seconds spent syncing WAL to a disk, grouped by timeline",
&["tenant_id", "timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_flush_wal_seconds histogram vec")
});
.expect("Failed to register safekeeper_flush_wal_seconds histogram vec");
}
struct WalStorageMetrics {
write_wal_bytes: Histogram,

View File

@@ -1,708 +0,0 @@
#
# Script to export tenants from one pageserver and import them into another page server.
#
# Outline of steps:
# 1. Get `(last_lsn, prev_lsn)` from old pageserver
# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file
# 3. This tar file might be missing relation files for empty relations, if the pageserver
# is old enough (we didn't always store those). So to recreate them, we start a local
# vanilla postgres on this basebackup and ask it what relations should exist, then touch
# any missing files and re-pack the tar.
# TODO This functionality is no longer needed, so we can delete it later if we don't
# end up using the same utils for the pg 15 upgrade. Not sure.
# 4. We import the patched basebackup into a new pageserver
# 5. We export again via fullbackup, now from the new pageserver and compare the returned
# tar file with the one we imported. This confirms that we imported everything that was
# exported, but doesn't guarantee correctness (what if we didn't **export** everything
# initially?)
# 6. We wait for the new pageserver's remote_consistent_lsn to catch up
#
# For more context on how to use this, see:
# https://github.com/neondatabase/cloud/wiki/Storage-format-migration
import os
from os import path
import shutil
from pathlib import Path
import tempfile
from contextlib import closing
import psycopg2
import subprocess
import argparse
import time
import requests
import uuid
from psycopg2.extensions import connection as PgConnection
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple
###############################################
### client-side utils copied from test fixtures
###############################################
Env = Dict[str, str]
_global_counter = 0
def global_counter() -> int:
""" A really dumb global counter.
This is useful for giving output files a unique number, so if we run the
same command multiple times we can keep their output separate.
"""
global _global_counter
_global_counter += 1
return _global_counter
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
""" Run a process and capture its output
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
where "cmd" is the name of the program and NNN is an incrementing
counter.
If those files already exist, we will overwrite them.
Returns basepath for files with captured output.
"""
assert type(cmd) is list
base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
basepath = os.path.join(capture_dir, base)
stdout_filename = basepath + '.stdout'
stderr_filename = basepath + '.stderr'
with open(stdout_filename, 'w') as stdout_f:
with open(stderr_filename, 'w') as stderr_f:
print('(capturing output to "{}.stdout")'.format(base))
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
return basepath
class PgBin:
""" A helper class for executing postgres binaries """
def __init__(self, log_dir: Path, pg_distrib_dir):
self.log_dir = log_dir
self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin')
self.env = os.environ.copy()
self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib')
def _fixpath(self, command: List[str]):
if '/' not in command[0]:
command[0] = os.path.join(self.pg_bin_path, command[0])
def _build_env(self, env_add: Optional[Env]) -> Env:
if env_add is None:
return self.env
env = self.env.copy()
env.update(env_add)
return env
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
"""
Run one of the postgres binaries.
The command should be in list form, e.g. ['pgbench', '-p', '55432']
All the necessary environment variables will be set.
If the first argument (the command name) doesn't include a path (no '/'
characters present), then it will be edited to include the correct path.
If you want stdout/stderr captured to files, use `run_capture` instead.
"""
self._fixpath(command)
print('Running command "{}"'.format(' '.join(command)))
env = self._build_env(env)
subprocess.run(command, env=env, cwd=cwd, check=True)
def run_capture(self,
command: List[str],
env: Optional[Env] = None,
cwd: Optional[str] = None,
**kwargs: Any) -> str:
"""
Run one of the postgres binaries, with stderr and stdout redirected to a file.
This is just like `run`, but for chatty programs. Returns basepath for files
with captured output.
"""
self._fixpath(command)
print('Running command "{}"'.format(' '.join(command)))
env = self._build_env(env)
return subprocess_capture(str(self.log_dir),
command,
env=env,
cwd=cwd,
check=True,
**kwargs)
class PgProtocol:
""" Reusable connection logic """
def __init__(self, **kwargs):
self.default_options = kwargs
def conn_options(self, **kwargs):
conn_options = self.default_options.copy()
if 'dsn' in kwargs:
conn_options.update(parse_dsn(kwargs['dsn']))
conn_options.update(kwargs)
# Individual statement timeout in seconds. 2 minutes should be
# enough for our tests, but if you need a longer, you can
# change it by calling "SET statement_timeout" after
# connecting.
if 'options' in conn_options:
conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options']
else:
conn_options['options'] = "-cstatement_timeout=120s"
return conn_options
# autocommit=True here by default because that's what we need most of the time
def connect(self, autocommit=True, **kwargs) -> PgConnection:
"""
Connect to the node.
Returns psycopg2's connection object.
This method passes all extra params to connstr.
"""
conn = psycopg2.connect(**self.conn_options(**kwargs))
# WARNING: this setting affects *all* tests!
conn.autocommit = autocommit
return conn
def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]:
"""
Execute query against the node and return all rows.
This method passes all extra params to connstr.
"""
return self.safe_psql_many([query], **kwargs)[0]
def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
"""
Execute queries against the node and return all rows.
This method passes all extra params to connstr.
"""
result: List[List[Any]] = []
with closing(self.connect(**kwargs)) as conn:
with conn.cursor() as cur:
for query in queries:
print(f"Executing query: {query}")
cur.execute(query)
if cur.description is None:
result.append([]) # query didn't return data
else:
result.append(cast(List[Any], cur.fetchall()))
return result
class VanillaPostgres(PgProtocol):
def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
super().__init__(host='localhost', port=port, dbname='postgres')
self.pgdatadir = pgdatadir
self.pg_bin = pg_bin
self.running = False
if init:
self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)])
self.configure([f"port = {port}\n"])
def configure(self, options: List[str]):
"""Append lines into postgresql.conf file."""
assert not self.running
with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file:
conf_file.write("\n".join(options))
def start(self, log_path: Optional[str] = None):
assert not self.running
self.running = True
if log_path is None:
log_path = os.path.join(self.pgdatadir, "pg.log")
self.pg_bin.run_capture(
['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start'])
def stop(self):
assert self.running
self.running = False
self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop'])
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
if self.running:
self.stop()
class NeonPageserverApiException(Exception):
pass
class NeonPageserverHttpClient(requests.Session):
def __init__(self, host, port):
super().__init__()
self.host = host
self.port = port
def verbose_error(self, res: requests.Response):
try:
res.raise_for_status()
except requests.RequestException as e:
try:
msg = res.json()['msg']
except:
msg = ''
raise NeonPageserverApiException(msg) from e
def check_status(self):
self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status()
def tenant_list(self):
res = self.get(f"http://{self.host}:{self.port}/v1/tenant")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
res = self.post(
f"http://{self.host}:{self.port}/v1/tenant",
json={
'new_tenant_id': new_tenant_id.hex,
},
)
if res.status_code == 409:
if ok_if_exists:
print(f'could not create tenant: already exists for id {new_tenant_id}')
else:
res.raise_for_status()
elif res.status_code == 201:
print(f'created tenant {new_tenant_id}')
else:
self.verbose_error(res)
return new_tenant_id
def timeline_list(self, tenant_id: uuid.UUID):
res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1"
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def lsn_to_hex(num: int) -> str:
""" Convert lsn from int to standard hex notation. """
return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
def lsn_from_hex(lsn_hex: str) -> int:
""" Convert lsn from hex notation to int. """
l, r = lsn_hex.split('/')
return (int(l, 16) << 32) + int(r, 16)
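A quick worked example for the two LSN helpers above (illustrative only, not part of the deleted script); the hex values are arbitrary:

    # lsn_from_hex / lsn_to_hex split a 64-bit LSN into the usual "high/low" hex form.
    assert lsn_from_hex("1/0") == 1 << 32                          # 4294967296
    assert lsn_from_hex("0/2000000") == 0x2000000                  # 33554432
    assert lsn_to_hex((0x16 << 32) + 0xB374D848) == "16/B374D848"
    assert lsn_to_hex(lsn_from_hex("16/B374D848")) == "16/B374D848"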
def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient,
tenant: uuid.UUID,
timeline: uuid.UUID) -> int:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
if detail['remote'] is None:
# No remote information at all. This happens right after creating
# a timeline, before any part of it has been uploaded to remote
# storage yet.
return 0
else:
lsn_str = detail['remote']['remote_consistent_lsn']
assert isinstance(lsn_str, str)
return lsn_from_hex(lsn_str)
def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient,
tenant: uuid.UUID,
timeline: uuid.UUID,
lsn: int):
"""waits for local timeline upload up to specified lsn"""
for i in range(10):
current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
if current_lsn >= lsn:
return
print("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1))
time.sleep(1)
raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
##############
# End of utils
##############
def pack_base(log_dir, restored_dir, output_tar):
"""Create tar file from basebackup, being careful to produce relative filenames."""
tmp_tar_name = "tmp.tar"
tmp_tar_path = os.path.join(restored_dir, tmp_tar_name)
cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir)
# We actually cd into the dir and call tar from there. If we call tar from
# outside we won't encode filenames as relative, and they won't parse well
# on import.
subprocess_capture(log_dir, cmd, cwd=restored_dir)
shutil.move(tmp_tar_path, output_tar)
def reconstruct_paths(log_dir, pg_bin, base_tar):
"""Reconstruct what relation files should exist in the datadir by querying postgres."""
with tempfile.TemporaryDirectory() as restored_dir:
# Unpack the base tar
subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir])
# Start a vanilla postgres from the given datadir and query it to find
# what relfiles should exist, but possibly don't.
port = "55439" # Probably free
with VanillaPostgres(restored_dir, pg_bin, port, init=False) as vanilla_pg:
vanilla_pg.configure([f"port={port}"])
vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log"))
# Create database based on template0 because we can't connect to template0
query = "create database template0copy template template0"
vanilla_pg.safe_psql(query, user="cloud_admin")
vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin")
# Get all databases
query = "select oid, datname from pg_database"
oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin")
template0_oid = [
oid for (oid, database) in oid_dbname_pairs if database == "template0"
][0]
# Get rel paths for each database
for oid, database in oid_dbname_pairs:
if database == "template0":
# We can't connect to template0
continue
query = "select relname, pg_relation_filepath(oid) from pg_class"
result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database)
for relname, filepath in result:
if filepath is not None:
if database == "template0copy":
# Add all template0copy paths to template0
prefix = f"base/{oid}/"
if filepath.startswith(prefix):
suffix = filepath[len(prefix):]
yield f"base/{template0_oid}/{suffix}"
elif filepath.startswith("global"):
print(f"skipping {database} global file {filepath}")
else:
raise AssertionError
else:
yield filepath
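To make the template0copy-to-template0 path rewrite above concrete, a small worked example (the oids are made-up placeholders):

    # Suppose template0copy was created with oid 16390 and template0 has oid 13679.
    # A relation file reported as base/16390/16388 must be recreated as base/13679/16388.
    template0_oid, oid, filepath = 13679, 16390, "base/16390/16388"
    prefix = f"base/{oid}/"
    assert filepath.startswith(prefix)
    suffix = filepath[len(prefix):]                                # "16388"
    assert f"base/{template0_oid}/{suffix}" == "base/13679/16388"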
def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths):
"""Add the appropriate empty files to a basebadkup tar."""
with tempfile.TemporaryDirectory() as restored_dir:
# Unpack the base tar
subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir])
# Touch files that don't exist
for path in paths:
absolute_path = os.path.join(restored_dir, path)
exists = os.path.exists(absolute_path)
if not exists:
print(f"File {absolute_path} didn't exist. Creating..")
Path(absolute_path).touch()
# Repackage
pack_base(log_dir, restored_dir, output_tar)
# HACK This is a workaround for exporting from old pageservers that
# can't export empty relations. In this case we need to start
# a vanilla postgres from the exported datadir, and query it
# to see what empty relations are missing, and then create
# those empty files before importing.
def add_missing_rels(base_tar, output_tar, log_dir, pg_bin):
reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar))
touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths)
def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
conn = psycopg2.connect(pageserver_connstr)
conn.autocommit = True
with conn.cursor() as cur:
cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
cur.execute(cmd)
res = cur.fetchone()
prev_lsn = res[0]
last_lsn = res[1]
conn.close()
return last_lsn, prev_lsn
def import_timeline(args,
psql_path,
pageserver_connstr,
pageserver_http,
tenant_id,
timeline_id,
last_lsn,
prev_lsn,
tar_filename):
# Import timelines to new pageserver
import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}"
full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """
stderr_filename2 = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr")
stdout_filename = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout")
print(f"Running: {full_cmd}")
with open(stdout_filename, 'w') as stdout_f:
with open(stderr_filename2, 'w') as stderr_f:
print(f"(capturing output to {stdout_filename})")
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir)
subprocess.run(full_cmd,
stdout=stdout_f,
stderr=stderr_f,
env=pg_bin._build_env(None),
shell=True,
check=True)
print(f"Done import")
# Wait until pageserver persists the files
wait_for_upload(pageserver_http,
uuid.UUID(tenant_id),
uuid.UUID(timeline_id),
lsn_from_hex(last_lsn))
def export_timeline(args,
psql_path,
pageserver_connstr,
tenant_id,
timeline_id,
last_lsn,
prev_lsn,
tar_filename):
# Choose filenames
incomplete_filename = tar_filename + ".incomplete"
stderr_filename = path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr")
# Construct export command
query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}"
cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query]
# Run export command
print(f"Running: {cmd}")
with open(incomplete_filename, 'w') as stdout_f:
with open(stderr_filename, 'w') as stderr_f:
print(f"(capturing output to {incomplete_filename})")
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir)
subprocess.run(cmd,
stdout=stdout_f,
stderr=stderr_f,
env=pg_bin._build_env(None),
check=True)
# Add missing rels
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir)
add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin)
# Log more info
file_size = os.path.getsize(tar_filename)
print(f"Done export: {tar_filename}, size {file_size}")
def main(args: argparse.Namespace):
psql_path = str(Path(args.pg_distrib_dir) / "bin" / "psql")
old_pageserver_host = args.old_pageserver_host
new_pageserver_host = args.new_pageserver_host
old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port)
old_http_client.check_status()
old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}"
new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port)
new_http_client.check_status()
new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}"
for tenant_id in args.tenants:
print(f"Tenant: {tenant_id}")
timelines = old_http_client.timeline_list(uuid.UUID(tenant_id))
print(f"Timelines: {timelines}")
# Create tenant in new pageserver
if args.only_import is False and not args.timelines:
new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists)
for timeline in timelines:
# Skip timelines we don't need to export
if args.timelines and timeline['timeline_id'] not in args.timelines:
print(f"Skipping timeline {timeline['timeline_id']}")
continue
# Choose filenames
tar_filename = path.join(args.work_dir,
f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar")
# Export timeline from old pageserver
if args.only_import is False:
last_lsn, prev_lsn = get_rlsn(
old_pageserver_connstr,
timeline['tenant_id'],
timeline['timeline_id'],
)
export_timeline(
args,
psql_path,
old_pageserver_connstr,
timeline['tenant_id'],
timeline['timeline_id'],
last_lsn,
prev_lsn,
tar_filename,
)
# Import into new pageserver
import_timeline(
args,
psql_path,
new_pageserver_connstr,
new_http_client,
timeline['tenant_id'],
timeline['timeline_id'],
last_lsn,
prev_lsn,
tar_filename,
)
# Re-export and compare
re_export_filename = tar_filename + ".reexport"
export_timeline(args,
psql_path,
new_pageserver_connstr,
timeline['tenant_id'],
timeline['timeline_id'],
last_lsn,
prev_lsn,
re_export_filename)
# Check the size is the same
old_size = os.path.getsize(tar_filename),
new_size = os.path.getsize(re_export_filename),
if old_size != new_size:
raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--tenant-id',
dest='tenants',
required=True,
nargs='+',
help='Id of the tenant to migrate. You can pass multiple arguments',
)
parser.add_argument(
'--timeline-id',
dest='timelines',
required=False,
nargs='+',
help='Id of the timeline to migrate. You can pass multiple arguments',
)
parser.add_argument(
'--from-host',
dest='old_pageserver_host',
required=True,
help='Host of the pageserver to migrate data from',
)
parser.add_argument(
'--from-http-port',
dest='old_pageserver_http_port',
required=False,
type=int,
default=9898,
help='HTTP port of the pageserver to migrate data from. Default: 9898',
)
parser.add_argument(
'--from-pg-port',
dest='old_pageserver_pg_port',
required=False,
type=int,
default=6400,
help='pg port of the pageserver to migrate data from. Default: 6400',
)
parser.add_argument(
'--to-host',
dest='new_pageserver_host',
required=True,
help='Host of the pageserver to migrate data to',
)
parser.add_argument(
'--to-http-port',
dest='new_pageserver_http_port',
required=False,
default=9898,
type=int,
help='HTTP port of the pageserver to migrate data to. Default: 9898',
)
parser.add_argument(
'--to-pg-port',
dest='new_pageserver_pg_port',
required=False,
default=6400,
type=int,
help='pg port of the pageserver to migrate data to. Default: 6400',
)
parser.add_argument(
'--ignore-tenant-exists',
dest='ok_if_exists',
required=False,
help=
'Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.',
)
parser.add_argument(
'--pg-distrib-dir',
dest='pg_distrib_dir',
required=False,
default='/usr/local/',
help='Path where postgres binaries are installed. Default: /usr/local/',
)
parser.add_argument(
'--psql-path',
dest='psql_path',
required=False,
default='/usr/local/bin/psql',
help='Path to the psql binary. Default: /usr/local/bin/psql',
)
parser.add_argument(
'--only-import',
dest='only_import',
required=False,
default=False,
action='store_true',
help='Skip export and tenant creation part',
)
parser.add_argument(
'--work-dir',
dest='work_dir',
required=True,
default=False,
help='directory where temporary tar files are stored',
)
args = parser.parse_args()
main(args)
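For reference, a hypothetical invocation of the script above, built from the argparse flags it defines; host names, the tenant id, and the work directory are placeholders (test_tenant_relocation later in this view drives it the same way via subprocess_capture):

    cmd = [
        "python", "scripts/export_import_between_pageservers.py",
        "--tenant-id", "0123456789abcdef0123456789abcdef",
        "--from-host", "old-pageserver.example", "--from-http-port", "9898", "--from-pg-port", "6400",
        "--to-host", "new-pageserver.example", "--to-http-port", "9898", "--to-pg-port", "6400",
        "--pg-distrib-dir", "/usr/local/",
        "--work-dir", "/tmp/pageserver-migration",
    ]
    # subprocess_capture(log_dir, cmd, check=True)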

View File

@@ -1,5 +1,6 @@
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
from fixtures.utils import query_scalar

View File

@@ -167,5 +167,3 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
# The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC.
with pytest.raises(Exception, match="invalid branch start lsn"):
env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn)
thread.join()

View File

@@ -1,11 +0,0 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient
import pytest
def test_fsm_truncate(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_fsm_truncate")
pg = env.postgres.create_start('test_fsm_truncate')
pg.safe_psql(
'CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;')

View File

@@ -1,10 +1,9 @@
import re
import pytest
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, Postgres, wait_for_upload, wait_for_last_record_lsn
from fixtures.utils import lsn_from_hex
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_upload, wait_for_last_record_lsn
from fixtures.utils import lsn_from_hex, lsn_to_hex
from uuid import UUID, uuid4
import os
import tarfile
import os
import shutil
from pathlib import Path
import json
@@ -106,63 +105,20 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
@pytest.mark.timeout(600)
def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
neon_env_builder.enable_local_fs_remote_storage()
env = neon_env_builder.init_start()
timeline = env.neon_cli.create_branch('test_import_from_pageserver_small')
pg = env.postgres.create_start('test_import_from_pageserver_small')
def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
num_rows = 3000
lsn = _generate_data(num_rows, pg)
_import(num_rows, lsn, env, pg_bin, timeline)
@pytest.mark.timeout(1800)
# TODO: temporarily disable `test_import_from_pageserver_multisegment` test, enable
# the test back after finding the failure cause.
# @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build")
@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255")
def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
neon_env_builder.enable_local_fs_remote_storage()
env = neon_env_builder.init_start()
timeline = env.neon_cli.create_branch('test_import_from_pageserver_multisegment')
pg = env.postgres.create_start('test_import_from_pageserver_multisegment')
env.neon_cli.create_branch('test_import_from_pageserver')
pgmain = env.postgres.create_start('test_import_from_pageserver')
log.info("postgres is running on 'test_import_from_pageserver' branch")
# For `test_import_from_pageserver_multisegment`, we want to make sure that the data
# is large enough to create multi-segment files. Typically, a segment file's size is
# at most 1GB. A large number of inserted rows (`30000000`) is used to increase the
# DB size to above 1GB. Related: https://github.com/neondatabase/neon/issues/2097.
num_rows = 30000000
lsn = _generate_data(num_rows, pg)
timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0]
logical_size = env.pageserver.http_client().timeline_detail(
env.initial_tenant, timeline)['local']['current_logical_size']
log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
assert logical_size > 1024**3 # = 1GB
tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline)
# Check if the backup data contains multiple segment files
cnt_seg_files = 0
segfile_re = re.compile('[0-9]+\\.[0-9]+')
with tarfile.open(tar_output_file, "r") as tar_f:
for f in tar_f.getnames():
if segfile_re.search(f) is not None:
cnt_seg_files += 1
log.info(f"Found a segment file: {f} in the backup archive file")
assert cnt_seg_files > 0
def _generate_data(num_rows: int, pg: Postgres) -> str:
"""Generate a table with `num_rows` rows.
Returns:
the latest insert WAL's LSN"""
with closing(pg.connect()) as conn:
with closing(pgmain.connect()) as conn:
with conn.cursor() as cur:
# data loading may take a while, so increase statement timeout
cur.execute("SET statement_timeout='300s'")
@@ -171,28 +127,15 @@ def _generate_data(num_rows: int, pg: Postgres) -> str:
cur.execute("CHECKPOINT")
cur.execute('SELECT pg_current_wal_insert_lsn()')
res = cur.fetchone()
assert res is not None and isinstance(res[0], str)
return res[0]
def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timeline: UUID) -> str:
"""Test importing backup data to the pageserver.
Args:
expected_num_rows: the expected number of rows of the test table in the backup data
lsn: the backup's base LSN
Returns:
path to the backup archive file"""
log.info(f"start_backup_lsn = {lsn}")
lsn = cur.fetchone()[0]
log.info(f"start_backup_lsn = {lsn}")
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')}
# Get a fullbackup from pageserver
query = f"fullbackup { env.initial_tenant.hex} {timeline.hex} {lsn}"
query = f"fullbackup { env.initial_tenant.hex} {timeline} {lsn}"
cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
result_basepath = pg_bin.run_capture(cmd, env=psql_env)
tar_output_file = result_basepath + ".stdout"
@@ -209,7 +152,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel
env.pageserver.start()
# Import using another tenantid, because we use the same pageserver.
# TODO Create another pageserver to make test more realistic.
# TODO Create another pageserver to maeke test more realistic.
tenant = uuid4()
# Import to pageserver
@@ -222,7 +165,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel
"--tenant-id",
tenant.hex,
"--timeline-id",
timeline.hex,
timeline,
"--node-name",
node_name,
"--base-lsn",
@@ -232,15 +175,15 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel
])
# Wait for data to land in s3
wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(lsn))
wait_for_upload(client, tenant, timeline, lsn_from_hex(lsn))
wait_for_last_record_lsn(client, tenant, UUID(timeline), lsn_from_hex(lsn))
wait_for_upload(client, tenant, UUID(timeline), lsn_from_hex(lsn))
# Check it worked
pg = env.postgres.create_start(node_name, tenant_id=tenant)
assert pg.safe_psql('select count(*) from tbl') == [(expected_num_rows, )]
assert pg.safe_psql('select count(*) from tbl') == [(num_rows, )]
# Take another fullbackup
query = f"fullbackup { tenant.hex} {timeline.hex} {lsn}"
query = f"fullbackup { tenant.hex} {timeline} {lsn}"
cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
result_basepath = pg_bin.run_capture(cmd, env=psql_env)
new_tar_output_file = result_basepath + ".stdout"
@@ -252,6 +195,4 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel
# Check that gc works
psconn = env.pageserver.connect()
pscur = psconn.cursor()
pscur.execute(f"do_gc {tenant.hex} {timeline.hex} 0")
return tar_output_file
pscur.execute(f"do_gc {tenant.hex} {timeline} 0")

View File

@@ -60,38 +60,17 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID):
def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv):
env = neon_simple_env
with env.pageserver.http_client() as client:
tenant_id, timeline_id = env.neon_cli.create_tenant()
client = env.pageserver.http_client()
timeline_details = client.timeline_detail(tenant_id=tenant_id,
timeline_id=timeline_id,
include_non_incremental_logical_size=True)
tenant_id, timeline_id = env.neon_cli.create_tenant()
assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
timeline_details = client.timeline_detail(tenant_id=tenant_id,
timeline_id=timeline_id,
include_non_incremental_logical_size=True)
def expect_updated_msg_lsn(client: NeonPageserverHttpClient,
tenant_id: UUID,
timeline_id: UUID,
prev_msg_lsn: Optional[int]) -> int:
timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id)
# a successful `timeline_details` response must contain the below fields
local_timeline_details = timeline_details['local']
assert "wal_source_connstr" in local_timeline_details.keys()
assert "last_received_msg_lsn" in local_timeline_details.keys()
assert "last_received_msg_ts" in local_timeline_details.keys()
assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty"
last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"])
assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \
f"the last received message's LSN {last_msg_lsn} hasn't been updated \
compared to the previous message's LSN {prev_msg_lsn}"
return last_msg_lsn
assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
# Test the WAL-receiver related fields in the response to `timeline_details` API call
@@ -100,29 +79,44 @@ def expect_updated_msg_lsn(client: NeonPageserverHttpClient,
# `timeline_details` now.
def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
env = neon_simple_env
with env.pageserver.http_client() as client:
tenant_id, timeline_id = env.neon_cli.create_tenant()
pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
client = env.pageserver.http_client()
# Wait to make sure that we get a latest WAL receiver data.
# We need to wait here because it's possible that we don't have access to
# the latest WAL yet, when the `timeline_detail` API is first called.
# See: https://github.com/neondatabase/neon/issues/1768.
lsn = wait_until(number_of_iterations=5,
interval=1,
func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None))
tenant_id, timeline_id = env.neon_cli.create_tenant()
pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
# Make a DB modification then expect getting a new WAL receiver's data.
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
wait_until(number_of_iterations=5,
interval=1,
func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn))
def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int:
timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id)
# a successful `timeline_details` response must contain the below fields
local_timeline_details = timeline_details['local']
assert "wal_source_connstr" in local_timeline_details.keys()
assert "last_received_msg_lsn" in local_timeline_details.keys()
assert "last_received_msg_ts" in local_timeline_details.keys()
assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty"
last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"])
assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \
f"the last received message's LSN {last_msg_lsn} hasn't been updated \
compared to the previous message's LSN {prev_msg_lsn}"
return last_msg_lsn
# Wait to make sure that we get a latest WAL receiver data.
# We need to wait here because it's possible that we don't have access to
# the latest WAL yet, when the `timeline_detail` API is first called.
# See: https://github.com/neondatabase/neon/issues/1768.
lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None))
# Make a DB modification then expect getting a new WAL receiver's data.
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn))
def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
env = neon_simple_env
with env.pageserver.http_client() as client:
check_client(client, env.initial_tenant)
client = env.pageserver.http_client()
check_client(client, env.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
@@ -131,5 +125,5 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
management_token = env.auth_keys.generate_management_token()
with env.pageserver.http_client(auth_token=management_token) as client:
check_client(client, env.initial_tenant)
client = env.pageserver.http_client(auth_token=management_token)
check_client(client, env.initial_tenant)

View File

@@ -2,16 +2,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.log_helper import log
# Test that the pageserver fixture is implemented correctly, allowing quick restarts.
# This is a regression test, see https://github.com/neondatabase/neon/issues/2247
def test_fixture_restart(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
for i in range(3):
env.pageserver.stop()
env.pageserver.start()
# Test restarting page server, while safekeeper and compute node keep
# running.
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):

View File

@@ -2,10 +2,11 @@
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
import shutil, os
from contextlib import closing
from pathlib import Path
import time
from uuid import UUID
from fixtures.neon_fixtures import NeonEnvBuilder, RemoteStorageKind, assert_timeline_local, available_remote_storages, wait_until, wait_for_last_record_lsn, wait_for_upload
from fixtures.neon_fixtures import NeonEnvBuilder, assert_timeline_local, wait_until, wait_for_last_record_lsn, wait_for_upload
from fixtures.log_helper import log
from fixtures.utils import lsn_from_hex, query_scalar
import pytest
@@ -28,19 +29,18 @@ import pytest
# * queries the specific data, ensuring that it matches the one stored before
#
# The tests are done for all types of remote storage pageserver supports.
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
def test_remote_storage_backup_and_restore(
neon_env_builder: NeonEnvBuilder,
remote_storatge_kind: RemoteStorageKind,
):
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3'])
def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, storage_type: str):
# Use this test to check more realistic SK ids: some etcd key parsing bugs were related,
# and this test needs SK to write data to pageserver, so it will be visible
neon_env_builder.safekeepers_id_start = 12
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storatge_kind,
test_name='test_remote_storage_backup_and_restore',
)
if storage_type == 'local_fs':
neon_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
data_id = 1
data_secret = 'very secret secret'
@@ -110,7 +110,7 @@ def test_remote_storage_backup_and_restore(
client.tenant_attach(UUID(tenant_id))
log.info("waiting for timeline redownload")
wait_until(number_of_iterations=20,
wait_until(number_of_iterations=10,
interval=1,
func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id)))
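The hunk above swaps a hand-rolled storage_type parametrization for the RemoteStorageKind helpers changed later in this view; a minimal sketch of that pattern with a made-up test name (the parameter keeps the spelling used in the source):

    import pytest
    from fixtures.neon_fixtures import NeonEnvBuilder, RemoteStorageKind, available_remote_storages

    @pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
    def test_uses_remote_storage(neon_env_builder: NeonEnvBuilder,
                                 remote_storatge_kind: RemoteStorageKind):
        neon_env_builder.enable_remote_storage(
            remote_storage_kind=remote_storatge_kind,
            test_name='test_uses_remote_storage',
        )
        env = neon_env_builder.init_start()
        # ... exercise env against local_fs, mock_s3 and (when enabled) real_s3 backends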

View File

@@ -1,19 +1,10 @@
from threading import Thread
from uuid import uuid4
import uuid
import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
def do_gc_target(env: NeonEnv, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
"""Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
try:
env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0')
except Exception as e:
log.error("do_gc failed: %s", e)
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
@@ -45,7 +36,8 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0')
# try to concurrently run gc and detach
gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id))
gc_thread = Thread(
target=lambda: env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0'), )
gc_thread.start()
last_error = None

View File

@@ -229,7 +229,7 @@ def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path
# basebackup and importing it into the new pageserver.
# This kind of migration can tolerate breaking changes
# to storage format
'major',
pytest.param('major', marks=pytest.mark.xfail(reason="Not implemented")),
])
@pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
@@ -345,8 +345,6 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
# Migrate either by attaching from s3 or import/export basebackup
if method == "major":
cmd = [
"poetry",
"run",
"python",
os.path.join(base_dir, "scripts/export_import_between_pageservers.py"),
"--tenant-id",
@@ -363,12 +361,12 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
str(new_pageserver_http_port),
"--to-pg-port",
str(new_pageserver_pg_port),
"--pg-distrib-dir",
pg_distrib_dir,
"--psql-path",
os.path.join(pg_distrib_dir, "bin", "psql"),
"--work-dir",
os.path.join(test_output_dir),
]
subprocess_capture(test_output_dir, cmd, check=True)
subprocess_capture(str(env.repo_dir), cmd, check=True)
elif method == "minor":
# call to attach timeline to new pageserver
new_pageserver_http.tenant_attach(tenant_id)
@@ -429,22 +427,6 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
post_migration_check(pg_main, 500500, old_local_path_main)
post_migration_check(pg_second, 1001000, old_local_path_second)
# ensure that we can successfully read all relations on the new pageserver
with pg_cur(pg_second) as cur:
cur.execute('''
DO $$
DECLARE
r RECORD;
BEGIN
FOR r IN
SELECT relname FROM pg_class WHERE relkind='r'
LOOP
RAISE NOTICE '%', r.relname;
EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
END LOOP;
END$$;
''')
if with_load == 'with_load':
assert load_ok_event.wait(3)
log.info('stopping load thread')

View File

@@ -13,7 +13,7 @@ from uuid import UUID
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload
from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, wait_for_last_record_lsn, wait_for_upload
from fixtures.utils import lsn_from_hex
@@ -38,7 +38,7 @@ async def tenant_workload(env: NeonEnv, pg: Postgres):
async def all_tenants_workload(env: NeonEnv, tenants_pgs):
workers = []
for _, pg in tenants_pgs:
for tenant, pg in tenants_pgs:
worker = tenant_workload(env, pg)
workers.append(asyncio.create_task(worker))
@@ -46,18 +46,23 @@ async def all_tenants_workload(env: NeonEnv, tenants_pgs):
await asyncio.gather(*workers)
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storatge_kind,
test_name='test_tenants_many',
)
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3'])
def test_tenants_many(neon_env_builder: NeonEnvBuilder, storage_type: str):
if storage_type == 'local_fs':
neon_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
neon_env_builder.enable_local_fs_remote_storage()
env = neon_env_builder.init_start()
tenants_pgs: List[Tuple[UUID, Postgres]] = []
for _ in range(1, 5):
for i in range(1, 5):
# Use a tiny checkpoint distance, to create a lot of layers quickly
tenant, _ = env.neon_cli.create_tenant(
conf={

View File

@@ -4,7 +4,7 @@ from uuid import UUID
import re
import psycopg2.extras
import psycopg2.errors
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local, wait_for_last_flush_lsn
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local
from fixtures.log_helper import log
import time
@@ -192,8 +192,6 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
FROM generate_series(1, 1000) g""",
])
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
# restart the pageserer to force calculating timeline's initial physical size
env.pageserver.stop()
env.pageserver.start()
@@ -213,9 +211,7 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
FROM generate_series(1, 1000) g""",
])
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
assert_physical_size(env, env.initial_tenant, new_timeline_id)
@@ -236,10 +232,8 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
FROM generate_series(1, 100000) g""",
])
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}")
assert_physical_size(env, env.initial_tenant, new_timeline_id)
@@ -260,21 +254,15 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g""",
])
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
pg.safe_psql("""
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g
""")
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0")
assert_physical_size(env, env.initial_tenant, new_timeline_id)
@@ -291,7 +279,6 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
FROM generate_series(1, 100000) g""",
])
wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
# get the metrics and parse the metric for the current timeline's physical size
@@ -332,7 +319,6 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g",
])
wait_for_last_flush_lsn(env, pg, tenant, timeline)
env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}")
timeline_total_size += get_timeline_physical_size(timeline)

View File

@@ -12,8 +12,9 @@ import uuid
from contextlib import closing
from dataclasses import dataclass, field
from multiprocessing import Process, Value
from pathlib import Path
from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageKind, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, available_remote_storages, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload
from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload
from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex, query_scalar
from fixtures.log_helper import log
from typing import List, Optional, Any
@@ -284,12 +285,9 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
env.neon_cli.create_branch('test_safekeepers_wal_removal')
pg = env.postgres.create_start('test_safekeepers_wal_removal')
# Note: it is important to insert at least two segments, as currently
# control file is synced roughly once in segment range and WAL is not
# removed until all horizons are persisted.
pg.safe_psql_many([
'CREATE TABLE t(key int primary key, value text)',
"INSERT INTO t SELECT generate_series(1,200000), 'payload'",
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
])
tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
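A rough back-of-the-envelope for the "at least two segments" comment above, assuming the stock 16 MiB WAL segment size (an assumption, not stated in the diff):

    WAL_SEGMENT_SIZE = 16 * 1024 * 1024            # Postgres default segment size
    min_wal_before_removal = 2 * WAL_SEGMENT_SIZE  # control file syncs roughly once per segment,
                                                   # so at least two segments of WAL are needed
                                                   # before any old segment can be removed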
@@ -353,7 +351,7 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end):
if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end):
break
elapsed = time.time() - started_at
if elapsed > 30:
if elapsed > 20:
raise RuntimeError(
f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded")
time.sleep(0.5)
@@ -379,15 +377,15 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size):
time.sleep(0.5)
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs'])
def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storatge_kind,
test_name='test_safekeepers_wal_backup',
)
if storage_type == 'local_fs':
neon_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
neon_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
env = neon_env_builder.init_start()
@@ -427,15 +425,15 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Remo
wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000')
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs'])
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storatge_kind,
test_name='test_s3_wal_replay',
)
if storage_type == 'local_fs':
neon_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
neon_env_builder.enable_s3_mock_remote_storage('test_s3_wal_replay')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
env = neon_env_builder.init_start()

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from dataclasses import field
from contextlib import contextmanager
from enum import Flag, auto
import enum
import textwrap
from cached_property import cached_property
import abc
@@ -222,7 +221,7 @@ def can_bind(host: str, port: int) -> bool:
# moment. If that changes, we should use start using SO_REUSEADDR here
# too, to allow reusing ports more quickly.
# See https://github.com/neondatabase/neon/issues/801
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
#sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
try:
sock.bind((host, port))
@@ -231,8 +230,6 @@ def can_bind(host: str, port: int) -> bool:
except socket.error:
log.info(f"Port {port} is in use, skipping")
return False
finally:
sock.close()
class PortDistributor:
@@ -265,11 +262,6 @@ def default_broker(request: Any, port_distributor: PortDistributor):
broker.stop()
@pytest.fixture(scope='session')
def run_id():
yield uuid.uuid4()
@pytest.fixture(scope='session')
def mock_s3_server(port_distributor: PortDistributor):
mock_s3_server = MockS3Server(port_distributor.get_port())
@@ -299,9 +291,7 @@ class PgProtocol:
# change it by calling "SET statement_timeout" after
# connecting.
options = result.get('options', '')
if "statement_timeout" not in options:
options = f'-cstatement_timeout=120s {options}'
result['options'] = options
result['options'] = f'-cstatement_timeout=120s {options}'
return result
# autocommit=True here by default because that's what we need most of the time
@@ -448,46 +438,26 @@ class MockS3Server:
def secret_key(self) -> str:
return 'test'
def access_env_vars(self) -> Dict[Any, Any]:
return {
'AWS_ACCESS_KEY_ID': self.access_key(),
'AWS_SECRET_ACCESS_KEY': self.secret_key(),
}
def kill(self):
self.subprocess.kill()
@enum.unique
class RemoteStorageKind(enum.Enum):
LOCAL_FS = "local_fs"
MOCK_S3 = "mock_s3"
REAL_S3 = "real_s3"
def available_remote_storages() -> List[RemoteStorageKind]:
remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
remote_storages.append(RemoteStorageKind.REAL_S3)
log.info("Enabling real s3 storage for tests")
else:
log.info("Using mock implementations to test remote storage")
return remote_storages
@dataclass
class LocalFsStorage:
root: Path
local_path: Path
@dataclass
class S3Storage:
bucket_name: str
bucket_region: str
access_key: str
secret_key: str
endpoint: Optional[str] = None
prefix_in_bucket: Optional[str] = None
def access_env_vars(self) -> Dict[str, str]:
return {
'AWS_ACCESS_KEY_ID': self.access_key,
'AWS_SECRET_ACCESS_KEY': self.secret_key,
}
endpoint: Optional[str]
RemoteStorage = Union[LocalFsStorage, S3Storage]
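Both credential helpers above return plain environment-variable maps; a sketch of how such a map is typically consumed, by merging it into the environment of a spawned process (the child command and the s3_storage object are assumptions, not taken from the diff):

    import os
    import subprocess

    env = os.environ.copy()
    env.update(s3_storage.access_env_vars())  # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    subprocess.run(["aws", "s3", "ls"], env=env, check=True)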
@@ -496,20 +466,16 @@ RemoteStorage = Union[LocalFsStorage, S3Storage]
# serialize as toml inline table
def remote_storage_to_toml_inline_table(remote_storage):
if isinstance(remote_storage, LocalFsStorage):
remote_storage_config = f"local_path='{remote_storage.root}'"
res = f"local_path='{remote_storage.local_path}'"
elif isinstance(remote_storage, S3Storage):
remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\
bucket_region='{remote_storage.bucket_region}'"
if remote_storage.prefix_in_bucket is not None:
remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'"
res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'"
if remote_storage.endpoint is not None:
remote_storage_config += f",endpoint='{remote_storage.endpoint}'"
res += f", endpoint='{remote_storage.endpoint}'"
else:
raise Exception(f'Unknown storage configuration {remote_storage}')
else:
raise Exception("invalid remote storage type")
return f"{{{remote_storage_config}}}"
return f"{{{res}}}"
class RemoteStorageUsers(Flag):
@@ -527,31 +493,28 @@ class NeonEnvBuilder:
cleaned up after the test has finished.
"""
def __init__(
self,
repo_dir: Path,
port_distributor: PortDistributor,
broker: Etcd,
run_id: uuid.UUID,
mock_s3_server: MockS3Server,
remote_storage: Optional[RemoteStorage] = None,
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
pageserver_config_override: Optional[str] = None,
num_safekeepers: int = 1,
# Use non-standard SK ids to check for various parsing bugs
safekeepers_id_start: int = 0,
# fsync is disabled by default to make the tests go faster
safekeepers_enable_fsync: bool = False,
auth_enabled: bool = False,
rust_log_override: Optional[str] = None,
default_branch_name=DEFAULT_BRANCH_NAME,
):
self,
repo_dir: Path,
port_distributor: PortDistributor,
broker: Etcd,
mock_s3_server: MockS3Server,
remote_storage: Optional[RemoteStorage] = None,
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
pageserver_config_override: Optional[str] = None,
num_safekeepers: int = 1,
# Use non-standard SK ids to check for various parsing bugs
safekeepers_id_start: int = 0,
# fsync is disabled by default to make the tests go faster
safekeepers_enable_fsync: bool = False,
auth_enabled: bool = False,
rust_log_override: Optional[str] = None,
default_branch_name=DEFAULT_BRANCH_NAME):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
self.port_distributor = port_distributor
self.remote_storage = remote_storage
self.remote_storage_users = remote_storage_users
self.broker = broker
self.run_id = run_id
self.mock_s3_server = mock_s3_server
self.pageserver_config_override = pageserver_config_override
self.num_safekeepers = num_safekeepers
@@ -560,8 +523,6 @@ class NeonEnvBuilder:
self.auth_enabled = auth_enabled
self.default_branch_name = default_branch_name
self.env: Optional[NeonEnv] = None
self.remote_storage_prefix: Optional[str] = None
self.keep_remote_storage_contents: bool = True
def init(self) -> NeonEnv:
# Cannot create more than one environment from one builder
@@ -577,143 +538,41 @@ class NeonEnvBuilder:
self.start()
return env
def enable_remote_storage(
self,
remote_storage_kind: RemoteStorageKind,
test_name: str,
force_enable: bool = True,
):
if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
self.enable_local_fs_remote_storage(force_enable=force_enable)
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
else:
raise RuntimeError(f'Unknown storage type: {remote_storage_kind}')
"""
Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
Errors if the pageserver already has a remote storage configuration, unless `force_enable` is set to `True`.
"""
def enable_local_fs_remote_storage(self, force_enable=True):
"""
Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
Errors if the pageserver already has a remote storage configuration, unless `force_enable` is set to `True`.
"""
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
self.remote_storage = LocalFsStorage(Path(self.repo_dir / 'local_fs_remote_storage'))
def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable=True):
"""
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
Starts up the mock server, if that does not run yet.
Errors if the pageserver already has a remote storage configuration, unless `force_enable` is set to `True`.
"""
"""
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
Starts up the mock server, if that does not run yet.
Errors if the pageserver already has a remote storage configuration, unless `force_enable` is set to `True`.
"""
def enable_s3_mock_remote_storage(self, bucket_name: str, force_enable=True):
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
mock_endpoint = self.mock_s3_server.endpoint()
mock_region = self.mock_s3_server.region()
self.remote_storage_client = boto3.client(
boto3.client(
's3',
endpoint_url=mock_endpoint,
region_name=mock_region,
aws_access_key_id=self.mock_s3_server.access_key(),
aws_secret_access_key=self.mock_s3_server.secret_key(),
)
self.remote_storage_client.create_bucket(Bucket=bucket_name)
self.remote_storage = S3Storage(
bucket_name=bucket_name,
endpoint=mock_endpoint,
bucket_region=mock_region,
access_key=self.mock_s3_server.access_key(),
secret_key=self.mock_s3_server.secret_key(),
)
def enable_real_s3_remote_storage(self, test_name: str, force_enable=True):
"""
Sets up the configuration to use a real S3 endpoint, without the mock server
"""
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
access_key = os.getenv("AWS_ACCESS_KEY_ID")
assert access_key, "no aws access key provided"
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
assert secret_key, "no aws secret access key provided"
# a session token is needed for local runs with SSO auth
session_token = os.getenv("AWS_SESSION_TOKEN")
bucket_name = os.getenv("REMOTE_STORAGE_S3_BUCKET")
assert bucket_name, "no remote storage bucket name provided"
region = os.getenv("REMOTE_STORAGE_S3_REGION")
assert region, "no remote storage region provided"
# do not leave data in real s3
self.keep_remote_storage_contents = False
# construct a prefix inside bucket for the particular test case and test run
self.remote_storage_prefix = f'{self.run_id}/{test_name}'
self.remote_storage_client = boto3.client(
's3',
region_name=region,
aws_access_key_id=access_key,
aws_secret_access_key=secret_key,
aws_session_token=session_token,
)
).create_bucket(Bucket=bucket_name)
self.remote_storage = S3Storage(bucket_name=bucket_name,
bucket_region=region,
access_key=access_key,
secret_key=secret_key,
prefix_in_bucket=self.remote_storage_prefix)
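For anyone trying to run the real-S3 path locally: the asserts above spell out the environment this helper expects. A hedged sketch (the variable names come straight from those asserts; the values are placeholders):

    import os

    os.environ.setdefault('AWS_ACCESS_KEY_ID', '<access key>')
    os.environ.setdefault('AWS_SECRET_ACCESS_KEY', '<secret key>')
    os.environ.setdefault('AWS_SESSION_TOKEN', '<session token>')  # only needed for SSO-based local runs
    os.environ.setdefault('REMOTE_STORAGE_S3_BUCKET', '<bucket name>')
    os.environ.setdefault('REMOTE_STORAGE_S3_REGION', '<region>')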
def cleanup_remote_storage(self):
# here we check for true remote storage, not the local one
# local cleanup is not needed after the test because in CI the whole environment is destroyed anyway
if self.remote_storage_prefix is None:
log.info("no remote storage was set up, skipping cleanup")
return
if self.keep_remote_storage_contents:
log.info("keep_remote_storage_contents skipping remote storage cleanup")
return
log.info("removing data from test s3 bucket %s by prefix %s",
self.remote_storage.bucket_name,
self.remote_storage_prefix)
paginator = self.remote_storage_client.get_paginator('list_objects_v2')
pages = paginator.paginate(
Bucket=self.remote_storage.bucket_name,
Prefix=self.remote_storage_prefix,
)
objects_to_delete = {'Objects': []}
cnt = 0
for item in pages.search('Contents'):
# weirdly when nothing is found it returns [None]
if item is None:
break
objects_to_delete['Objects'].append({'Key': item['Key']})
# flush once the AWS limit of 1000 keys per DeleteObjects request is reached
if len(objects_to_delete['Objects']) >= 1000:
self.remote_storage_client.delete_objects(
Bucket=self.remote_storage.bucket_name,
Delete=objects_to_delete,
)
objects_to_delete = dict(Objects=[])
cnt += 1
# flush rest
if len(objects_to_delete['Objects']):
self.remote_storage_client.delete_objects(Bucket=self.remote_storage.bucket_name,
Delete=objects_to_delete)
log.info("deleted %s objects from remote storage", cnt)
endpoint=mock_endpoint,
bucket_region=mock_region)
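The batching in `cleanup_remote_storage` exists because S3's DeleteObjects call accepts at most 1000 keys per request. A condensed, self-contained sketch of the same idea (the `s3_client` argument is assumed to be a boto3 S3 client):

    def delete_in_batches(s3_client, bucket: str, keys) -> None:
        # DeleteObjects accepts at most 1000 keys per call, so chunk the input
        keys = list(keys)
        for i in range(0, len(keys), 1000):
            s3_client.delete_objects(
                Bucket=bucket,
                Delete={'Objects': [{'Key': key} for key in keys[i:i + 1000]]},
            )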
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
# Stop all the nodes.
if self.env:
log.info('Cleaning up all storage and compute nodes')
@@ -722,8 +581,6 @@ class NeonEnvBuilder:
sk.stop(immediate=True)
self.env.pageserver.stop(immediate=True)
self.cleanup_remote_storage()
class NeonEnv:
"""
@@ -856,13 +713,10 @@ class NeonEnv:
@pytest.fixture(scope=shareable_scope)
def _shared_simple_env(
request: Any,
port_distributor: PortDistributor,
mock_s3_server: MockS3Server,
default_broker: Etcd,
run_id: uuid.UUID,
) -> Iterator[NeonEnv]:
def _shared_simple_env(request: Any,
port_distributor: PortDistributor,
mock_s3_server: MockS3Server,
default_broker: Etcd) -> Iterator[NeonEnv]:
"""
Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
is set, this is shared by all tests using `neon_simple_env`.
@@ -876,13 +730,8 @@ def _shared_simple_env(
repo_dir = os.path.join(str(top_output_dir), "shared_repo")
shutil.rmtree(repo_dir, ignore_errors=True)
with NeonEnvBuilder(
repo_dir=Path(repo_dir),
port_distributor=port_distributor,
broker=default_broker,
mock_s3_server=mock_s3_server,
run_id=run_id,
) as builder:
with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker,
mock_s3_server) as builder:
env = builder.init_start()
# For convenience in tests, create a branch from the freshly-initialized cluster.
@@ -907,13 +756,10 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
@pytest.fixture(scope='function')
def neon_env_builder(
test_output_dir,
port_distributor: PortDistributor,
mock_s3_server: MockS3Server,
default_broker: Etcd,
run_id: uuid.UUID,
) -> Iterator[NeonEnvBuilder]:
def neon_env_builder(test_output_dir,
port_distributor: PortDistributor,
mock_s3_server: MockS3Server,
default_broker: Etcd) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
@@ -931,13 +777,8 @@ def neon_env_builder(
repo_dir = os.path.join(test_output_dir, "repo")
# Return the builder to the caller
with NeonEnvBuilder(
repo_dir=Path(repo_dir),
port_distributor=port_distributor,
mock_s3_server=mock_s3_server,
broker=default_broker,
run_id=run_id,
) as builder:
with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker,
mock_s3_server) as builder:
yield builder
@@ -1342,10 +1183,7 @@ class NeonCli(AbstractNeonCli):
remote_storage_users=self.env.remote_storage_users,
pageserver_config_override=self.env.pageserver.config_override)
s3_env_vars = None
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
s3_env_vars = self.env.remote_storage.access_env_vars()
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]':
@@ -1357,10 +1195,7 @@ class NeonCli(AbstractNeonCli):
return self.raw_cli(cmd)
def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]':
s3_env_vars = None
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
s3_env_vars = self.env.remote_storage.access_env_vars()
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars)
def safekeeper_stop(self,
@@ -1488,17 +1323,6 @@ class NeonPageserver(PgProtocol):
self.running = True
return self
def _wait_for_death(self):
"""Wait for pageserver to die. Assumes kill signal is sent."""
pid_path = pathlib.Path(self.env.repo_dir) / "pageserver.pid"
pid = read_pid(pid_path)
retries_left = 20
while check_pid(pid):
time.sleep(0.2)
retries_left -= 1
if retries_left == 0:
raise AssertionError("Pageserver failed to die")
def stop(self, immediate=False) -> 'NeonPageserver':
"""
Stop the page server.
@@ -1506,7 +1330,6 @@ class NeonPageserver(PgProtocol):
"""
if self.running:
self.env.neon_cli.pageserver_stop(immediate)
self._wait_for_death()
self.running = False
return self
@@ -1514,7 +1337,7 @@ class NeonPageserver(PgProtocol):
return self
def __exit__(self, exc_type, exc, tb):
self.stop(immediate=True)
self.stop(True)
def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient:
return NeonPageserverHttpClient(
@@ -1531,7 +1354,6 @@ def append_pageserver_param_overrides(
):
if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None:
remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage)
params_to_update.append(
f'--pageserver-config-override=remote_storage={remote_storage_toml_table}')
@@ -2016,17 +1838,6 @@ def read_pid(path: Path) -> int:
return int(path.read_text())
def check_pid(pid):
"""Check whether pid is running."""
try:
# If sig is 0, then no signal is sent, but error checking is still performed.
os.kill(pid, 0)
except OSError:
return False
else:
return True
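For context on the `_wait_for_death`/`check_pid` helpers in this hunk: `os.kill(pid, 0)` sends no signal at all; it only asks the kernel whether the process still exists (or whether we lack permission to signal it). A small standalone sketch of the same polling idea:

    import os
    import time

    def wait_until_dead(pid: int, attempts: int = 20, delay: float = 0.2) -> bool:
        # probe with signal 0 until the process disappears or we give up
        for _ in range(attempts):
            try:
                os.kill(pid, 0)
            except OSError:
                return True
            time.sleep(delay)
        return False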
@dataclass
class SafekeeperPort:
pg: int
@@ -2049,8 +1860,8 @@ class Safekeeper:
started_at = time.time()
while True:
try:
with self.http_client() as http_cli:
http_cli.check_status()
http_cli = self.http_client()
http_cli.check_status()
except Exception as e:
elapsed = time.time() - started_at
if elapsed > 3:
@@ -2201,9 +2012,9 @@ class Etcd:
return f'http://127.0.0.1:{self.port}'
def check_status(self):
with requests.Session() as s:
s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry
s.get(f"{self.client_url()}/health").raise_for_status()
s = requests.Session()
s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry
s.get(f"{self.client_url()}/health").raise_for_status()
def try_start(self):
if self.handle is not None:
@@ -2498,9 +2309,3 @@ def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient,
time.sleep(1)
raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format(
lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID):
"""Wait for pageserver to catch up the latest flush LSN"""
last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)
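A usage sketch for the `wait_for_last_flush_lsn` helper in this hunk, so its intent is not lost: the `env`, `pg`, `tenant` and `timeline` names are assumed to come from the surrounding fixtures, and the table is a placeholder.

    pg.safe_psql("INSERT INTO t SELECT generate_series(1, 1000)")
    # block until the pageserver has ingested everything flushed so far
    wait_for_last_flush_lsn(env, pg, tenant, timeline)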

View File

@@ -146,7 +146,7 @@ def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, durat
record_thread.join()
def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_event: threading.Event):
def start_pgbench_intensive_initialization(env: PgCompare, scale: int):
with env.record_duration("run_duration"):
# The statement timeout (default: 120s) needs to be increased because the
# initialization step can be slow at a large scale.
@@ -155,11 +155,9 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_even
f'-s{scale}',
'-i',
'-Idtg',
env.pg.connstr(options='-cstatement_timeout=600s')
env.pg.connstr(options='-cstatement_timeout=300s')
])
done_event.set()
@pytest.mark.timeout(1000)
@pytest.mark.parametrize("scale", get_scales_matrix(1000))
@@ -168,17 +166,15 @@ def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int):
with env.pg.connect().cursor() as cur:
cur.execute("CREATE TABLE foo as select generate_series(1,100000)")
workload_done_event = threading.Event()
workload_thread = threading.Thread(target=start_pgbench_intensive_initialization,
args=(env, scale, workload_done_event))
args=(env, scale))
workload_thread.start()
record_thread = threading.Thread(target=record_lsn_write_lag,
args=(env, lambda: not workload_done_event.is_set()))
args=(env, lambda: workload_thread.is_alive()))
record_thread.start()
record_read_latency(env, lambda: not workload_done_event.is_set(), "SELECT count(*) from foo")
record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT count(*) from foo")
workload_thread.join()
record_thread.join()
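The two variants above differ only in how the reader threads learn that the workload has finished: an explicit `threading.Event` set by the workload versus polling `workload_thread.is_alive()`. A self-contained sketch of the Event-based handshake, with sleeps standing in for the real work:

    import threading
    import time

    def workload(done: threading.Event) -> None:
        time.sleep(1.0)  # stand-in for the pgbench initialization step
        done.set()       # tell the readers the workload is finished

    def reader(done: threading.Event) -> None:
        while not done.is_set():
            time.sleep(0.1)  # stand-in for one latency measurement

    done = threading.Event()
    threads = [threading.Thread(target=workload, args=(done,)),
               threading.Thread(target=reader, args=(done,))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()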