mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 11:40:38 +00:00
Compare commits
3 Commits
rfc-pagese
...
ps-no-stat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
688f68ecba | ||
|
|
fb2ffac8b9 | ||
|
|
8173e36a1b |
30
.github/actions/run-python-test-set/action.yml
vendored
30
.github/actions/run-python-test-set/action.yml
vendored
@@ -27,26 +27,6 @@ inputs:
|
||||
description: 'Whether to upload the performance report'
|
||||
required: false
|
||||
default: 'false'
|
||||
run_with_real_s3:
|
||||
description: 'Whether to pass real s3 credentials to the test suite'
|
||||
required: false
|
||||
default: 'false'
|
||||
real_s3_bucket:
|
||||
description: 'Bucket name for real s3 tests'
|
||||
required: false
|
||||
default: ''
|
||||
real_s3_region:
|
||||
description: 'Region name for real s3 tests'
|
||||
required: false
|
||||
default: ''
|
||||
real_s3_access_key_id:
|
||||
description: 'Access key id'
|
||||
required: false
|
||||
default: ''
|
||||
real_s3_secret_access_key:
|
||||
description: 'Secret access key'
|
||||
required: false
|
||||
default: ''
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
@@ -83,8 +63,6 @@ runs:
|
||||
# this variable will be embedded in perf test report
|
||||
# and is needed to distinguish different environments
|
||||
PLATFORM: github-actions-selfhosted
|
||||
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
|
||||
@@ -99,14 +77,6 @@ runs:
|
||||
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
|
||||
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
|
||||
echo "REAL S3 ENABLED"
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }}
|
||||
export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }}
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
|
||||
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
|
||||
mkdir -p "$PERF_REPORT_DIR"
|
||||
|
||||
16
.github/workflows/build_and_test.yml
vendored
16
.github/workflows/build_and_test.yml
vendored
@@ -35,16 +35,6 @@ jobs:
|
||||
GIT_VERSION: ${{ github.sha }}
|
||||
|
||||
steps:
|
||||
- name: Fix git ownerwhip
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
@@ -219,11 +209,7 @@ jobs:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||
test_selection: batch_others
|
||||
run_with_real_s3: true
|
||||
real_s3_bucket: ci-tests-s3
|
||||
real_s3_region: us-west-2
|
||||
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
|
||||
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
@@ -11,15 +11,17 @@ than it was before.
|
||||
|
||||
## Submitting changes
|
||||
|
||||
1. Get at least one +1 on your PR before you push.
|
||||
1. Make a PR for every change.
|
||||
|
||||
Even seemingly trivial patches can break things in surprising ways.
|
||||
Use of common sense is OK. If you're only fixing a typo in a comment,
|
||||
it's probably fine to just push it. But if in doubt, open a PR.
|
||||
|
||||
2. Get at least one +1 on your PR before you push.
|
||||
|
||||
For simple patches, it will only take a minute for someone to review
|
||||
it.
|
||||
|
||||
2. Don't force push small changes after making the PR ready for review.
|
||||
Doing so will force readers to re-read your entire PR, which will delay
|
||||
the review process.
|
||||
|
||||
3. Always keep the CI green.
|
||||
|
||||
Do not push, if the CI failed on your PR. Even if you think it's not
|
||||
|
||||
59
Cargo.lock
generated
59
Cargo.lock
generated
@@ -154,9 +154,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.5.13"
|
||||
version = "0.5.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b9496f0c1d1afb7a2af4338bbe1d969cddfead41d87a9fb3aaa6d0bbc7af648"
|
||||
checksum = "d16705af05732b7d3258ec0f7b73c03a658a28925e050d8852d5b568ee8bcf4e"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core",
|
||||
@@ -317,6 +317,15 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cast"
|
||||
version = "0.2.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a"
|
||||
dependencies = [
|
||||
"rustc_version",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cast"
|
||||
version = "0.3.0"
|
||||
@@ -495,8 +504,8 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"lazy_static",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
"regex",
|
||||
@@ -570,7 +579,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"cast",
|
||||
"cast 0.3.0",
|
||||
"clap 2.34.0",
|
||||
"criterion-plot",
|
||||
"csv",
|
||||
@@ -591,11 +600,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "criterion-plot"
|
||||
version = "0.4.5"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876"
|
||||
checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57"
|
||||
dependencies = [
|
||||
"cast",
|
||||
"cast 0.2.7",
|
||||
"itertools",
|
||||
]
|
||||
|
||||
@@ -671,9 +680,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crypto-common"
|
||||
version = "0.1.6"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
|
||||
checksum = "2ccfd8c0ee4cce11e45b3fd6f9d5e69e0cc62912aa6a0cb1bf4617b0eba5a12f"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
"typenum",
|
||||
@@ -1107,9 +1116,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "gimli"
|
||||
version = "0.26.2"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d"
|
||||
checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4"
|
||||
|
||||
[[package]]
|
||||
name = "git-version"
|
||||
@@ -1175,9 +1184,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
version = "0.12.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
checksum = "607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
@@ -1379,7 +1388,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"hashbrown 0.12.3",
|
||||
"hashbrown 0.12.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1591,8 +1600,8 @@ dependencies = [
|
||||
name = "metrics"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"prometheus",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -1842,9 +1851,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "os_str_bytes"
|
||||
version = "6.2.0"
|
||||
version = "6.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4"
|
||||
checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa"
|
||||
|
||||
[[package]]
|
||||
name = "pageserver"
|
||||
@@ -1870,6 +1879,7 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"metrics",
|
||||
"nix",
|
||||
"once_cell",
|
||||
@@ -2115,9 +2125,9 @@ dependencies = [
|
||||
"crc32c",
|
||||
"env_logger",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"memoffset",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"rand",
|
||||
"regex",
|
||||
@@ -2277,9 +2287,9 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac 0.12.1",
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
"md5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"pin-project-lite",
|
||||
"rand",
|
||||
@@ -2725,9 +2735,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.8"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8"
|
||||
checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
@@ -2753,6 +2763,7 @@ dependencies = [
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
@@ -3606,9 +3617,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.2"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7"
|
||||
checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-normalization"
|
||||
@@ -3669,9 +3680,9 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"lazy_static",
|
||||
"metrics",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
|
||||
@@ -9,7 +9,7 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
toml = "0.5"
|
||||
once_cell = "1.13.0"
|
||||
lazy_static = "1.4"
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
thiserror = "1"
|
||||
|
||||
@@ -30,14 +30,14 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_stdout_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd stout file in directory {}",
|
||||
"Failed to create ectd stout file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let etcd_stderr_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd stderr file in directory {}",
|
||||
"Failed to create ectd stderr file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
@@ -51,11 +51,7 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_SESSION_TOKEN",
|
||||
] {
|
||||
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
/// enough to extract a few settings we need in Zenith, assuming you don't do
|
||||
/// funny stuff like include-directives or funny escaping.
|
||||
use anyhow::{bail, Context, Result};
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
@@ -19,7 +19,9 @@ pub struct PostgresConf {
|
||||
hash: HashMap<String, String>,
|
||||
}
|
||||
|
||||
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
|
||||
lazy_static! {
|
||||
static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
|
||||
}
|
||||
|
||||
impl PostgresConf {
|
||||
pub fn new() -> PostgresConf {
|
||||
@@ -137,10 +139,10 @@ fn escape_str(s: &str) -> String {
|
||||
//
|
||||
// This regex is a bit more conservative than the rules in guc-file.l, so we quote some
|
||||
// strings that PostgreSQL would accept without quoting, but that's OK.
|
||||
|
||||
static UNQUOTED_RE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap());
|
||||
|
||||
lazy_static! {
|
||||
static ref UNQUOTED_RE: Regex =
|
||||
Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
|
||||
}
|
||||
if UNQUOTED_RE.is_match(s) {
|
||||
s.to_string()
|
||||
} else {
|
||||
|
||||
@@ -247,7 +247,7 @@ impl SafekeeperNode {
|
||||
// Shutting down may take a long time,
|
||||
// if safekeeper flushes a lot of data
|
||||
let mut tcp_stopped = false;
|
||||
for i in 0..600 {
|
||||
for _ in 0..100 {
|
||||
if !tcp_stopped {
|
||||
if let Err(err) = TcpStream::connect(&address) {
|
||||
tcp_stopped = true;
|
||||
@@ -272,11 +272,9 @@ impl SafekeeperNode {
|
||||
}
|
||||
}
|
||||
}
|
||||
if i % 10 == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
|
||||
@@ -318,7 +318,7 @@ impl PageServerNode {
|
||||
// Shutting down may take a long time,
|
||||
// if pageserver checkpoints a lot of data
|
||||
let mut tcp_stopped = false;
|
||||
for i in 0..600 {
|
||||
for _ in 0..100 {
|
||||
if !tcp_stopped {
|
||||
if let Err(err) = TcpStream::connect(&address) {
|
||||
tcp_stopped = true;
|
||||
@@ -344,11 +344,9 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
}
|
||||
if i % 10 == 0 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -eux
|
||||
|
||||
pageserver_id_param="${NODE_ID:-10}"
|
||||
|
||||
broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
|
||||
if [ "$broker_endpoints_param" != "absent" ]; then
|
||||
broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
|
||||
@@ -10,12 +8,10 @@ else
|
||||
broker_endpoints_param=''
|
||||
fi
|
||||
|
||||
remote_storage_param="${REMOTE_STORAGE:-}"
|
||||
|
||||
if [ "$1" = 'pageserver' ]; then
|
||||
if [ ! -d "/data/tenants" ]; then
|
||||
echo "Initializing pageserver data directory"
|
||||
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param
|
||||
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
|
||||
fi
|
||||
echo "Staring pageserver at 0.0.0.0:6400"
|
||||
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
|
||||
|
||||
@@ -52,8 +52,10 @@
|
||||
- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
|
||||
- [settings.md](./settings.md)
|
||||
#FIXME: move these under sourcetree.md
|
||||
#- [pageserver/README.md](/pageserver/README.md)
|
||||
#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md)
|
||||
#- [test_runner/README.md](/test_runner/README.md)
|
||||
#- [safekeeper/README.md](/safekeeper/README.md)
|
||||
|
||||
|
||||
# RFCs
|
||||
@@ -79,5 +81,4 @@
|
||||
- [014-storage-lsm](rfcs/014-storage-lsm.md)
|
||||
- [015-storage-messaging](rfcs/015-storage-messaging.md)
|
||||
- [016-connection-routing](rfcs/016-connection-routing.md)
|
||||
- [017-pageserver-op-atomicity](rfcs/017-pageserver-op-atomicity.md)
|
||||
- [cluster-size-limits](rfcs/cluster-size-limits.md)
|
||||
|
||||
@@ -75,7 +75,7 @@ layer's Segment and range of LSNs.
|
||||
There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
layers are used to ingest incoming WAL, and provide fast access
|
||||
to the recent page versions. On-disk layers are stored as files on disk, and
|
||||
are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more.
|
||||
are immutable. See pageserver/src/layered_repository/README.md for more.
|
||||
|
||||
### Layer file (on-disk layer)
|
||||
|
||||
@@ -111,7 +111,7 @@ PostgreSQL LSNs and functions to monitor them:
|
||||
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
|
||||
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
|
||||
|
||||
Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information.
|
||||
Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
|
||||
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
|
||||
* `RestartLSN`: position in WAL confirmed by all safekeepers.
|
||||
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
|
||||
|
||||
@@ -68,6 +68,8 @@ There are the following implementations present:
|
||||
* local filesystem — to use in tests mainly
|
||||
* AWS S3 - to use in production
|
||||
|
||||
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md).
|
||||
|
||||
The backup service is disabled by default and can be enabled to interact with a single remote storage.
|
||||
|
||||
CLI examples:
|
||||
@@ -116,7 +118,7 @@ implemented by the LayeredRepository object in
|
||||
`layered_repository.rs`. There is only that one implementation of the
|
||||
Repository trait, but it's still a useful abstraction that keeps the
|
||||
interface for the low-level storage functionality clean. The layered
|
||||
storage format is described in [pageserver-storage.md](./pageserver-storage.md).
|
||||
storage format is described in layered_repository/README.md.
|
||||
|
||||
Each repository consists of multiple Timelines. Timeline is a
|
||||
workhorse that accepts page changes from the WAL, and serves
|
||||
|
||||
@@ -1,153 +0,0 @@
|
||||
# Durability and atomicity of tenant/timeline operations
|
||||
|
||||
The pageserver has 8 tenant/timeline operations, listed below. In
|
||||
addition to that, data can be appended to a timeline by WAL receiver,
|
||||
pages can be requested by the compute node, and tenant/timeline status
|
||||
can be queries through the mgmt API. But these are the operations that
|
||||
modify state in pageserver or in S3, and need to worry about crash
|
||||
safety.
|
||||
|
||||
To make these operations atomic and recoverable, let's introduce a new
|
||||
"tenant index file", called `tenant.json`. For each tenant, there is
|
||||
one tenant index file, and it contains a list of all timelines for
|
||||
that tenant:
|
||||
|
||||
{
|
||||
tenant_id: a93a94724945e95e1a0c448004ece2ec
|
||||
|
||||
timelines: [
|
||||
{ timeline_id: "9979cd302340a058606473912651f27f",
|
||||
ancestor_id: ""
|
||||
ancestor_lsn: "0/0"
|
||||
},
|
||||
{ timeline_id: "f0a6f3372d273dd9ca3480d19e6b565c",
|
||||
ancestor_id: "9979cd302340a058606473912651f27f"
|
||||
ancestor_lsn: "1/1698C48"
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
The file only contains the immutable metadata of each timeline, like
|
||||
the point it was branched from. The changing parts, like
|
||||
disk_consistent_lsn, are still stored in the per-timeline metadata
|
||||
file.
|
||||
|
||||
This file allows us to resolve some ambiguous situations, like
|
||||
remembering that a tenant exists when it doesn't have any timelines.
|
||||
It also allows us to quickly fetch the list of all timelines of a
|
||||
tenant, without having to perform S3 LIST operations.
|
||||
|
||||
Below is a brief description of all the pageserver tenant/timeline
|
||||
operations, and how the steps of creating/deleting local files or
|
||||
directories and uploads to S3 are performed. The steps are listed in
|
||||
such an order that each operation can be sanely recovered or aborted,
|
||||
if the pageserver crashes while the operation is being perfromed.
|
||||
|
||||
## Create tenant
|
||||
|
||||
Create an empty tenant. It doesn't have any timelines initially.
|
||||
|
||||
1. Create local tenant-directory with .temp extension
|
||||
2. Create tenant.json file in the directory, with a special flag
|
||||
indicating that the tenant-creation is in progress
|
||||
3. Rename the local tenant directory in place
|
||||
4. Upload the tenant.json file to S3, without the flag
|
||||
5. Update the local file, removing the flag
|
||||
|
||||
At pageserver startup, if we see a tenant.json file with the special
|
||||
flag, check if the tenant exists in S3. If not, remove the local directory.
|
||||
Otherwise remove the flag from local file.
|
||||
|
||||
## Create timeline
|
||||
|
||||
Create a timeline for a tenant, as result of running initdb.
|
||||
|
||||
1. create timeline directory locally, with .temp extension
|
||||
2. run initdb, creating the initial set of layers
|
||||
3. upload all layer files to S3
|
||||
4. upload metadata file to S3
|
||||
5. update tenant.json file in S3
|
||||
6. Rename local directory in place
|
||||
|
||||
If we crash before step 5, S3 may have a timeline metadata file and some
|
||||
layer files, without corresponding entry in tenant.json file. That's OK.
|
||||
Whenever we see that, we can delete the leftover timeline files.
|
||||
|
||||
If we want to make that less scary, we could update a tenant.json file in S3
|
||||
twice. First, add the new timeline ID to the file with a flag indicating
|
||||
that it's being created. Do that before uploading anything else to S3. And
|
||||
then in step 5, update tenant.json to indicate that the creation is complete.
|
||||
|
||||
## Branch timeline
|
||||
|
||||
Create a new timeline with an existing timeline as parent
|
||||
|
||||
1. create timeline directory locally, with .temp extension
|
||||
2. create metadata file in the local directory
|
||||
3. upload metadata file to S3
|
||||
4. update tenant.json file in S3
|
||||
5. Rename local directory in place
|
||||
|
||||
Like with Create timeline, if we crash between steps 3 and 4, we will
|
||||
leave behind a timeline metadata file with no corresponding entry in
|
||||
tenant.json. That's harmless.
|
||||
|
||||
## Delete timeline
|
||||
|
||||
1. rename local timeline directory to have .temp extension
|
||||
2. Update tenant.json file in S3
|
||||
3. delete index file from S3
|
||||
4. delete layer files from S3
|
||||
5. delete local directory
|
||||
|
||||
Like with creation, if this is interrupted, we will leave behind
|
||||
timeline files in S3 with no corresponding entry in tenant.json. If we
|
||||
want to make that less scary, we can update tenant.json in step 2 with
|
||||
a tombstone flag for the timeline we're removing, instead of removing
|
||||
the entry for it outright.
|
||||
|
||||
## Delete tenant
|
||||
|
||||
1. rename local tenant directory to have .temp extension
|
||||
2. delete tenant.json file in S3
|
||||
3. delete all timeline index files from S3
|
||||
4. delete all layer files from S3
|
||||
5. delete local directory
|
||||
|
||||
Like with timeline creation, this can leave behind files with no corresponding
|
||||
tenant.json file. We can make it less scary by adding tombstones.
|
||||
|
||||
## Attach tenant
|
||||
|
||||
1. create local tenant directory with .temp extension
|
||||
2. Download tenant.json file
|
||||
3. download index files for every timeline
|
||||
4. download all layer files (in the future, skip this and download them on demand)
|
||||
5. rename local tenant directory in place
|
||||
|
||||
## Detach tenant
|
||||
|
||||
1. rename local tenant directory to have .temp extension
|
||||
2. delete local directory
|
||||
|
||||
|
||||
## Load tenant
|
||||
|
||||
This happens automatically at pageserver startup, for every tenant that is found
|
||||
in the tenants-directory. I.e. for every tenant that was attached to the pageserver
|
||||
before the crash or shutdown.
|
||||
|
||||
1. download tenant.json file
|
||||
2. for every timeline that's in remote tenant.json:
|
||||
1. download remote index file
|
||||
2. download all layer files that are missing locally (skip in future, and download on-demand)
|
||||
3. schedule upload of all files present locally, but missing remotely
|
||||
4. schedule index file upload
|
||||
3. delete all locally present timeline directories that's not in tenant.json
|
||||
|
||||
|
||||
On startup, delete everything with the .temp extension
|
||||
|
||||
|
||||
- we could skip some of the downloads if we stored the S3 etag of the object in the local file,
|
||||
and compared that
|
||||
@@ -28,7 +28,7 @@ The pageserver has a few different duties:
|
||||
- Receive WAL from the WAL service and decode it.
|
||||
- Replay WAL that's applicable to the chunks that the Page Server maintains
|
||||
|
||||
For more detailed info, see [pageserver-services.md](./pageserver-services.md)
|
||||
For more detailed info, see [/pageserver/README](/pageserver/README.md)
|
||||
|
||||
`/proxy`:
|
||||
|
||||
@@ -57,7 +57,7 @@ PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
It acts as a holding area and redistribution center for recently generated WAL.
|
||||
|
||||
For more detailed info, see [walservice.md](./walservice.md)
|
||||
For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
|
||||
|
||||
`/workspace_hack`:
|
||||
The workspace_hack crate exists only to pin down some dependencies.
|
||||
|
||||
@@ -75,8 +75,8 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only
|
||||
one primary node can be actively streaming WAL to the quorum of
|
||||
safekeepers.
|
||||
|
||||
See [this section](safekeeper-protocol.md) for a more detailed description of
|
||||
the consensus protocol. spec/ contains TLA+ specification of it.
|
||||
See README_PROTO.md for a more detailed description of the consensus
|
||||
protocol. spec/ contains TLA+ specification of it.
|
||||
|
||||
# Q&A
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
once_cell = "1.13.0"
|
||||
once_cell = "1.8.0"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -6,5 +6,5 @@ edition = "2021"
|
||||
[dependencies]
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
libc = "0.2"
|
||||
once_cell = "1.13.0"
|
||||
lazy_static = "1.4"
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! make sure that we use the same dep version everywhere.
|
||||
//! Otherwise, we might not see all metrics registered via
|
||||
//! a default registry.
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec};
|
||||
pub use prometheus::opts;
|
||||
pub use prometheus::register;
|
||||
@@ -41,22 +41,19 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
|
||||
prometheus::gather()
|
||||
}
|
||||
|
||||
static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
lazy_static! {
|
||||
static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
|
||||
"libmetrics_disk_io_bytes_total",
|
||||
"Bytes written and read from disk, grouped by the operation (read|write)",
|
||||
&["io_operation"]
|
||||
)
|
||||
.expect("Failed to register disk i/o bytes int gauge vec")
|
||||
});
|
||||
|
||||
static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
.expect("Failed to register disk i/o bytes int gauge vec");
|
||||
static ref MAXRSS_KB: IntGauge = register_int_gauge!(
|
||||
"libmetrics_maxrss_kb",
|
||||
"Memory usage (Maximum Resident Set Size)"
|
||||
)
|
||||
.expect("Failed to register maxrss_kb int gauge")
|
||||
});
|
||||
.expect("Failed to register maxrss_kb int gauge");
|
||||
}
|
||||
|
||||
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
|
||||
@@ -10,13 +10,13 @@ use std::io::{Read, Result, Write};
|
||||
/// # use std::io::{Result, Read};
|
||||
/// # use metrics::{register_int_counter, IntCounter};
|
||||
/// # use metrics::CountedReader;
|
||||
/// # use once_cell::sync::Lazy;
|
||||
/// #
|
||||
/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
|
||||
/// # lazy_static::lazy_static! {
|
||||
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
|
||||
/// # "int_counter",
|
||||
/// # "let's count something!"
|
||||
/// # ).unwrap()
|
||||
/// # });
|
||||
/// # ).unwrap();
|
||||
/// # }
|
||||
/// #
|
||||
/// fn do_some_reads(stream: impl Read, count: usize) -> Result<Vec<u8>> {
|
||||
/// let mut reader = CountedReader::new(stream, |cnt| {
|
||||
@@ -85,13 +85,13 @@ impl<T: Read> Read for CountedReader<'_, T> {
|
||||
/// # use std::io::{Result, Write};
|
||||
/// # use metrics::{register_int_counter, IntCounter};
|
||||
/// # use metrics::CountedWriter;
|
||||
/// # use once_cell::sync::Lazy;
|
||||
/// #
|
||||
/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
|
||||
/// # lazy_static::lazy_static! {
|
||||
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
|
||||
/// # "int_counter",
|
||||
/// # "let's count something!"
|
||||
/// # ).unwrap()
|
||||
/// # });
|
||||
/// # ).unwrap();
|
||||
/// # }
|
||||
/// #
|
||||
/// fn do_some_writes(stream: impl Write, payload: &[u8]) -> Result<()> {
|
||||
/// let mut writer = CountedWriter::new(stream, |cnt| {
|
||||
|
||||
@@ -12,7 +12,7 @@ byteorder = "1.4.3"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
once_cell = "1.13.0"
|
||||
lazy_static = "1.4"
|
||||
log = "0.4.14"
|
||||
memoffset = "0.6.2"
|
||||
thiserror = "1.0"
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! Common utilities for dealing with PostgreSQL relation files.
|
||||
//!
|
||||
use crate::pg_constants;
|
||||
use once_cell::sync::OnceCell;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
|
||||
@@ -54,14 +54,11 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
||||
///
|
||||
pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
static RELFILE_RE: OnceCell<Regex> = OnceCell::new();
|
||||
RELFILE_RE.get_or_init(|| {
|
||||
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap()
|
||||
});
|
||||
|
||||
lazy_static! {
|
||||
static ref RELFILE_RE: Regex =
|
||||
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
}
|
||||
let caps = RELFILE_RE
|
||||
.get()
|
||||
.unwrap()
|
||||
.captures(fname)
|
||||
.ok_or(FilePathError::InvalidFileName)?;
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::XLogRecord;
|
||||
use crate::XLOG_PAGE_MAGIC;
|
||||
|
||||
use crate::pg_constants::WAL_SEGMENT_SIZE;
|
||||
use anyhow::{anyhow, bail, ensure};
|
||||
use anyhow::{bail, ensure};
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Buf, Bytes};
|
||||
@@ -159,7 +159,7 @@ fn find_end_of_wal_segment(
|
||||
let mut buf = [0u8; XLOG_BLCKSZ];
|
||||
let file_name = XLogFileName(tli, segno, wal_seg_size);
|
||||
let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
|
||||
let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?;
|
||||
let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
|
||||
file.seek(SeekFrom::Start(offs as u64))?;
|
||||
// xl_crc is the last field in XLogRecord, will not be read into rec_hdr
|
||||
const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
|
||||
@@ -396,13 +396,10 @@ pub fn find_end_of_wal(
|
||||
let mut high_tli: TimeLineID = 0;
|
||||
let mut high_ispartial = false;
|
||||
|
||||
for entry in fs::read_dir(data_dir)?.flatten() {
|
||||
for entry in fs::read_dir(data_dir).unwrap().flatten() {
|
||||
let ispartial: bool;
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name
|
||||
.to_str()
|
||||
.ok_or_else(|| anyhow!("Invalid file name"))?;
|
||||
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
/*
|
||||
* Check if the filename looks like an xlog file, or a .partial file.
|
||||
*/
|
||||
@@ -414,7 +411,7 @@ pub fn find_end_of_wal(
|
||||
continue;
|
||||
}
|
||||
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
|
||||
if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 {
|
||||
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
|
||||
continue;
|
||||
}
|
||||
if segno > high_segno
|
||||
|
||||
@@ -10,7 +10,7 @@ anyhow = "1.0"
|
||||
clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
log = "0.4"
|
||||
once_cell = "1.13.0"
|
||||
once_cell = "1.8.0"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres_ffi = { path = "../" }
|
||||
tempfile = "3.2"
|
||||
|
||||
@@ -7,7 +7,7 @@ edition = "2021"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-trait = "0.1"
|
||||
metrics = { version = "0.1", path = "../metrics" }
|
||||
once_cell = "1.13.0"
|
||||
once_cell = "1.8.0"
|
||||
rusoto_core = "0.48"
|
||||
rusoto_s3 = "0.48"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
|
||||
@@ -66,9 +66,6 @@ pub trait RemoteStorage: Send + Sync {
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
|
||||
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
|
||||
/// so this method doesnt need to.
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
|
||||
@@ -116,7 +116,7 @@ impl RemoteStorage for LocalFs {
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix),
|
||||
Some(prefix) => Cow::Owned(self.storage_root.join(prefix)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
get_all_files(path.as_ref(), false).await
|
||||
|
||||
@@ -171,25 +171,17 @@ impl S3Bucket {
|
||||
|
||||
let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
|
||||
let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
|
||||
// session token is used when authorizing through sso
|
||||
// which is typically the case when testing locally on developer machine
|
||||
let session_token = std::env::var("AWS_SESSION_TOKEN").ok();
|
||||
|
||||
let client = if access_key_id.is_none() && secret_access_key.is_none() {
|
||||
debug!("Using IAM-based AWS access");
|
||||
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
|
||||
} else {
|
||||
debug!(
|
||||
"Using credentials-based AWS access. Session token is set: {}",
|
||||
session_token.is_some()
|
||||
);
|
||||
debug!("Using credentials-based AWS access");
|
||||
S3Client::new_with(
|
||||
request_dispatcher,
|
||||
StaticProvider::new(
|
||||
StaticProvider::new_minimal(
|
||||
access_key_id.unwrap_or_default(),
|
||||
secret_access_key.unwrap_or_default(),
|
||||
session_token,
|
||||
None,
|
||||
),
|
||||
region,
|
||||
)
|
||||
@@ -312,24 +304,32 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
/// Note: it wont include empty "directories"
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| p.0)
|
||||
.or_else(|| self.prefix_in_bucket.clone())
|
||||
.map(|mut p| {
|
||||
let list_prefix = match prefix {
|
||||
Some(prefix) => {
|
||||
let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
// if there is no trailing / in default prefix and
|
||||
// supplied prefix does not start with "/" insert it
|
||||
if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR)
|
||||
|| prefix.0.starts_with(S3_PREFIX_SEPARATOR))
|
||||
{
|
||||
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
|
||||
}
|
||||
|
||||
prefix_in_bucket.push_str(&prefix.0);
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if !p.ends_with(S3_PREFIX_SEPARATOR) {
|
||||
p.push(S3_PREFIX_SEPARATOR);
|
||||
if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) {
|
||||
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
Some(prefix_in_bucket)
|
||||
}
|
||||
None => self.prefix_in_bucket.clone(),
|
||||
};
|
||||
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ anyhow = "1.0"
|
||||
bincode = "1.3"
|
||||
bytes = "1.0.1"
|
||||
hyper = { version = "0.14.7", features = ["full"] }
|
||||
lazy_static = "1.4.0"
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
@@ -27,8 +28,6 @@ rustls = "0.20.2"
|
||||
rustls-split = "0.3.0"
|
||||
git-version = "0.3.5"
|
||||
serde_with = "1.12.0"
|
||||
once_cell = "1.13.0"
|
||||
|
||||
|
||||
metrics = { path = "../metrics" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -4,8 +4,8 @@ use crate::zid::ZTenantId;
|
||||
use anyhow::anyhow;
|
||||
use hyper::header::AUTHORIZATION;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||
use lazy_static::lazy_static;
|
||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::RequestInfo;
|
||||
use routerify::{Middleware, Router, RouterBuilder, RouterService};
|
||||
@@ -16,13 +16,13 @@ use std::net::TcpListener;
|
||||
|
||||
use super::error::ApiError;
|
||||
|
||||
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
lazy_static! {
|
||||
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
|
||||
"libmetrics_metric_handler_requests_total",
|
||||
"Number of metric requests made"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
|
||||
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::{
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
|
||||
|
||||
@@ -19,15 +19,16 @@ fn make_tcp_pair() -> (TcpStream, TcpStream) {
|
||||
(server_stream, client_stream)
|
||||
}
|
||||
|
||||
static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("key.pem"));
|
||||
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
|
||||
});
|
||||
|
||||
static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
|
||||
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
|
||||
});
|
||||
lazy_static! {
|
||||
static ref KEY: rustls::PrivateKey = {
|
||||
let mut cursor = Cursor::new(include_bytes!("key.pem"));
|
||||
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
|
||||
};
|
||||
static ref CERT: rustls::Certificate = {
|
||||
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
|
||||
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
|
||||
};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ssl() {
|
||||
|
||||
@@ -884,7 +884,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", start_match)) => {
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
eprintln!("pageserver start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@@ -906,19 +906,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
eprintln!("pageserver start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
|
||||
Ok(_) => println!("Page server is up and running"),
|
||||
Err(err) => {
|
||||
eprintln!("Page server is not available: {}", err);
|
||||
exit(1);
|
||||
}
|
||||
},
|
||||
|
||||
Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
|
||||
None => bail!("no pageserver subcommand provided"),
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ futures = "0.3.13"
|
||||
hex = "0.4.3"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
lazy_static = "1.4.0"
|
||||
clap = "3.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
@@ -47,7 +48,7 @@ tracing = "0.1.27"
|
||||
signal-hook = "0.3.10"
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
once_cell = "1.13.0"
|
||||
once_cell = "1.8.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
git-version = "0.3.5"
|
||||
|
||||
@@ -37,7 +37,7 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
|
||||
|
||||
// TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
|
||||
// Then fishing out pg_control would be unnecessary
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
let mut modification = tline.begin_modification();
|
||||
modification.init_empty()?;
|
||||
|
||||
// Import all but pg_wal
|
||||
@@ -56,12 +56,12 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
|
||||
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
|
||||
pg_control = Some(control_file);
|
||||
}
|
||||
modification.flush()?;
|
||||
modification.flush(lsn)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We're done importing all the data files.
|
||||
modification.commit()?;
|
||||
modification.commit(lsn)?;
|
||||
|
||||
// We expect the Postgres server to be shut down cleanly.
|
||||
let pg_control = pg_control.context("pg_control file not found")?;
|
||||
@@ -267,7 +267,7 @@ fn import_wal<T: DatadirTimeline>(
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
let mut modification = tline.begin_modification(endpoint);
|
||||
let mut modification = tline.begin_modification();
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
@@ -301,7 +301,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
base_lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
info!("importing base at {}", base_lsn);
|
||||
let mut modification = tline.begin_modification(base_lsn);
|
||||
let mut modification = tline.begin_modification();
|
||||
modification.init_empty()?;
|
||||
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
@@ -319,7 +319,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
// We found the pg_control file.
|
||||
pg_control = Some(res);
|
||||
}
|
||||
modification.flush()?;
|
||||
modification.flush(base_lsn)?;
|
||||
}
|
||||
tar::EntryType::Directory => {
|
||||
debug!("directory {:?}", file_path);
|
||||
@@ -333,7 +333,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
// sanity check: ensure that pg_control is loaded
|
||||
let _pg_control = pg_control.context("pg_control file not found")?;
|
||||
|
||||
modification.commit()?;
|
||||
modification.commit(base_lsn)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -385,7 +385,7 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
|
||||
waldecoder.feed_bytes(&bytes[offset..]);
|
||||
|
||||
let mut modification = tline.begin_modification(end_lsn);
|
||||
let mut modification = tline.begin_modification();
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! get/put call, walking back the timeline branching history as needed.
|
||||
//!
|
||||
//! The files are stored in the .neon/tenants/<tenantid>/timelines/<timelineid>
|
||||
//! directory. See docs/pageserver-storage.md for how the files are managed.
|
||||
//! directory. See layered_repository/README for how the files are managed.
|
||||
//! In addition to the layer files, there is a metadata file in the same
|
||||
//! directory that contains information about the timeline, in particular its
|
||||
//! parent timeline, and the last LSN that has been written to disk.
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::{ReadBufResult, PAGE_SZ};
|
||||
use bytes::Bytes;
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
@@ -117,7 +117,9 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
static NEXT_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
|
||||
lazy_static! {
|
||||
static ref NEXT_ID: AtomicU64 = AtomicU64::new(1);
|
||||
}
|
||||
|
||||
/// An adapter for reading a (virtual) file using the page cache.
|
||||
///
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::page_cache;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{ReadBufResult, WriteBufResult};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::OpenOptions;
|
||||
@@ -21,15 +21,15 @@ use utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
///
|
||||
/// This is the global cache of file descriptors (File objects).
|
||||
///
|
||||
static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
|
||||
RwLock::new(EphemeralFiles {
|
||||
lazy_static! {
|
||||
///
|
||||
/// This is the global cache of file descriptors (File objects).
|
||||
///
|
||||
static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
|
||||
next_file_id: 1,
|
||||
files: HashMap::new(),
|
||||
})
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
pub struct EphemeralFiles {
|
||||
next_file_id: u64,
|
||||
|
||||
@@ -15,18 +15,19 @@ use crate::layered_repository::storage_layer::Layer;
|
||||
use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
|
||||
use crate::repository::Key;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use metrics::{register_int_gauge, IntGauge};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
lazy_static! {
|
||||
static ref NUM_ONDISK_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
|
||||
@@ -4,11 +4,11 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use tracing::*;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{hash_map::Entry, HashMap, HashSet};
|
||||
use std::collections::HashSet;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
@@ -38,9 +38,7 @@ use crate::layered_repository::{
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::pgdatadir_mapping::BlockNumber;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::reltag::RelTag;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::DatadirTimeline;
|
||||
|
||||
@@ -60,102 +58,76 @@ use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{page_cache, storage_sync};
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) that capture the majority of
|
||||
/// latencies in the microsecond range but also extend far enough up to distinguish
|
||||
/// "bad" from "really bad".
|
||||
fn get_buckets_for_critical_operations() -> Vec<f64> {
|
||||
let buckets_per_digit = 5;
|
||||
let min_exponent = -6;
|
||||
let max_exponent = 2;
|
||||
|
||||
let mut buckets = vec![];
|
||||
// Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
|
||||
// because it's more numerically stable and doesn't result in numbers like 9.999999
|
||||
for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
|
||||
buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
|
||||
}
|
||||
buckets
|
||||
// Metrics collected on operations on the storage repository.
|
||||
lazy_static! {
|
||||
pub static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds",
|
||||
"Time spent on storage operations",
|
||||
&["operation", "tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
pub static STORAGE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds",
|
||||
"Time spent on storage operations",
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
get_buckets_for_critical_operations(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
lazy_static! {
|
||||
static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value",
|
||||
&["tenant_id", "timeline_id"],
|
||||
get_buckets_for_critical_operations(),
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
lazy_static! {
|
||||
static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"Number of cache hits from materialized page cache",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
.expect("failed to define a metric");
|
||||
static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"],
|
||||
get_buckets_for_critical_operations(),
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
lazy_static! {
|
||||
static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_last_record_lsn",
|
||||
"Last record LSN grouped by timeline",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
// Metrics for determining timeline's physical size.
|
||||
// A layered timeline's physical is defined as the total size of
|
||||
// (delta/image) layer files on disk.
|
||||
static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
lazy_static! {
|
||||
static ref CURRENT_PHYSICAL_SIZE: UIntGaugeVec = register_uint_gauge_vec!(
|
||||
"pageserver_current_physical_size",
|
||||
"Current physical size grouped by timeline",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
|
||||
// or in testing they estimate how much we would upload if we did.
|
||||
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
lazy_static! {
|
||||
static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!(
|
||||
"pageserver_created_persistent_files_total",
|
||||
"Number of files created that are meant to be uploaded to cloud storage",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
.expect("failed to define a metric");
|
||||
static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!(
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"Total bytes written that are meant to be uploaded to cloud storage",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum LayeredTimelineEntry {
|
||||
@@ -323,9 +295,6 @@ pub struct LayeredTimeline {
|
||||
/// or None if WAL receiver has not received anything for this timeline
|
||||
/// yet.
|
||||
pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
|
||||
|
||||
/// Relation size cache
|
||||
rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -337,42 +306,7 @@ pub struct WalReceiverInfo {
|
||||
/// Inherit all the functions from DatadirTimeline, to provide the
|
||||
/// functionality to store PostgreSQL relations, SLRUs, etc. in a
|
||||
/// LayeredTimeline.
|
||||
impl DatadirTimeline for LayeredTimeline {
|
||||
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
|
||||
let rel_size_cache = self.rel_size_cache.read().unwrap();
|
||||
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
|
||||
if lsn >= *cached_lsn {
|
||||
return Some(*nblocks);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
match rel_size_cache.entry(tag) {
|
||||
Entry::Occupied(mut entry) => {
|
||||
let cached_lsn = entry.get_mut();
|
||||
if lsn >= cached_lsn.0 {
|
||||
*cached_lsn = (lsn, nblocks);
|
||||
}
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert((lsn, nblocks));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.insert(tag, (lsn, nblocks));
|
||||
}
|
||||
|
||||
fn remove_cached_rel_size(&self, tag: &RelTag) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.remove(tag);
|
||||
}
|
||||
}
|
||||
impl DatadirTimeline for LayeredTimeline {}
|
||||
|
||||
///
|
||||
/// Information about how much history needs to be retained, needed by
|
||||
@@ -443,6 +377,8 @@ impl Timeline for LayeredTimeline {
|
||||
|
||||
/// Look up the value with the given a key
|
||||
fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes> {
|
||||
debug_assert!(lsn <= self.get_last_record_lsn());
|
||||
|
||||
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
|
||||
// The cached image can be returned directly if there is no WAL between the cached image
|
||||
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
|
||||
@@ -682,7 +618,6 @@ impl LayeredTimeline {
|
||||
repartition_threshold: 0,
|
||||
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_cache: RwLock::new(HashMap::new()),
|
||||
};
|
||||
result.repartition_threshold = result.get_checkpoint_distance() / 10;
|
||||
result
|
||||
|
||||
@@ -22,7 +22,7 @@ pub mod walreceiver;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use tracing::info;
|
||||
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
@@ -42,14 +42,14 @@ pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
lazy_static! {
|
||||
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_live_connections",
|
||||
"Number of live network connections",
|
||||
&["pageserver_connection_kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
pub const LOG_FILE_NAME: &str = "pageserver.log";
|
||||
|
||||
|
||||
@@ -55,6 +55,7 @@ use utils::{
|
||||
use crate::layered_repository::writeback_ephemeral_file;
|
||||
use crate::repository::Key;
|
||||
|
||||
// TODO move ownership into a new PageserverState struct
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 50;
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use std::io::{self, Read};
|
||||
use std::net::TcpListener;
|
||||
@@ -434,15 +434,15 @@ const TIME_BUCKETS: &[f64] = &[
|
||||
0.1, // 1/10 s
|
||||
];
|
||||
|
||||
static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
lazy_static! {
|
||||
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"Time spent on smgr query handling",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
TIME_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
|
||||
|
||||
@@ -56,16 +56,13 @@ pub trait DatadirTimeline: Timeline {
|
||||
/// This provides a transaction-like interface to perform a bunch
|
||||
/// of modifications atomically.
|
||||
///
|
||||
/// To ingest a WAL record, call begin_modification(lsn) to get a
|
||||
/// To ingest a WAL record, call begin_modification() to get a
|
||||
/// DatadirModification object. Use the functions in the object to
|
||||
/// modify the repository state, updating all the pages and metadata
|
||||
/// that the WAL record affects. When you're done, call commit() to
|
||||
/// commit the changes.
|
||||
/// that the WAL record affects. When you're done, call commit(lsn) to
|
||||
/// commit the changes. All the changes will be stamped with the specified LSN.
|
||||
///
|
||||
/// Lsn stored in modification is advanced by `ingest_record` and
|
||||
/// is used by `commit()` to update `last_record_lsn`.
|
||||
///
|
||||
/// Calling commit() will flush all the changes and reset the state,
|
||||
/// Calling commit(lsn) will flush all the changes and reset the state,
|
||||
/// so the `DatadirModification` struct can be reused to perform the next modification.
|
||||
///
|
||||
/// Note that any pending modifications you make through the
|
||||
@@ -73,7 +70,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
/// functions of the timeline until you finish! And if you update the
|
||||
/// same page twice, the last update wins.
|
||||
///
|
||||
fn begin_modification(&self, lsn: Lsn) -> DatadirModification<Self>
|
||||
fn begin_modification(&self) -> DatadirModification<Self>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
@@ -82,7 +79,6 @@ pub trait DatadirTimeline: Timeline {
|
||||
pending_updates: HashMap::new(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,10 +120,6 @@ pub trait DatadirTimeline: Timeline {
|
||||
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
|
||||
ensure!(tag.relnode != 0, "invalid relnode");
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return Ok(nblocks);
|
||||
}
|
||||
|
||||
if (tag.forknum == pg_constants::FSM_FORKNUM
|
||||
|| tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM)
|
||||
&& !self.get_rel_exists(tag, lsn)?
|
||||
@@ -141,21 +133,13 @@ pub trait DatadirTimeline: Timeline {
|
||||
|
||||
let key = rel_size_to_key(tag);
|
||||
let mut buf = self.get(key, lsn)?;
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
// Update relation size cache
|
||||
self.update_cached_rel_size(tag, lsn, nblocks);
|
||||
Ok(nblocks)
|
||||
Ok(buf.get_u32_le())
|
||||
}
|
||||
|
||||
/// Does relation exist?
|
||||
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
|
||||
ensure!(tag.relnode != 0, "invalid relnode");
|
||||
|
||||
// first try to lookup relation in cache
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return Ok(true);
|
||||
}
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = self.get(key, lsn)?;
|
||||
@@ -461,18 +445,6 @@ pub trait DatadirTimeline: Timeline {
|
||||
|
||||
Ok(result.to_keyspace())
|
||||
}
|
||||
|
||||
/// Get cached size of relation if it not updated after specified LSN
|
||||
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber>;
|
||||
|
||||
/// Update cached relation size if there is no more recent update
|
||||
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
|
||||
|
||||
/// Store cached relation size
|
||||
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
|
||||
|
||||
/// Remove cached relation size
|
||||
fn remove_cached_rel_size(&self, tag: &RelTag);
|
||||
}
|
||||
|
||||
/// DatadirModification represents an operation to ingest an atomic set of
|
||||
@@ -485,9 +457,6 @@ pub struct DatadirModification<'a, T: DatadirTimeline> {
|
||||
/// in the state in 'tline' yet.
|
||||
pub tline: &'a T,
|
||||
|
||||
/// Lsn assigned by begin_modification
|
||||
pub lsn: Lsn,
|
||||
|
||||
// The modifications are not applied directly to the underlying key-value store.
|
||||
// The put-functions add the modifications here, and they are flushed to the
|
||||
// underlying key-value store by the 'finish' function.
|
||||
@@ -697,11 +666,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
|
||||
self.pending_nblocks += nblocks as isize;
|
||||
|
||||
// Update relation size cache
|
||||
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
|
||||
|
||||
// Even if nblocks > 0, we don't insert any actual blocks here. That's up to the
|
||||
// caller.
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -717,9 +684,6 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
let buf = nblocks.to_le_bytes();
|
||||
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
|
||||
|
||||
// Update relation size cache
|
||||
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
|
||||
|
||||
// Update logical database size.
|
||||
self.pending_nblocks -= old_size as isize - nblocks as isize;
|
||||
Ok(())
|
||||
@@ -739,9 +703,6 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
let buf = nblocks.to_le_bytes();
|
||||
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
|
||||
|
||||
// Update relation size cache
|
||||
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
|
||||
|
||||
self.pending_nblocks += nblocks as isize - old_size as isize;
|
||||
}
|
||||
Ok(())
|
||||
@@ -767,9 +728,6 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
let old_size = self.get(size_key)?.get_u32_le();
|
||||
self.pending_nblocks -= old_size as isize;
|
||||
|
||||
// Remove enty from relation size cache
|
||||
self.tline.remove_cached_rel_size(&rel);
|
||||
|
||||
// Delete size entry, as well as all blocks
|
||||
self.delete(rel_key_range(rel));
|
||||
|
||||
@@ -884,7 +842,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||
/// for bulk import, where you are just loading data pages and won't try to
|
||||
/// modify the same pages twice.
|
||||
pub fn flush(&mut self) -> Result<()> {
|
||||
pub fn flush(&mut self, lsn: Lsn) -> Result<()> {
|
||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||
// to scan through the pending_updates list.
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
@@ -898,7 +856,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
let mut result: Result<()> = Ok(());
|
||||
self.pending_updates.retain(|&key, value| {
|
||||
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
|
||||
result = writer.put(key, self.lsn, value);
|
||||
result = writer.put(key, lsn, value);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
@@ -919,9 +877,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
/// underlying timeline.
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub fn commit(&mut self) -> Result<()> {
|
||||
pub fn commit(&mut self, lsn: Lsn) -> Result<()> {
|
||||
let writer = self.tline.writer();
|
||||
let lsn = self.lsn;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
@@ -1366,9 +1324,9 @@ pub fn create_test_timeline<R: Repository>(
|
||||
timeline_id: utils::zid::ZTimelineId,
|
||||
) -> Result<std::sync::Arc<R::Timeline>> {
|
||||
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
|
||||
let mut m = tline.begin_modification(Lsn(8));
|
||||
let mut m = tline.begin_modification();
|
||||
m.init_empty()?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(8))?;
|
||||
Ok(tline)
|
||||
}
|
||||
|
||||
|
||||
@@ -408,7 +408,7 @@ pub trait TimelineWriter<'a> {
|
||||
#[cfg(test)]
|
||||
pub mod repo_harness {
|
||||
use bytes::BytesMut;
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
@@ -439,7 +439,9 @@ pub mod repo_harness {
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
|
||||
lazy_static! {
|
||||
static ref LOCK: RwLock<()> = RwLock::new(());
|
||||
}
|
||||
|
||||
impl From<TenantConf> for TenantConfOpt {
|
||||
fn from(tenant_conf: TenantConf) -> Self {
|
||||
@@ -587,10 +589,11 @@ mod tests {
|
||||
//use std::sync::Arc;
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
|
||||
lazy_static! {
|
||||
static ref TEST_KEY: Key = Key::from_slice(&hex!("112222222233333333444444445500000001"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_basic() -> Result<()> {
|
||||
|
||||
@@ -155,7 +155,8 @@ use std::{
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use once_cell::sync::{Lazy, OnceCell};
|
||||
use lazy_static::lazy_static;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorage};
|
||||
use tokio::{
|
||||
fs,
|
||||
@@ -183,8 +184,8 @@ use crate::{
|
||||
};
|
||||
|
||||
use metrics::{
|
||||
register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec,
|
||||
IntCounterVec, IntGauge,
|
||||
register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge,
|
||||
HistogramVec, IntCounter, IntCounterVec, IntGauge,
|
||||
};
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
@@ -192,34 +193,34 @@ use self::download::download_index_parts;
|
||||
pub use self::download::gather_tenant_timelines_index_parts;
|
||||
pub use self::download::TEMP_DOWNLOAD_EXTENSION;
|
||||
|
||||
static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
lazy_static! {
|
||||
static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!(
|
||||
"pageserver_remote_storage_remaining_sync_items",
|
||||
"Number of storage sync items left in the queue"
|
||||
)
|
||||
.expect("failed to register pageserver remote storage remaining sync items int gauge")
|
||||
});
|
||||
|
||||
static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
.expect("failed to register pageserver remote storage remaining sync items int gauge");
|
||||
static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!(
|
||||
"pageserver_remote_storage_fatal_task_failures_total",
|
||||
"Number of critically failed tasks"
|
||||
)
|
||||
.expect("failed to register pageserver remote storage remaining sync items int gauge");
|
||||
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_remote_storage_image_sync_seconds",
|
||||
"Time took to synchronize (download or upload) a whole pageserver image. \
|
||||
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
|
||||
&["tenant_id", "timeline_id", "operation_kind", "status"],
|
||||
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
|
||||
)
|
||||
.expect("failed to register pageserver image sync time histogram vec")
|
||||
});
|
||||
|
||||
static REMOTE_INDEX_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
.expect("failed to register pageserver image sync time histogram vec");
|
||||
static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!(
|
||||
"pageserver_remote_storage_remote_index_uploads_total",
|
||||
"Number of remote index uploads",
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to register pageserver remote index upload vec")
|
||||
});
|
||||
.expect("failed to register pageserver remote index upload vec");
|
||||
}
|
||||
|
||||
// TODO move ownership into a new PageserverState struct
|
||||
static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
|
||||
|
||||
/// A timeline status to share with pageserver's sync counterpart,
|
||||
|
||||
@@ -130,7 +130,6 @@ where
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let timelines = storage
|
||||
.list_prefixes(Some(tenant_storage_path))
|
||||
.await
|
||||
@@ -141,13 +140,6 @@ where
|
||||
)
|
||||
})?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
anyhow::bail!(
|
||||
"no timelines found on the remote storage for tenant {}",
|
||||
tenant_id
|
||||
)
|
||||
}
|
||||
|
||||
let mut sync_ids = HashSet::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::{fmt::Debug, path::PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use remote_storage::RemoteStorage;
|
||||
use tokio::fs;
|
||||
use tracing::{debug, error, info, warn};
|
||||
@@ -20,14 +20,14 @@ use crate::{
|
||||
};
|
||||
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||
|
||||
static NO_LAYERS_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
lazy_static! {
|
||||
static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!(
|
||||
"pageserver_remote_storage_no_layers_uploads_total",
|
||||
"Number of skipped uploads due to no layers",
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to register pageserver no layers upload vec")
|
||||
});
|
||||
.expect("failed to register pageserver no layers upload vec");
|
||||
}
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<P, S>(
|
||||
|
||||
@@ -25,27 +25,26 @@ use utils::lsn::Lsn;
|
||||
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
// TODO move ownership into a new PageserverState struct
|
||||
mod tenants_state {
|
||||
use anyhow::ensure;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{RwLock, RwLockReadGuard, RwLockWriteGuard},
|
||||
};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{debug, error};
|
||||
|
||||
use utils::zid::ZTenantId;
|
||||
|
||||
use crate::tenant_mgr::{LocalTimelineUpdate, Tenant};
|
||||
|
||||
static TENANTS: Lazy<RwLock<HashMap<ZTenantId, Tenant>>> =
|
||||
Lazy::new(|| RwLock::new(HashMap::new()));
|
||||
|
||||
/// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
|
||||
/// so that it can enable/disable corresponding processes.
|
||||
static TIMELINE_UPDATE_SENDER: Lazy<
|
||||
RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>>,
|
||||
> = Lazy::new(|| RwLock::new(None));
|
||||
lazy_static::lazy_static! {
|
||||
static ref TENANTS: RwLock<HashMap<ZTenantId, Tenant>> = RwLock::new(HashMap::new());
|
||||
/// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
|
||||
/// so that it can enable/disable corresponding processes.
|
||||
static ref TIMELINE_UPDATE_SENDER: RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>> = RwLock::new(None);
|
||||
}
|
||||
|
||||
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
TENANTS
|
||||
|
||||
@@ -87,6 +87,7 @@ async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
|
||||
);
|
||||
}
|
||||
|
||||
// TODO move ownership into a new PageserverState struct
|
||||
static START_GC_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
|
||||
static START_COMPACTION_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
|
||||
|
||||
|
||||
@@ -45,20 +45,22 @@ use tokio::sync::watch;
|
||||
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::shutdown_pageserver;
|
||||
|
||||
/// Each thread that we track is associated with a "thread ID". It's just
|
||||
/// an increasing number that we assign, not related to any system thread
|
||||
/// id.
|
||||
static NEXT_THREAD_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
|
||||
// TODO move ownership into a new PageserverState struct
|
||||
lazy_static! {
|
||||
/// Each thread that we track is associated with a "thread ID". It's just
|
||||
/// an increasing number that we assign, not related to any system thread
|
||||
/// id.
|
||||
static ref NEXT_THREAD_ID: AtomicU64 = AtomicU64::new(1);
|
||||
|
||||
/// Global registry of threads
|
||||
static THREADS: Lazy<Mutex<HashMap<u64, Arc<PageServerThread>>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
/// Global registry of threads
|
||||
static ref THREADS: Mutex<HashMap<u64, Arc<PageServerThread>>> = Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
// There is a Tokio watch channel for each thread, which can be used to signal the
|
||||
// thread that it needs to shut down. This thread local variable holds the receiving
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
//! This is similar to PostgreSQL's virtual file descriptor facility in
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
@@ -32,24 +32,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
1.0, // 1 sec
|
||||
];
|
||||
|
||||
static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
lazy_static! {
|
||||
static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_io_operations_seconds",
|
||||
"Time spent in IO operations",
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
STORAGE_IO_TIME_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
lazy_static! {
|
||||
static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
&["operation", "tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
|
||||
@@ -30,6 +30,8 @@ use anyhow::Result;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use tracing::*;
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::walrecord::*;
|
||||
@@ -46,6 +48,8 @@ pub struct WalIngest<'a, T: DatadirTimeline> {
|
||||
|
||||
checkpoint: CheckPoint,
|
||||
checkpoint_modified: bool,
|
||||
|
||||
relsize_cache: HashMap<RelTag, BlockNumber>,
|
||||
}
|
||||
|
||||
impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
@@ -60,13 +64,13 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
timeline,
|
||||
checkpoint,
|
||||
checkpoint_modified: false,
|
||||
relsize_cache: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Decode a PostgreSQL WAL record and store it in the repository, in the given timeline.
|
||||
///
|
||||
/// This function updates `lsn` field of `DatadirModification`
|
||||
///
|
||||
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
|
||||
/// relations/pages that the record affects.
|
||||
@@ -78,7 +82,6 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
modification: &mut DatadirModification<T>,
|
||||
decoded: &mut DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
modification.lsn = lsn;
|
||||
decode_wal_record(recdata, decoded).context("failed decoding wal record")?;
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
@@ -257,7 +260,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
modification.commit()?;
|
||||
modification.commit(lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -405,7 +408,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
// replaying it would fail to find the previous image of the page, because
|
||||
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
|
||||
// record if it doesn't.
|
||||
let vm_size = self.get_relsize(vm_rel, modification.lsn)?;
|
||||
let vm_size = self.get_relsize(vm_rel)?;
|
||||
if let Some(blknum) = new_vm_blk {
|
||||
if blknum >= vm_size {
|
||||
new_vm_blk = None;
|
||||
@@ -877,6 +880,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
modification: &mut DatadirModification<T>,
|
||||
rel: RelTag,
|
||||
) -> Result<()> {
|
||||
self.relsize_cache.insert(rel, 0);
|
||||
modification.put_rel_creation(rel, 0)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -912,6 +916,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
nblocks: BlockNumber,
|
||||
) -> Result<()> {
|
||||
modification.put_rel_truncation(rel, nblocks)?;
|
||||
self.relsize_cache.insert(rel, nblocks);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -921,16 +926,23 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
rel: RelTag,
|
||||
) -> Result<()> {
|
||||
modification.put_rel_drop(rel)?;
|
||||
self.relsize_cache.remove(&rel);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result<BlockNumber> {
|
||||
let nblocks = if !self.timeline.get_rel_exists(rel, lsn)? {
|
||||
0
|
||||
fn get_relsize(&mut self, rel: RelTag) -> Result<BlockNumber> {
|
||||
if let Some(nblocks) = self.relsize_cache.get(&rel) {
|
||||
Ok(*nblocks)
|
||||
} else {
|
||||
self.timeline.get_rel_size(rel, lsn)?
|
||||
};
|
||||
Ok(nblocks)
|
||||
let last_lsn = self.timeline.get_last_record_lsn();
|
||||
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
|
||||
0
|
||||
} else {
|
||||
self.timeline.get_rel_size(rel, last_lsn)?
|
||||
};
|
||||
self.relsize_cache.insert(rel, nblocks);
|
||||
Ok(nblocks)
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_rel_extend(
|
||||
@@ -940,16 +952,22 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
blknum: BlockNumber,
|
||||
) -> Result<()> {
|
||||
let new_nblocks = blknum + 1;
|
||||
// Check if the relation exists. We implicitly create relations on first
|
||||
// record.
|
||||
// TODO: would be nice if to be more explicit about it
|
||||
let last_lsn = modification.lsn;
|
||||
let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
modification.put_rel_creation(rel, 0)?;
|
||||
0
|
||||
let old_nblocks = if let Some(nblocks) = self.relsize_cache.get(&rel) {
|
||||
*nblocks
|
||||
} else {
|
||||
self.timeline.get_rel_size(rel, last_lsn)?
|
||||
// Check if the relation exists. We implicitly create relations on first
|
||||
// record.
|
||||
// TODO: would be nice if to be more explicit about it
|
||||
let last_lsn = self.timeline.get_last_record_lsn();
|
||||
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
modification.put_rel_creation(rel, 0)?;
|
||||
0
|
||||
} else {
|
||||
self.timeline.get_rel_size(rel, last_lsn)?
|
||||
};
|
||||
self.relsize_cache.insert(rel, nblocks);
|
||||
nblocks
|
||||
};
|
||||
|
||||
if new_nblocks > old_nblocks {
|
||||
@@ -960,6 +978,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
for gap_blknum in old_nblocks..blknum {
|
||||
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
|
||||
}
|
||||
self.relsize_cache.insert(rel, new_nblocks);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1050,10 +1069,10 @@ mod tests {
|
||||
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
|
||||
|
||||
fn init_walingest_test<T: DatadirTimeline>(tline: &T) -> Result<WalIngest<T>> {
|
||||
let mut m = tline.begin_modification(Lsn(0x10));
|
||||
let mut m = tline.begin_modification();
|
||||
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
|
||||
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x10))?;
|
||||
let walingest = WalIngest::new(tline, Lsn(0x10))?;
|
||||
|
||||
Ok(walingest)
|
||||
@@ -1065,19 +1084,19 @@ mod tests {
|
||||
let tline = create_test_timeline(repo, TIMELINE_ID)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_creation(&mut m, TESTREL_A)?;
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
|
||||
m.commit()?;
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
m.commit(Lsn(0x20))?;
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?;
|
||||
m.commit()?;
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
m.commit(Lsn(0x30))?;
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?;
|
||||
m.commit()?;
|
||||
let mut m = tline.begin_modification(Lsn(0x50));
|
||||
m.commit(Lsn(0x40))?;
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x50))?;
|
||||
|
||||
assert_current_logical_size(&*tline, Lsn(0x50));
|
||||
|
||||
@@ -1123,9 +1142,9 @@ mod tests {
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
let mut m = tline.begin_modification(Lsn(0x60));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x60))?;
|
||||
assert_current_logical_size(&*tline, Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
@@ -1147,15 +1166,15 @@ mod tests {
|
||||
);
|
||||
|
||||
// Truncate to zero length
|
||||
let mut m = tline.begin_modification(Lsn(0x68));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x68))?;
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0);
|
||||
|
||||
// Extend from 0 to 2 blocks, leaving a gap
|
||||
let mut m = tline.begin_modification(Lsn(0x70));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x70))?;
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2);
|
||||
assert_eq!(
|
||||
tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?,
|
||||
@@ -1167,9 +1186,9 @@ mod tests {
|
||||
);
|
||||
|
||||
// Extend a lot more, leaving a big gap that spans across segments
|
||||
let mut m = tline.begin_modification(Lsn(0x80));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x80))?;
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501);
|
||||
for blk in 2..1500 {
|
||||
assert_eq!(
|
||||
@@ -1193,18 +1212,18 @@ mod tests {
|
||||
let tline = create_test_timeline(repo, TIMELINE_ID)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x20))?;
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1);
|
||||
|
||||
// Drop rel
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_drop(&mut m, TESTREL_A)?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x30))?;
|
||||
|
||||
// Check that rel is not visible anymore
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
|
||||
@@ -1213,9 +1232,9 @@ mod tests {
|
||||
//assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none());
|
||||
|
||||
// Re-create it
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x40))?;
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
|
||||
@@ -1235,12 +1254,12 @@ mod tests {
|
||||
|
||||
// Create a 20 MB relation (the size is arbitrary)
|
||||
let relsize = 20 * 1024 * 1024 / 8192;
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
let mut m = tline.begin_modification();
|
||||
for blkno in 0..relsize {
|
||||
let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
|
||||
}
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x20))?;
|
||||
|
||||
// The relation was created at LSN 20, not visible at LSN 1 yet.
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
|
||||
@@ -1261,9 +1280,9 @@ mod tests {
|
||||
|
||||
// Truncate relation so that second segment was dropped
|
||||
// - only leave one page
|
||||
let mut m = tline.begin_modification(Lsn(0x60));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(0x60))?;
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1);
|
||||
@@ -1291,12 +1310,12 @@ mod tests {
|
||||
// Extend relation again.
|
||||
// Add enough blocks to create second segment
|
||||
let lsn = Lsn(0x80);
|
||||
let mut m = tline.begin_modification(lsn);
|
||||
let mut m = tline.begin_modification();
|
||||
for blkno in 0..relsize {
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
|
||||
}
|
||||
m.commit()?;
|
||||
m.commit(lsn)?;
|
||||
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize);
|
||||
@@ -1324,10 +1343,10 @@ mod tests {
|
||||
let mut lsn = 0x10;
|
||||
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
let mut m = tline.begin_modification();
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(lsn))?;
|
||||
}
|
||||
|
||||
assert_current_logical_size(&*tline, Lsn(lsn));
|
||||
@@ -1339,9 +1358,9 @@ mod tests {
|
||||
|
||||
// Truncate one block
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(lsn))?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE
|
||||
@@ -1350,9 +1369,9 @@ mod tests {
|
||||
|
||||
// Truncate another block
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(lsn))?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE - 1
|
||||
@@ -1364,9 +1383,9 @@ mod tests {
|
||||
let mut size: i32 = 3000;
|
||||
while size >= 0 {
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
let mut m = tline.begin_modification();
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?;
|
||||
m.commit()?;
|
||||
m.commit(Lsn(lsn))?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
|
||||
size as BlockNumber
|
||||
|
||||
@@ -66,7 +66,7 @@ pub fn init_wal_receiver_main_thread(
|
||||
);
|
||||
let broker_prefix = &conf.broker_etcd_prefix;
|
||||
info!(
|
||||
"Starting wal receiver main thread, etcd endpoints: {}",
|
||||
"Starting wal receiver main thread, etdc endpoints: {}",
|
||||
etcd_endpoints.iter().map(Url::to_string).join(", ")
|
||||
);
|
||||
|
||||
|
||||
@@ -154,7 +154,7 @@ pub async fn handle_walreceiver_connection(
|
||||
|
||||
{
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
let mut modification = timeline.begin_modification(endlsn);
|
||||
let mut modification = timeline.begin_modification();
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// let _enter = info_span!("processing record", lsn = %lsn).entered();
|
||||
|
||||
|
||||
@@ -20,8 +20,8 @@
|
||||
//!
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use nix::poll::*;
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::Serialize;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
@@ -105,27 +105,21 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
// We collect the time spent in actual WAL redo ('redo'), and time waiting
|
||||
// for access to the postgres process ('wait') since there is only one for
|
||||
// each tenant.
|
||||
|
||||
static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
lazy_static! {
|
||||
static ref WAL_REDO_TIME: Histogram =
|
||||
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
|
||||
.expect("failed to define a metric");
|
||||
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the WAL redo process"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
.expect("failed to define a metric");
|
||||
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
|
||||
"pageserver_replayed_wal_records_total",
|
||||
"Number of WAL records replayed in WAL redo process"
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
///
|
||||
/// This is the real implementation that uses a Postgres process to
|
||||
|
||||
@@ -14,7 +14,7 @@ hashbrown = "0.11.2"
|
||||
hex = "0.4.3"
|
||||
hmac = "0.12.1"
|
||||
hyper = "0.14"
|
||||
once_cell = "1.13.0"
|
||||
lazy_static = "1.4.0"
|
||||
md5 = "0.7.0"
|
||||
parking_lot = "0.12"
|
||||
pin-project-lite = "0.2.7"
|
||||
|
||||
@@ -12,12 +12,13 @@ use crate::{
|
||||
stream::PqStream,
|
||||
waiters::{self, Waiter, Waiters},
|
||||
};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
|
||||
static CPLANE_WAITERS: Lazy<Waiters<mgmt::ComputeReady>> = Lazy::new(Default::default);
|
||||
lazy_static! {
|
||||
static ref CPLANE_WAITERS: Waiters<mgmt::ComputeReady> = Default::default();
|
||||
}
|
||||
|
||||
/// Give caller an opportunity to wait for the cloud's reply.
|
||||
pub async fn with_waiter<R, T, E>(
|
||||
|
||||
@@ -4,8 +4,8 @@ use crate::config::{ProxyConfig, TlsConfig};
|
||||
use crate::stream::{MetricsStream, PqStream, Stream};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::TryFutureExt;
|
||||
use lazy_static::lazy_static;
|
||||
use metrics::{register_int_counter, IntCounter};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use utils::pq_proto::{BeMessage as Be, *};
|
||||
@@ -13,29 +13,23 @@ use utils::pq_proto::{BeMessage as Be, *};
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
const ERR_PROTO_VIOLATION: &str = "protocol violation";
|
||||
|
||||
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
lazy_static! {
|
||||
static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!(
|
||||
"proxy_accepted_connections_total",
|
||||
"Number of TCP client connections accepted."
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
.unwrap();
|
||||
static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!(
|
||||
"proxy_closed_connections_total",
|
||||
"Number of TCP client connections closed."
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
.unwrap();
|
||||
static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!(
|
||||
"proxy_io_bytes_total",
|
||||
"Number of bytes sent/received between any client and backend."
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
/// A small combinator for pluggable error logging.
|
||||
async fn log_error<R, F>(future: F) -> F::Output
|
||||
|
||||
@@ -1,8 +1,4 @@
|
||||
[pytest]
|
||||
filterwarnings =
|
||||
error::pytest.PytestUnhandledThreadExceptionWarning
|
||||
error::UserWarning
|
||||
ignore:record_property is incompatible with junit_family:pytest.PytestWarning
|
||||
addopts =
|
||||
-m 'not remote_cluster'
|
||||
markers =
|
||||
|
||||
@@ -9,6 +9,7 @@ bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
hyper = "0.14"
|
||||
fs2 = "0.4.3"
|
||||
lazy_static = "1.4.0"
|
||||
serde_json = "1"
|
||||
tracing = "0.1.27"
|
||||
clap = "3.0"
|
||||
@@ -28,7 +29,7 @@ const_format = "0.2.21"
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
git-version = "0.3.5"
|
||||
async-trait = "0.1"
|
||||
once_cell = "1.13.0"
|
||||
once_cell = "1.10.0"
|
||||
toml_edit = { version = "0.13", features = ["easy"] }
|
||||
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Read, Write};
|
||||
@@ -26,15 +26,15 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
||||
const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
|
||||
pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
|
||||
|
||||
static PERSIST_CONTROL_FILE_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
lazy_static! {
|
||||
static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_persist_control_file_seconds",
|
||||
"Seconds to persist and sync control file, grouped by timeline",
|
||||
&["tenant_id", "timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_persist_control_file_seconds histogram vec")
|
||||
});
|
||||
.expect("Failed to register safekeeper_persist_control_file_seconds histogram vec");
|
||||
}
|
||||
|
||||
/// Storage should keep actual state inside of it. It should implement Deref
|
||||
/// trait to access state fields and have persist method for updating that state.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
use anyhow::{bail, Context, Result};
|
||||
|
||||
use etcd_broker::subscription_value::SkTimelineInfo;
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use postgres_ffi::xlog_utils::XLogSegNo;
|
||||
|
||||
use serde::Serialize;
|
||||
@@ -559,12 +559,12 @@ struct GlobalTimelinesState {
|
||||
wal_backup_launcher_tx: Option<Sender<ZTenantTimelineId>>,
|
||||
}
|
||||
|
||||
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
|
||||
Mutex::new(GlobalTimelinesState {
|
||||
lazy_static! {
|
||||
static ref TIMELINES_STATE: Mutex<GlobalTimelinesState> = Mutex::new(GlobalTimelinesState {
|
||||
timelines: HashMap::new(),
|
||||
wal_backup_launcher_tx: None,
|
||||
})
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Serialize)]
|
||||
pub struct TimelineDeleteForceResult {
|
||||
|
||||
@@ -12,7 +12,7 @@ use std::io::{self, Seek, SeekFrom};
|
||||
use std::pin::Pin;
|
||||
use tokio::io::AsyncRead;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use lazy_static::lazy_static;
|
||||
use postgres_ffi::xlog_utils::{
|
||||
find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, PG_TLI,
|
||||
};
|
||||
@@ -38,44 +38,31 @@ use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECOND
|
||||
|
||||
use tokio::io::{AsyncReadExt, AsyncSeekExt};
|
||||
|
||||
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
|
||||
// i64 is faster than f64, so update to u64 when available.
|
||||
static WRITE_WAL_BYTES: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
lazy_static! {
|
||||
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
|
||||
// i64 is faster than f64, so update to u64 when available.
|
||||
static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_write_wal_bytes",
|
||||
"Bytes written to WAL in a single request, grouped by timeline",
|
||||
&["tenant_id", "timeline_id"],
|
||||
vec![
|
||||
1.0,
|
||||
10.0,
|
||||
100.0,
|
||||
1024.0,
|
||||
8192.0,
|
||||
128.0 * 1024.0,
|
||||
1024.0 * 1024.0,
|
||||
10.0 * 1024.0 * 1024.0
|
||||
]
|
||||
vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0]
|
||||
)
|
||||
.expect("Failed to register safekeeper_write_wal_bytes histogram vec")
|
||||
});
|
||||
static WRITE_WAL_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
.expect("Failed to register safekeeper_write_wal_bytes histogram vec");
|
||||
static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_write_wal_seconds",
|
||||
"Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline",
|
||||
&["tenant_id", "timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_write_wal_seconds histogram vec")
|
||||
});
|
||||
static FLUSH_WAL_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
.expect("Failed to register safekeeper_write_wal_seconds histogram vec");
|
||||
static ref FLUSH_WAL_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_flush_wal_seconds",
|
||||
"Seconds spent syncing WAL to a disk, grouped by timeline",
|
||||
&["tenant_id", "timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_flush_wal_seconds histogram vec")
|
||||
});
|
||||
.expect("Failed to register safekeeper_flush_wal_seconds histogram vec");
|
||||
}
|
||||
|
||||
struct WalStorageMetrics {
|
||||
write_wal_bytes: Histogram,
|
||||
|
||||
@@ -1,708 +0,0 @@
|
||||
#
|
||||
# Script to export tenants from one pageserver and import them into another page server.
|
||||
#
|
||||
# Outline of steps:
|
||||
# 1. Get `(last_lsn, prev_lsn)` from old pageserver
|
||||
# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file
|
||||
# 3. This tar file might be missing relation files for empty relations, if the pageserver
|
||||
# is old enough (we didn't always store those). So to recreate them, we start a local
|
||||
# vanilla postgres on this basebackup and ask it what relations should exist, then touch
|
||||
# any missing files and re-pack the tar.
|
||||
# TODO This functionality is no longer needed, so we can delete it later if we don't
|
||||
# end up using the same utils for the pg 15 upgrade. Not sure.
|
||||
# 4. We import the patched basebackup into a new pageserver
|
||||
# 5. We export again via fullbackup, now from the new pageserver and compare the returned
|
||||
# tar file with the one we imported. This confirms that we imported everything that was
|
||||
# exported, but doesn't guarantee correctness (what if we didn't **export** everything
|
||||
# initially?)
|
||||
# 6. We wait for the new pageserver's remote_consistent_lsn to catch up
|
||||
#
|
||||
# For more context on how to use this, see:
|
||||
# https://github.com/neondatabase/cloud/wiki/Storage-format-migration
|
||||
|
||||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
from contextlib import closing
|
||||
import psycopg2
|
||||
import subprocess
|
||||
import argparse
|
||||
import time
|
||||
import requests
|
||||
import uuid
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple
|
||||
|
||||
###############################################
|
||||
### client-side utils copied from test fixtures
|
||||
###############################################
|
||||
|
||||
Env = Dict[str, str]
|
||||
|
||||
_global_counter = 0
|
||||
|
||||
|
||||
def global_counter() -> int:
|
||||
""" A really dumb global counter.
|
||||
This is useful for giving output files a unique number, so if we run the
|
||||
same command multiple times we can keep their output separate.
|
||||
"""
|
||||
global _global_counter
|
||||
_global_counter += 1
|
||||
return _global_counter
|
||||
|
||||
|
||||
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
|
||||
""" Run a process and capture its output
|
||||
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
where "cmd" is the name of the program and NNN is an incrementing
|
||||
counter.
|
||||
If those files already exist, we will overwrite them.
|
||||
Returns basepath for files with captured output.
|
||||
"""
|
||||
assert type(cmd) is list
|
||||
base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
|
||||
basepath = os.path.join(capture_dir, base)
|
||||
stdout_filename = basepath + '.stdout'
|
||||
stderr_filename = basepath + '.stderr'
|
||||
|
||||
with open(stdout_filename, 'w') as stdout_f:
|
||||
with open(stderr_filename, 'w') as stderr_f:
|
||||
print('(capturing output to "{}.stdout")'.format(base))
|
||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||
|
||||
return basepath
|
||||
|
||||
|
||||
class PgBin:
|
||||
""" A helper class for executing postgres binaries """
|
||||
def __init__(self, log_dir: Path, pg_distrib_dir):
|
||||
self.log_dir = log_dir
|
||||
self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin')
|
||||
self.env = os.environ.copy()
|
||||
self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib')
|
||||
|
||||
def _fixpath(self, command: List[str]):
|
||||
if '/' not in command[0]:
|
||||
command[0] = os.path.join(self.pg_bin_path, command[0])
|
||||
|
||||
def _build_env(self, env_add: Optional[Env]) -> Env:
|
||||
if env_add is None:
|
||||
return self.env
|
||||
env = self.env.copy()
|
||||
env.update(env_add)
|
||||
return env
|
||||
|
||||
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
|
||||
"""
|
||||
Run one of the postgres binaries.
|
||||
The command should be in list form, e.g. ['pgbench', '-p', '55432']
|
||||
All the necessary environment variables will be set.
|
||||
If the first argument (the command name) doesn't include a path (no '/'
|
||||
characters present), then it will be edited to include the correct path.
|
||||
If you want stdout/stderr captured to files, use `run_capture` instead.
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess.run(command, env=env, cwd=cwd, check=True)
|
||||
|
||||
def run_capture(self,
|
||||
command: List[str],
|
||||
env: Optional[Env] = None,
|
||||
cwd: Optional[str] = None,
|
||||
**kwargs: Any) -> str:
|
||||
"""
|
||||
Run one of the postgres binaries, with stderr and stdout redirected to a file.
|
||||
This is just like `run`, but for chatty programs. Returns basepath for files
|
||||
with captured output.
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
return subprocess_capture(str(self.log_dir),
|
||||
command,
|
||||
env=env,
|
||||
cwd=cwd,
|
||||
check=True,
|
||||
**kwargs)
|
||||
|
||||
|
||||
class PgProtocol:
|
||||
""" Reusable connection logic """
|
||||
def __init__(self, **kwargs):
|
||||
self.default_options = kwargs
|
||||
|
||||
def conn_options(self, **kwargs):
|
||||
conn_options = self.default_options.copy()
|
||||
if 'dsn' in kwargs:
|
||||
conn_options.update(parse_dsn(kwargs['dsn']))
|
||||
conn_options.update(kwargs)
|
||||
|
||||
# Individual statement timeout in seconds. 2 minutes should be
|
||||
# enough for our tests, but if you need a longer, you can
|
||||
# change it by calling "SET statement_timeout" after
|
||||
# connecting.
|
||||
if 'options' in conn_options:
|
||||
conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options']
|
||||
else:
|
||||
conn_options['options'] = "-cstatement_timeout=120s"
|
||||
return conn_options
|
||||
|
||||
# autocommit=True here by default because that's what we need most of the time
|
||||
def connect(self, autocommit=True, **kwargs) -> PgConnection:
|
||||
"""
|
||||
Connect to the node.
|
||||
Returns psycopg2's connection object.
|
||||
This method passes all extra params to connstr.
|
||||
"""
|
||||
conn = psycopg2.connect(**self.conn_options(**kwargs))
|
||||
|
||||
# WARNING: this setting affects *all* tests!
|
||||
conn.autocommit = autocommit
|
||||
return conn
|
||||
|
||||
def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]:
|
||||
"""
|
||||
Execute query against the node and return all rows.
|
||||
This method passes all extra params to connstr.
|
||||
"""
|
||||
return self.safe_psql_many([query], **kwargs)[0]
|
||||
|
||||
def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
|
||||
"""
|
||||
Execute queries against the node and return all rows.
|
||||
This method passes all extra params to connstr.
|
||||
"""
|
||||
result: List[List[Any]] = []
|
||||
with closing(self.connect(**kwargs)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
for query in queries:
|
||||
print(f"Executing query: {query}")
|
||||
cur.execute(query)
|
||||
|
||||
if cur.description is None:
|
||||
result.append([]) # query didn't return data
|
||||
else:
|
||||
result.append(cast(List[Any], cur.fetchall()))
|
||||
return result
|
||||
|
||||
|
||||
class VanillaPostgres(PgProtocol):
|
||||
def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
|
||||
super().__init__(host='localhost', port=port, dbname='postgres')
|
||||
self.pgdatadir = pgdatadir
|
||||
self.pg_bin = pg_bin
|
||||
self.running = False
|
||||
if init:
|
||||
self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)])
|
||||
self.configure([f"port = {port}\n"])
|
||||
|
||||
def configure(self, options: List[str]):
|
||||
"""Append lines into postgresql.conf file."""
|
||||
assert not self.running
|
||||
with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file:
|
||||
conf_file.write("\n".join(options))
|
||||
|
||||
def start(self, log_path: Optional[str] = None):
|
||||
assert not self.running
|
||||
self.running = True
|
||||
|
||||
if log_path is None:
|
||||
log_path = os.path.join(self.pgdatadir, "pg.log")
|
||||
|
||||
self.pg_bin.run_capture(
|
||||
['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start'])
|
||||
|
||||
def stop(self):
|
||||
assert self.running
|
||||
self.running = False
|
||||
self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop'])
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
if self.running:
|
||||
self.stop()
|
||||
|
||||
|
||||
class NeonPageserverApiException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class NeonPageserverHttpClient(requests.Session):
|
||||
def __init__(self, host, port):
|
||||
super().__init__()
|
||||
self.host = host
|
||||
self.port = port
|
||||
|
||||
def verbose_error(self, res: requests.Response):
|
||||
try:
|
||||
res.raise_for_status()
|
||||
except requests.RequestException as e:
|
||||
try:
|
||||
msg = res.json()['msg']
|
||||
except:
|
||||
msg = ''
|
||||
raise NeonPageserverApiException(msg) from e
|
||||
|
||||
def check_status(self):
|
||||
self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status()
|
||||
|
||||
def tenant_list(self):
|
||||
res = self.get(f"http://{self.host}:{self.port}/v1/tenant")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, list)
|
||||
return res_json
|
||||
|
||||
def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
|
||||
res = self.post(
|
||||
f"http://{self.host}:{self.port}/v1/tenant",
|
||||
json={
|
||||
'new_tenant_id': new_tenant_id.hex,
|
||||
},
|
||||
)
|
||||
|
||||
if res.status_code == 409:
|
||||
if ok_if_exists:
|
||||
print(f'could not create tenant: already exists for id {new_tenant_id}')
|
||||
else:
|
||||
res.raise_for_status()
|
||||
elif res.status_code == 201:
|
||||
print(f'created tenant {new_tenant_id}')
|
||||
else:
|
||||
self.verbose_error(res)
|
||||
|
||||
return new_tenant_id
|
||||
|
||||
def timeline_list(self, tenant_id: uuid.UUID):
|
||||
res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, list)
|
||||
return res_json
|
||||
|
||||
def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]:
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1"
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
|
||||
def lsn_to_hex(num: int) -> str:
|
||||
""" Convert lsn from int to standard hex notation. """
|
||||
return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
|
||||
|
||||
|
||||
def lsn_from_hex(lsn_hex: str) -> int:
|
||||
""" Convert lsn from hex notation to int. """
|
||||
l, r = lsn_hex.split('/')
|
||||
return (int(l, 16) << 32) + int(r, 16)
|
||||
|
||||
|
||||
def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient,
|
||||
tenant: uuid.UUID,
|
||||
timeline: uuid.UUID) -> int:
|
||||
detail = pageserver_http_client.timeline_detail(tenant, timeline)
|
||||
|
||||
if detail['remote'] is None:
|
||||
# No remote information at all. This happens right after creating
|
||||
# a timeline, before any part of it has been uploaded to remote
|
||||
# storage yet.
|
||||
return 0
|
||||
else:
|
||||
lsn_str = detail['remote']['remote_consistent_lsn']
|
||||
assert isinstance(lsn_str, str)
|
||||
return lsn_from_hex(lsn_str)
|
||||
|
||||
|
||||
def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient,
|
||||
tenant: uuid.UUID,
|
||||
timeline: uuid.UUID,
|
||||
lsn: int):
|
||||
"""waits for local timeline upload up to specified lsn"""
|
||||
for i in range(10):
|
||||
current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
return
|
||||
print("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
|
||||
lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1))
|
||||
time.sleep(1)
|
||||
|
||||
raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
|
||||
lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
|
||||
|
||||
|
||||
##############
|
||||
# End of utils
|
||||
##############
|
||||
|
||||
|
||||
def pack_base(log_dir, restored_dir, output_tar):
|
||||
"""Create tar file from basebackup, being careful to produce relative filenames."""
|
||||
tmp_tar_name = "tmp.tar"
|
||||
tmp_tar_path = os.path.join(restored_dir, tmp_tar_name)
|
||||
cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir)
|
||||
# We actually cd into the dir and call tar from there. If we call tar from
|
||||
# outside we won't encode filenames as relative, and they won't parse well
|
||||
# on import.
|
||||
subprocess_capture(log_dir, cmd, cwd=restored_dir)
|
||||
shutil.move(tmp_tar_path, output_tar)
|
||||
|
||||
|
||||
def reconstruct_paths(log_dir, pg_bin, base_tar):
|
||||
"""Reconstruct what relation files should exist in the datadir by querying postgres."""
|
||||
with tempfile.TemporaryDirectory() as restored_dir:
|
||||
# Unpack the base tar
|
||||
subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir])
|
||||
|
||||
# Start a vanilla postgres from the given datadir and query it to find
|
||||
# what relfiles should exist, but possibly don't.
|
||||
port = "55439" # Probably free
|
||||
with VanillaPostgres(restored_dir, pg_bin, port, init=False) as vanilla_pg:
|
||||
vanilla_pg.configure([f"port={port}"])
|
||||
vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log"))
|
||||
|
||||
# Create database based on template0 because we can't connect to template0
|
||||
query = "create database template0copy template template0"
|
||||
vanilla_pg.safe_psql(query, user="cloud_admin")
|
||||
vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin")
|
||||
|
||||
# Get all databases
|
||||
query = "select oid, datname from pg_database"
|
||||
oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin")
|
||||
template0_oid = [
|
||||
oid for (oid, database) in oid_dbname_pairs if database == "template0"
|
||||
][0]
|
||||
|
||||
# Get rel paths for each database
|
||||
for oid, database in oid_dbname_pairs:
|
||||
if database == "template0":
|
||||
# We can't connect to template0
|
||||
continue
|
||||
|
||||
query = "select relname, pg_relation_filepath(oid) from pg_class"
|
||||
result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database)
|
||||
for relname, filepath in result:
|
||||
if filepath is not None:
|
||||
|
||||
if database == "template0copy":
|
||||
# Add all template0copy paths to template0
|
||||
prefix = f"base/{oid}/"
|
||||
if filepath.startswith(prefix):
|
||||
suffix = filepath[len(prefix):]
|
||||
yield f"base/{template0_oid}/{suffix}"
|
||||
elif filepath.startswith("global"):
|
||||
print(f"skipping {database} global file {filepath}")
|
||||
else:
|
||||
raise AssertionError
|
||||
else:
|
||||
yield filepath
|
||||
|
||||
|
||||
def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths):
|
||||
"""Add the appropriate empty files to a basebadkup tar."""
|
||||
with tempfile.TemporaryDirectory() as restored_dir:
|
||||
# Unpack the base tar
|
||||
subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir])
|
||||
|
||||
# Touch files that don't exist
|
||||
for path in paths:
|
||||
absolute_path = os.path.join(restored_dir, path)
|
||||
exists = os.path.exists(absolute_path)
|
||||
if not exists:
|
||||
print(f"File {absolute_path} didn't exist. Creating..")
|
||||
Path(absolute_path).touch()
|
||||
|
||||
# Repackage
|
||||
pack_base(log_dir, restored_dir, output_tar)
|
||||
|
||||
|
||||
# HACK This is a workaround for exporting from old pageservers that
|
||||
# can't export empty relations. In this case we need to start
|
||||
# a vanilla postgres from the exported datadir, and query it
|
||||
# to see what empty relations are missing, and then create
|
||||
# those empty files before importing.
|
||||
def add_missing_rels(base_tar, output_tar, log_dir, pg_bin):
|
||||
reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar))
|
||||
touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths)
|
||||
|
||||
|
||||
def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
|
||||
conn = psycopg2.connect(pageserver_connstr)
|
||||
conn.autocommit = True
|
||||
with conn.cursor() as cur:
|
||||
cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
|
||||
cur.execute(cmd)
|
||||
res = cur.fetchone()
|
||||
prev_lsn = res[0]
|
||||
last_lsn = res[1]
|
||||
conn.close()
|
||||
|
||||
return last_lsn, prev_lsn
|
||||
|
||||
|
||||
def import_timeline(args,
|
||||
psql_path,
|
||||
pageserver_connstr,
|
||||
pageserver_http,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
last_lsn,
|
||||
prev_lsn,
|
||||
tar_filename):
|
||||
# Import timelines to new pageserver
|
||||
import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}"
|
||||
full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """
|
||||
|
||||
stderr_filename2 = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr")
|
||||
stdout_filename = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout")
|
||||
|
||||
print(f"Running: {full_cmd}")
|
||||
|
||||
with open(stdout_filename, 'w') as stdout_f:
|
||||
with open(stderr_filename2, 'w') as stderr_f:
|
||||
print(f"(capturing output to {stdout_filename})")
|
||||
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir)
|
||||
subprocess.run(full_cmd,
|
||||
stdout=stdout_f,
|
||||
stderr=stderr_f,
|
||||
env=pg_bin._build_env(None),
|
||||
shell=True,
|
||||
check=True)
|
||||
|
||||
print(f"Done import")
|
||||
|
||||
# Wait until pageserver persists the files
|
||||
wait_for_upload(pageserver_http,
|
||||
uuid.UUID(tenant_id),
|
||||
uuid.UUID(timeline_id),
|
||||
lsn_from_hex(last_lsn))
|
||||
|
||||
|
||||
def export_timeline(args,
|
||||
psql_path,
|
||||
pageserver_connstr,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
last_lsn,
|
||||
prev_lsn,
|
||||
tar_filename):
|
||||
# Choose filenames
|
||||
incomplete_filename = tar_filename + ".incomplete"
|
||||
stderr_filename = path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr")
|
||||
|
||||
# Construct export command
|
||||
query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}"
|
||||
cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query]
|
||||
|
||||
# Run export command
|
||||
print(f"Running: {cmd}")
|
||||
with open(incomplete_filename, 'w') as stdout_f:
|
||||
with open(stderr_filename, 'w') as stderr_f:
|
||||
print(f"(capturing output to {incomplete_filename})")
|
||||
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir)
|
||||
subprocess.run(cmd,
|
||||
stdout=stdout_f,
|
||||
stderr=stderr_f,
|
||||
env=pg_bin._build_env(None),
|
||||
check=True)
|
||||
|
||||
# Add missing rels
|
||||
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir)
|
||||
add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin)
|
||||
|
||||
# Log more info
|
||||
file_size = os.path.getsize(tar_filename)
|
||||
print(f"Done export: {tar_filename}, size {file_size}")
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
psql_path = str(Path(args.pg_distrib_dir) / "bin" / "psql")
|
||||
|
||||
old_pageserver_host = args.old_pageserver_host
|
||||
new_pageserver_host = args.new_pageserver_host
|
||||
|
||||
old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port)
|
||||
old_http_client.check_status()
|
||||
old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}"
|
||||
|
||||
new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port)
|
||||
new_http_client.check_status()
|
||||
new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}"
|
||||
|
||||
for tenant_id in args.tenants:
|
||||
print(f"Tenant: {tenant_id}")
|
||||
timelines = old_http_client.timeline_list(uuid.UUID(tenant_id))
|
||||
print(f"Timelines: {timelines}")
|
||||
|
||||
# Create tenant in new pageserver
|
||||
if args.only_import is False and not args.timelines:
|
||||
new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists)
|
||||
|
||||
for timeline in timelines:
|
||||
# Skip timelines we don't need to export
|
||||
if args.timelines and timeline['timeline_id'] not in args.timelines:
|
||||
print(f"Skipping timeline {timeline['timeline_id']}")
|
||||
continue
|
||||
|
||||
# Choose filenames
|
||||
tar_filename = path.join(args.work_dir,
|
||||
f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar")
|
||||
|
||||
# Export timeline from old pageserver
|
||||
if args.only_import is False:
|
||||
last_lsn, prev_lsn = get_rlsn(
|
||||
old_pageserver_connstr,
|
||||
timeline['tenant_id'],
|
||||
timeline['timeline_id'],
|
||||
)
|
||||
export_timeline(
|
||||
args,
|
||||
psql_path,
|
||||
old_pageserver_connstr,
|
||||
timeline['tenant_id'],
|
||||
timeline['timeline_id'],
|
||||
last_lsn,
|
||||
prev_lsn,
|
||||
tar_filename,
|
||||
)
|
||||
|
||||
# Import into new pageserver
|
||||
import_timeline(
|
||||
args,
|
||||
psql_path,
|
||||
new_pageserver_connstr,
|
||||
new_http_client,
|
||||
timeline['tenant_id'],
|
||||
timeline['timeline_id'],
|
||||
last_lsn,
|
||||
prev_lsn,
|
||||
tar_filename,
|
||||
)
|
||||
|
||||
# Re-export and compare
|
||||
re_export_filename = tar_filename + ".reexport"
|
||||
export_timeline(args,
|
||||
psql_path,
|
||||
new_pageserver_connstr,
|
||||
timeline['tenant_id'],
|
||||
timeline['timeline_id'],
|
||||
last_lsn,
|
||||
prev_lsn,
|
||||
re_export_filename)
|
||||
|
||||
# Check the size is the same
|
||||
old_size = os.path.getsize(tar_filename),
|
||||
new_size = os.path.getsize(re_export_filename),
|
||||
if old_size != new_size:
|
||||
raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--tenant-id',
|
||||
dest='tenants',
|
||||
required=True,
|
||||
nargs='+',
|
||||
help='Id of the tenant to migrate. You can pass multiple arguments',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--timeline-id',
|
||||
dest='timelines',
|
||||
required=False,
|
||||
nargs='+',
|
||||
help='Id of the timeline to migrate. You can pass multiple arguments',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--from-host',
|
||||
dest='old_pageserver_host',
|
||||
required=True,
|
||||
help='Host of the pageserver to migrate data from',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--from-http-port',
|
||||
dest='old_pageserver_http_port',
|
||||
required=False,
|
||||
type=int,
|
||||
default=9898,
|
||||
help='HTTP port of the pageserver to migrate data from. Default: 9898',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--from-pg-port',
|
||||
dest='old_pageserver_pg_port',
|
||||
required=False,
|
||||
type=int,
|
||||
default=6400,
|
||||
help='pg port of the pageserver to migrate data from. Default: 6400',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--to-host',
|
||||
dest='new_pageserver_host',
|
||||
required=True,
|
||||
help='Host of the pageserver to migrate data to',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--to-http-port',
|
||||
dest='new_pageserver_http_port',
|
||||
required=False,
|
||||
default=9898,
|
||||
type=int,
|
||||
help='HTTP port of the pageserver to migrate data to. Default: 9898',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--to-pg-port',
|
||||
dest='new_pageserver_pg_port',
|
||||
required=False,
|
||||
default=6400,
|
||||
type=int,
|
||||
help='pg port of the pageserver to migrate data to. Default: 6400',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--ignore-tenant-exists',
|
||||
dest='ok_if_exists',
|
||||
required=False,
|
||||
help=
|
||||
'Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--pg-distrib-dir',
|
||||
dest='pg_distrib_dir',
|
||||
required=False,
|
||||
default='/usr/local/',
|
||||
help='Path where postgres binaries are installed. Default: /usr/local/',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--psql-path',
|
||||
dest='psql_path',
|
||||
required=False,
|
||||
default='/usr/local/bin/psql',
|
||||
help='Path to the psql binary. Default: /usr/local/bin/psql',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--only-import',
|
||||
dest='only_import',
|
||||
required=False,
|
||||
default=False,
|
||||
action='store_true',
|
||||
help='Skip export and tenant creation part',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--work-dir',
|
||||
dest='work_dir',
|
||||
required=True,
|
||||
default=False,
|
||||
help='directory where temporary tar files are stored',
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@@ -1,5 +1,6 @@
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
|
||||
@@ -167,5 +167,3 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
|
||||
# The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC.
|
||||
with pytest.raises(Exception, match="invalid branch start lsn"):
|
||||
env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn)
|
||||
|
||||
thread.join()
|
||||
|
||||
@@ -60,38 +60,17 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID):
|
||||
|
||||
def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
with env.pageserver.http_client() as client:
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
timeline_details = client.timeline_detail(tenant_id=tenant_id,
|
||||
timeline_id=timeline_id,
|
||||
include_non_incremental_logical_size=True)
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
|
||||
assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
|
||||
assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
|
||||
assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
|
||||
timeline_details = client.timeline_detail(tenant_id=tenant_id,
|
||||
timeline_id=timeline_id,
|
||||
include_non_incremental_logical_size=True)
|
||||
|
||||
|
||||
def expect_updated_msg_lsn(client: NeonPageserverHttpClient,
|
||||
tenant_id: UUID,
|
||||
timeline_id: UUID,
|
||||
prev_msg_lsn: Optional[int]) -> int:
|
||||
timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id)
|
||||
|
||||
# a successful `timeline_details` response must contain the below fields
|
||||
local_timeline_details = timeline_details['local']
|
||||
assert "wal_source_connstr" in local_timeline_details.keys()
|
||||
assert "last_received_msg_lsn" in local_timeline_details.keys()
|
||||
assert "last_received_msg_ts" in local_timeline_details.keys()
|
||||
|
||||
assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty"
|
||||
|
||||
last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"])
|
||||
assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \
|
||||
f"the last received message's LSN {last_msg_lsn} hasn't been updated \
|
||||
compared to the previous message's LSN {prev_msg_lsn}"
|
||||
|
||||
return last_msg_lsn
|
||||
assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
|
||||
assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
|
||||
assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
|
||||
|
||||
|
||||
# Test the WAL-receiver related fields in the response to `timeline_details` API call
|
||||
@@ -100,29 +79,44 @@ def expect_updated_msg_lsn(client: NeonPageserverHttpClient,
|
||||
# `timeline_details` now.
|
||||
def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
with env.pageserver.http_client() as client:
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
# Wait to make sure that we get a latest WAL receiver data.
|
||||
# We need to wait here because it's possible that we don't have access to
|
||||
# the latest WAL yet, when the `timeline_detail` API is first called.
|
||||
# See: https://github.com/neondatabase/neon/issues/1768.
|
||||
lsn = wait_until(number_of_iterations=5,
|
||||
interval=1,
|
||||
func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None))
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
|
||||
|
||||
# Make a DB modification then expect getting a new WAL receiver's data.
|
||||
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||
wait_until(number_of_iterations=5,
|
||||
interval=1,
|
||||
func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn))
|
||||
def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int:
|
||||
timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id)
|
||||
|
||||
# a successful `timeline_details` response must contain the below fields
|
||||
local_timeline_details = timeline_details['local']
|
||||
assert "wal_source_connstr" in local_timeline_details.keys()
|
||||
assert "last_received_msg_lsn" in local_timeline_details.keys()
|
||||
assert "last_received_msg_ts" in local_timeline_details.keys()
|
||||
|
||||
assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty"
|
||||
|
||||
last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"])
|
||||
assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \
|
||||
f"the last received message's LSN {last_msg_lsn} hasn't been updated \
|
||||
compared to the previous message's LSN {prev_msg_lsn}"
|
||||
|
||||
return last_msg_lsn
|
||||
|
||||
# Wait to make sure that we get a latest WAL receiver data.
|
||||
# We need to wait here because it's possible that we don't have access to
|
||||
# the latest WAL yet, when the `timeline_detail` API is first called.
|
||||
# See: https://github.com/neondatabase/neon/issues/1768.
|
||||
lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None))
|
||||
|
||||
# Make a DB modification then expect getting a new WAL receiver's data.
|
||||
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||
wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn))
|
||||
|
||||
|
||||
def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
with env.pageserver.http_client() as client:
|
||||
check_client(client, env.initial_tenant)
|
||||
client = env.pageserver.http_client()
|
||||
check_client(client, env.initial_tenant)
|
||||
|
||||
|
||||
def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -131,5 +125,5 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
|
||||
|
||||
management_token = env.auth_keys.generate_management_token()
|
||||
|
||||
with env.pageserver.http_client(auth_token=management_token) as client:
|
||||
check_client(client, env.initial_tenant)
|
||||
client = env.pageserver.http_client(auth_token=management_token)
|
||||
check_client(client, env.initial_tenant)
|
||||
|
||||
@@ -2,10 +2,11 @@
|
||||
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
|
||||
|
||||
import shutil, os
|
||||
from contextlib import closing
|
||||
from pathlib import Path
|
||||
import time
|
||||
from uuid import UUID
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, RemoteStorageKind, assert_timeline_local, available_remote_storages, wait_until, wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, assert_timeline_local, wait_until, wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.utils import lsn_from_hex, query_scalar
|
||||
import pytest
|
||||
@@ -28,19 +29,18 @@ import pytest
|
||||
# * queries the specific data, ensuring that it matches the one stored before
|
||||
#
|
||||
# The tests are done for all types of remote storage pageserver supports.
|
||||
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
|
||||
def test_remote_storage_backup_and_restore(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storatge_kind: RemoteStorageKind,
|
||||
):
|
||||
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3'])
|
||||
def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, storage_type: str):
|
||||
# Use this test to check more realistic SK ids: some etcd key parsing bugs were related,
|
||||
# and this test needs SK to write data to pageserver, so it will be visible
|
||||
neon_env_builder.safekeepers_id_start = 12
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storatge_kind,
|
||||
test_name='test_remote_storage_backup_and_restore',
|
||||
)
|
||||
if storage_type == 'local_fs':
|
||||
neon_env_builder.enable_local_fs_remote_storage()
|
||||
elif storage_type == 'mock_s3':
|
||||
neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore')
|
||||
else:
|
||||
raise RuntimeError(f'Unknown storage type: {storage_type}')
|
||||
|
||||
data_id = 1
|
||||
data_secret = 'very secret secret'
|
||||
@@ -110,7 +110,7 @@ def test_remote_storage_backup_and_restore(
|
||||
client.tenant_attach(UUID(tenant_id))
|
||||
|
||||
log.info("waiting for timeline redownload")
|
||||
wait_until(number_of_iterations=20,
|
||||
wait_until(number_of_iterations=10,
|
||||
interval=1,
|
||||
func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id)))
|
||||
|
||||
|
||||
@@ -1,19 +1,10 @@
|
||||
from threading import Thread
|
||||
from uuid import uuid4
|
||||
import uuid
|
||||
import psycopg2
|
||||
import pytest
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
|
||||
|
||||
|
||||
def do_gc_target(env: NeonEnv, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
|
||||
"""Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
|
||||
try:
|
||||
env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0')
|
||||
except Exception as e:
|
||||
log.error("do_gc failed: %s", e)
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
|
||||
|
||||
|
||||
def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -45,7 +36,8 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0')
|
||||
|
||||
# try to concurrently run gc and detach
|
||||
gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id))
|
||||
gc_thread = Thread(
|
||||
target=lambda: env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0'), )
|
||||
gc_thread.start()
|
||||
|
||||
last_error = None
|
||||
|
||||
@@ -229,7 +229,7 @@ def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path
|
||||
# basebackup and importing it into the new pageserver.
|
||||
# This kind of migration can tolerate breaking changes
|
||||
# to storage format
|
||||
'major',
|
||||
pytest.param('major', marks=pytest.mark.xfail(reason="Not implemented")),
|
||||
])
|
||||
@pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
|
||||
def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
|
||||
@@ -345,8 +345,6 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
|
||||
# Migrate either by attaching from s3 or import/export basebackup
|
||||
if method == "major":
|
||||
cmd = [
|
||||
"poetry",
|
||||
"run",
|
||||
"python",
|
||||
os.path.join(base_dir, "scripts/export_import_between_pageservers.py"),
|
||||
"--tenant-id",
|
||||
@@ -363,12 +361,12 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
|
||||
str(new_pageserver_http_port),
|
||||
"--to-pg-port",
|
||||
str(new_pageserver_pg_port),
|
||||
"--pg-distrib-dir",
|
||||
pg_distrib_dir,
|
||||
"--psql-path",
|
||||
os.path.join(pg_distrib_dir, "bin", "psql"),
|
||||
"--work-dir",
|
||||
os.path.join(test_output_dir),
|
||||
]
|
||||
subprocess_capture(test_output_dir, cmd, check=True)
|
||||
subprocess_capture(str(env.repo_dir), cmd, check=True)
|
||||
elif method == "minor":
|
||||
# call to attach timeline to new pageserver
|
||||
new_pageserver_http.tenant_attach(tenant_id)
|
||||
@@ -429,22 +427,6 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
|
||||
post_migration_check(pg_main, 500500, old_local_path_main)
|
||||
post_migration_check(pg_second, 1001000, old_local_path_second)
|
||||
|
||||
# ensure that we can successfully read all relations on the new pageserver
|
||||
with pg_cur(pg_second) as cur:
|
||||
cur.execute('''
|
||||
DO $$
|
||||
DECLARE
|
||||
r RECORD;
|
||||
BEGIN
|
||||
FOR r IN
|
||||
SELECT relname FROM pg_class WHERE relkind='r'
|
||||
LOOP
|
||||
RAISE NOTICE '%', r.relname;
|
||||
EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
|
||||
END LOOP;
|
||||
END$$;
|
||||
''')
|
||||
|
||||
if with_load == 'with_load':
|
||||
assert load_ok_event.wait(3)
|
||||
log.info('stopping load thread')
|
||||
|
||||
@@ -13,7 +13,7 @@ from uuid import UUID
|
||||
|
||||
import pytest
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.utils import lsn_from_hex
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ async def tenant_workload(env: NeonEnv, pg: Postgres):
|
||||
|
||||
async def all_tenants_workload(env: NeonEnv, tenants_pgs):
|
||||
workers = []
|
||||
for _, pg in tenants_pgs:
|
||||
for tenant, pg in tenants_pgs:
|
||||
worker = tenant_workload(env, pg)
|
||||
workers.append(asyncio.create_task(worker))
|
||||
|
||||
@@ -46,18 +46,23 @@ async def all_tenants_workload(env: NeonEnv, tenants_pgs):
|
||||
await asyncio.gather(*workers)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
|
||||
def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storatge_kind,
|
||||
test_name='test_tenants_many',
|
||||
)
|
||||
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3'])
|
||||
def test_tenants_many(neon_env_builder: NeonEnvBuilder, storage_type: str):
|
||||
|
||||
if storage_type == 'local_fs':
|
||||
neon_env_builder.enable_local_fs_remote_storage()
|
||||
elif storage_type == 'mock_s3':
|
||||
neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore')
|
||||
else:
|
||||
raise RuntimeError(f'Unknown storage type: {storage_type}')
|
||||
|
||||
neon_env_builder.enable_local_fs_remote_storage()
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenants_pgs: List[Tuple[UUID, Postgres]] = []
|
||||
|
||||
for _ in range(1, 5):
|
||||
for i in range(1, 5):
|
||||
# Use a tiny checkpoint distance, to create a lot of layers quickly
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
|
||||
@@ -12,8 +12,9 @@ import uuid
|
||||
|
||||
from contextlib import closing
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing import Process, Value
|
||||
from pathlib import Path
|
||||
from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageKind, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, available_remote_storages, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex, query_scalar
|
||||
from fixtures.log_helper import log
|
||||
from typing import List, Optional, Any
|
||||
@@ -350,7 +351,7 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end):
|
||||
if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end):
|
||||
break
|
||||
elapsed = time.time() - started_at
|
||||
if elapsed > 30:
|
||||
if elapsed > 20:
|
||||
raise RuntimeError(
|
||||
f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded")
|
||||
time.sleep(0.5)
|
||||
@@ -376,15 +377,15 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size):
|
||||
time.sleep(0.5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
|
||||
def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
|
||||
@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs'])
|
||||
def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storatge_kind,
|
||||
test_name='test_safekeepers_wal_backup',
|
||||
)
|
||||
|
||||
if storage_type == 'local_fs':
|
||||
neon_env_builder.enable_local_fs_remote_storage()
|
||||
elif storage_type == 'mock_s3':
|
||||
neon_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup')
|
||||
else:
|
||||
raise RuntimeError(f'Unknown storage type: {storage_type}')
|
||||
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -424,15 +425,15 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Remo
|
||||
wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000')
|
||||
|
||||
|
||||
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
|
||||
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
|
||||
@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs'])
|
||||
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storatge_kind,
|
||||
test_name='test_s3_wal_replay',
|
||||
)
|
||||
|
||||
if storage_type == 'local_fs':
|
||||
neon_env_builder.enable_local_fs_remote_storage()
|
||||
elif storage_type == 'mock_s3':
|
||||
neon_env_builder.enable_s3_mock_remote_storage('test_s3_wal_replay')
|
||||
else:
|
||||
raise RuntimeError(f'Unknown storage type: {storage_type}')
|
||||
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
||||
from dataclasses import field
|
||||
from contextlib import contextmanager
|
||||
from enum import Flag, auto
|
||||
import enum
|
||||
import textwrap
|
||||
from cached_property import cached_property
|
||||
import abc
|
||||
@@ -222,7 +221,7 @@ def can_bind(host: str, port: int) -> bool:
|
||||
# moment. If that changes, we should use start using SO_REUSEADDR here
|
||||
# too, to allow reusing ports more quickly.
|
||||
# See https://github.com/neondatabase/neon/issues/801
|
||||
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||
#sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||
|
||||
try:
|
||||
sock.bind((host, port))
|
||||
@@ -231,8 +230,6 @@ def can_bind(host: str, port: int) -> bool:
|
||||
except socket.error:
|
||||
log.info(f"Port {port} is in use, skipping")
|
||||
return False
|
||||
finally:
|
||||
sock.close()
|
||||
|
||||
|
||||
class PortDistributor:
|
||||
@@ -265,11 +262,6 @@ def default_broker(request: Any, port_distributor: PortDistributor):
|
||||
broker.stop()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
def run_id():
|
||||
yield uuid.uuid4()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
def mock_s3_server(port_distributor: PortDistributor):
|
||||
mock_s3_server = MockS3Server(port_distributor.get_port())
|
||||
@@ -299,9 +291,7 @@ class PgProtocol:
|
||||
# change it by calling "SET statement_timeout" after
|
||||
# connecting.
|
||||
options = result.get('options', '')
|
||||
if "statement_timeout" not in options:
|
||||
options = f'-cstatement_timeout=120s {options}'
|
||||
result['options'] = options
|
||||
result['options'] = f'-cstatement_timeout=120s {options}'
|
||||
return result
|
||||
|
||||
# autocommit=True here by default because that's what we need most of the time
|
||||
@@ -448,46 +438,26 @@ class MockS3Server:
|
||||
def secret_key(self) -> str:
|
||||
return 'test'
|
||||
|
||||
def access_env_vars(self) -> Dict[Any, Any]:
|
||||
return {
|
||||
'AWS_ACCESS_KEY_ID': self.access_key(),
|
||||
'AWS_SECRET_ACCESS_KEY': self.secret_key(),
|
||||
}
|
||||
|
||||
def kill(self):
|
||||
self.subprocess.kill()
|
||||
|
||||
|
||||
@enum.unique
|
||||
class RemoteStorageKind(enum.Enum):
|
||||
LOCAL_FS = "local_fs"
|
||||
MOCK_S3 = "mock_s3"
|
||||
REAL_S3 = "real_s3"
|
||||
|
||||
|
||||
def available_remote_storages() -> List[RemoteStorageKind]:
|
||||
remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3]
|
||||
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
|
||||
remote_storages.append(RemoteStorageKind.REAL_S3)
|
||||
log.info("Enabling real s3 storage for tests")
|
||||
else:
|
||||
log.info("Using mock implementations to test remote storage")
|
||||
return remote_storages
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalFsStorage:
|
||||
root: Path
|
||||
local_path: Path
|
||||
|
||||
|
||||
@dataclass
|
||||
class S3Storage:
|
||||
bucket_name: str
|
||||
bucket_region: str
|
||||
access_key: str
|
||||
secret_key: str
|
||||
endpoint: Optional[str] = None
|
||||
prefix_in_bucket: Optional[str] = None
|
||||
|
||||
def access_env_vars(self) -> Dict[str, str]:
|
||||
return {
|
||||
'AWS_ACCESS_KEY_ID': self.access_key,
|
||||
'AWS_SECRET_ACCESS_KEY': self.secret_key,
|
||||
}
|
||||
endpoint: Optional[str]
|
||||
|
||||
|
||||
RemoteStorage = Union[LocalFsStorage, S3Storage]
|
||||
@@ -496,20 +466,16 @@ RemoteStorage = Union[LocalFsStorage, S3Storage]
|
||||
# serialize as toml inline table
|
||||
def remote_storage_to_toml_inline_table(remote_storage):
|
||||
if isinstance(remote_storage, LocalFsStorage):
|
||||
remote_storage_config = f"local_path='{remote_storage.root}'"
|
||||
res = f"local_path='{remote_storage.local_path}'"
|
||||
elif isinstance(remote_storage, S3Storage):
|
||||
remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\
|
||||
bucket_region='{remote_storage.bucket_region}'"
|
||||
|
||||
if remote_storage.prefix_in_bucket is not None:
|
||||
remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'"
|
||||
|
||||
res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'"
|
||||
if remote_storage.endpoint is not None:
|
||||
remote_storage_config += f",endpoint='{remote_storage.endpoint}'"
|
||||
res += f", endpoint='{remote_storage.endpoint}'"
|
||||
else:
|
||||
raise Exception(f'Unknown storage configuration {remote_storage}')
|
||||
else:
|
||||
raise Exception("invalid remote storage type")
|
||||
|
||||
return f"{{{remote_storage_config}}}"
|
||||
return f"{{{res}}}"
|
||||
|
||||
|
||||
class RemoteStorageUsers(Flag):
|
||||
@@ -527,31 +493,28 @@ class NeonEnvBuilder:
|
||||
cleaned up after the test has finished.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
repo_dir: Path,
|
||||
port_distributor: PortDistributor,
|
||||
broker: Etcd,
|
||||
run_id: uuid.UUID,
|
||||
mock_s3_server: MockS3Server,
|
||||
remote_storage: Optional[RemoteStorage] = None,
|
||||
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
|
||||
pageserver_config_override: Optional[str] = None,
|
||||
num_safekeepers: int = 1,
|
||||
# Use non-standard SK ids to check for various parsing bugs
|
||||
safekeepers_id_start: int = 0,
|
||||
# fsync is disabled by default to make the tests go faster
|
||||
safekeepers_enable_fsync: bool = False,
|
||||
auth_enabled: bool = False,
|
||||
rust_log_override: Optional[str] = None,
|
||||
default_branch_name=DEFAULT_BRANCH_NAME,
|
||||
):
|
||||
self,
|
||||
repo_dir: Path,
|
||||
port_distributor: PortDistributor,
|
||||
broker: Etcd,
|
||||
mock_s3_server: MockS3Server,
|
||||
remote_storage: Optional[RemoteStorage] = None,
|
||||
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
|
||||
pageserver_config_override: Optional[str] = None,
|
||||
num_safekeepers: int = 1,
|
||||
# Use non-standard SK ids to check for various parsing bugs
|
||||
safekeepers_id_start: int = 0,
|
||||
# fsync is disabled by default to make the tests go faster
|
||||
safekeepers_enable_fsync: bool = False,
|
||||
auth_enabled: bool = False,
|
||||
rust_log_override: Optional[str] = None,
|
||||
default_branch_name=DEFAULT_BRANCH_NAME):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
self.port_distributor = port_distributor
|
||||
self.remote_storage = remote_storage
|
||||
self.remote_storage_users = remote_storage_users
|
||||
self.broker = broker
|
||||
self.run_id = run_id
|
||||
self.mock_s3_server = mock_s3_server
|
||||
self.pageserver_config_override = pageserver_config_override
|
||||
self.num_safekeepers = num_safekeepers
|
||||
@@ -560,8 +523,6 @@ class NeonEnvBuilder:
|
||||
self.auth_enabled = auth_enabled
|
||||
self.default_branch_name = default_branch_name
|
||||
self.env: Optional[NeonEnv] = None
|
||||
self.remote_storage_prefix: Optional[str] = None
|
||||
self.keep_remote_storage_contents: bool = True
|
||||
|
||||
def init(self) -> NeonEnv:
|
||||
# Cannot create more than one environment from one builder
|
||||
@@ -577,143 +538,41 @@ class NeonEnvBuilder:
|
||||
self.start()
|
||||
return env
|
||||
|
||||
def enable_remote_storage(
|
||||
self,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
test_name: str,
|
||||
force_enable: bool = True,
|
||||
):
|
||||
if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
|
||||
self.enable_local_fs_remote_storage(force_enable=force_enable)
|
||||
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
|
||||
self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
|
||||
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
|
||||
else:
|
||||
raise RuntimeError(f'Unknown storage type: {remote_storage_kind}')
|
||||
"""
|
||||
Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
|
||||
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
|
||||
"""
|
||||
|
||||
def enable_local_fs_remote_storage(self, force_enable=True):
|
||||
"""
|
||||
Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
|
||||
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
|
||||
"""
|
||||
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
|
||||
self.remote_storage = LocalFsStorage(Path(self.repo_dir / 'local_fs_remote_storage'))
|
||||
|
||||
def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable=True):
|
||||
"""
|
||||
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
|
||||
Starts up the mock server, if that does not run yet.
|
||||
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
|
||||
"""
|
||||
"""
|
||||
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
|
||||
Starts up the mock server, if that does not run yet.
|
||||
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
|
||||
"""
|
||||
|
||||
def enable_s3_mock_remote_storage(self, bucket_name: str, force_enable=True):
|
||||
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
|
||||
mock_endpoint = self.mock_s3_server.endpoint()
|
||||
mock_region = self.mock_s3_server.region()
|
||||
|
||||
self.remote_storage_client = boto3.client(
|
||||
boto3.client(
|
||||
's3',
|
||||
endpoint_url=mock_endpoint,
|
||||
region_name=mock_region,
|
||||
aws_access_key_id=self.mock_s3_server.access_key(),
|
||||
aws_secret_access_key=self.mock_s3_server.secret_key(),
|
||||
)
|
||||
self.remote_storage_client.create_bucket(Bucket=bucket_name)
|
||||
|
||||
self.remote_storage = S3Storage(
|
||||
bucket_name=bucket_name,
|
||||
endpoint=mock_endpoint,
|
||||
bucket_region=mock_region,
|
||||
access_key=self.mock_s3_server.access_key(),
|
||||
secret_key=self.mock_s3_server.secret_key(),
|
||||
)
|
||||
|
||||
def enable_real_s3_remote_storage(self, test_name: str, force_enable=True):
|
||||
"""
|
||||
Sets up configuration to use real s3 endpoint without mock server
|
||||
"""
|
||||
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
|
||||
|
||||
access_key = os.getenv("AWS_ACCESS_KEY_ID")
|
||||
assert access_key, "no aws access key provided"
|
||||
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
|
||||
assert secret_key, "no aws access key provided"
|
||||
|
||||
# session token is needed for local runs with sso auth
|
||||
session_token = os.getenv("AWS_SESSION_TOKEN")
|
||||
|
||||
bucket_name = os.getenv("REMOTE_STORAGE_S3_BUCKET")
|
||||
assert bucket_name, "no remote storage bucket name provided"
|
||||
region = os.getenv("REMOTE_STORAGE_S3_REGION")
|
||||
assert region, "no remote storage region provided"
|
||||
|
||||
# do not leave data in real s3
|
||||
self.keep_remote_storage_contents = False
|
||||
|
||||
# construct a prefix inside bucket for the particular test case and test run
|
||||
self.remote_storage_prefix = f'{self.run_id}/{test_name}'
|
||||
|
||||
self.remote_storage_client = boto3.client(
|
||||
's3',
|
||||
region_name=region,
|
||||
aws_access_key_id=access_key,
|
||||
aws_secret_access_key=secret_key,
|
||||
aws_session_token=session_token,
|
||||
)
|
||||
).create_bucket(Bucket=bucket_name)
|
||||
self.remote_storage = S3Storage(bucket_name=bucket_name,
|
||||
bucket_region=region,
|
||||
access_key=access_key,
|
||||
secret_key=secret_key,
|
||||
prefix_in_bucket=self.remote_storage_prefix)
|
||||
|
||||
def cleanup_remote_storage(self):
|
||||
# here wee check for true remote storage, no the local one
|
||||
# local cleanup is not needed after test because in ci all env will be destroyed anyway
|
||||
if self.remote_storage_prefix is None:
|
||||
log.info("no remote storage was set up, skipping cleanup")
|
||||
return
|
||||
|
||||
if self.keep_remote_storage_contents:
|
||||
log.info("keep_remote_storage_contents skipping remote storage cleanup")
|
||||
return
|
||||
|
||||
log.info("removing data from test s3 bucket %s by prefix %s",
|
||||
self.remote_storage.bucket_name,
|
||||
self.remote_storage_prefix)
|
||||
paginator = self.remote_storage_client.get_paginator('list_objects_v2')
|
||||
pages = paginator.paginate(
|
||||
Bucket=self.remote_storage.bucket_name,
|
||||
Prefix=self.remote_storage_prefix,
|
||||
)
|
||||
|
||||
objects_to_delete = {'Objects': []}
|
||||
cnt = 0
|
||||
for item in pages.search('Contents'):
|
||||
# weirdly when nothing is found it returns [None]
|
||||
if item is None:
|
||||
break
|
||||
|
||||
objects_to_delete['Objects'].append({'Key': item['Key']})
|
||||
|
||||
# flush once aws limit reached
|
||||
if len(objects_to_delete['Objects']) >= 1000:
|
||||
self.remote_storage_client.delete_objects(
|
||||
Bucket=self.remote_storage.bucket_name,
|
||||
Delete=objects_to_delete,
|
||||
)
|
||||
objects_to_delete = dict(Objects=[])
|
||||
cnt += 1
|
||||
|
||||
# flush rest
|
||||
if len(objects_to_delete['Objects']):
|
||||
self.remote_storage_client.delete_objects(Bucket=self.remote_storage.bucket_name,
|
||||
Delete=objects_to_delete)
|
||||
|
||||
log.info("deleted %s objects from remote storage", cnt)
|
||||
endpoint=mock_endpoint,
|
||||
bucket_region=mock_region)
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
|
||||
# Stop all the nodes.
|
||||
if self.env:
|
||||
log.info('Cleaning up all storage and compute nodes')
|
||||
@@ -722,8 +581,6 @@ class NeonEnvBuilder:
|
||||
sk.stop(immediate=True)
|
||||
self.env.pageserver.stop(immediate=True)
|
||||
|
||||
self.cleanup_remote_storage()
|
||||
|
||||
|
||||
class NeonEnv:
|
||||
"""
|
||||
@@ -856,13 +713,10 @@ class NeonEnv:
|
||||
|
||||
|
||||
@pytest.fixture(scope=shareable_scope)
|
||||
def _shared_simple_env(
|
||||
request: Any,
|
||||
port_distributor: PortDistributor,
|
||||
mock_s3_server: MockS3Server,
|
||||
default_broker: Etcd,
|
||||
run_id: uuid.UUID,
|
||||
) -> Iterator[NeonEnv]:
|
||||
def _shared_simple_env(request: Any,
|
||||
port_distributor: PortDistributor,
|
||||
mock_s3_server: MockS3Server,
|
||||
default_broker: Etcd) -> Iterator[NeonEnv]:
|
||||
"""
|
||||
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
|
||||
is set, this is shared by all tests using `neon_simple_env`.
|
||||
@@ -876,13 +730,8 @@ def _shared_simple_env(
|
||||
repo_dir = os.path.join(str(top_output_dir), "shared_repo")
|
||||
shutil.rmtree(repo_dir, ignore_errors=True)
|
||||
|
||||
with NeonEnvBuilder(
|
||||
repo_dir=Path(repo_dir),
|
||||
port_distributor=port_distributor,
|
||||
broker=default_broker,
|
||||
mock_s3_server=mock_s3_server,
|
||||
run_id=run_id,
|
||||
) as builder:
|
||||
with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker,
|
||||
mock_s3_server) as builder:
|
||||
env = builder.init_start()
|
||||
|
||||
# For convenience in tests, create a branch from the freshly-initialized cluster.
|
||||
@@ -907,13 +756,10 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
|
||||
|
||||
|
||||
@pytest.fixture(scope='function')
|
||||
def neon_env_builder(
|
||||
test_output_dir,
|
||||
port_distributor: PortDistributor,
|
||||
mock_s3_server: MockS3Server,
|
||||
default_broker: Etcd,
|
||||
run_id: uuid.UUID,
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
def neon_env_builder(test_output_dir,
|
||||
port_distributor: PortDistributor,
|
||||
mock_s3_server: MockS3Server,
|
||||
default_broker: Etcd) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
|
||||
@@ -931,13 +777,8 @@ def neon_env_builder(
|
||||
repo_dir = os.path.join(test_output_dir, "repo")
|
||||
|
||||
# Return the builder to the caller
|
||||
with NeonEnvBuilder(
|
||||
repo_dir=Path(repo_dir),
|
||||
port_distributor=port_distributor,
|
||||
mock_s3_server=mock_s3_server,
|
||||
broker=default_broker,
|
||||
run_id=run_id,
|
||||
) as builder:
|
||||
with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker,
|
||||
mock_s3_server) as builder:
|
||||
yield builder
|
||||
|
||||
|
||||
@@ -1342,10 +1183,7 @@ class NeonCli(AbstractNeonCli):
|
||||
remote_storage_users=self.env.remote_storage_users,
|
||||
pageserver_config_override=self.env.pageserver.config_override)
|
||||
|
||||
s3_env_vars = None
|
||||
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
|
||||
s3_env_vars = self.env.remote_storage.access_env_vars()
|
||||
|
||||
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
|
||||
return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
|
||||
|
||||
def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]':
|
||||
@@ -1357,10 +1195,7 @@ class NeonCli(AbstractNeonCli):
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]':
|
||||
s3_env_vars = None
|
||||
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
|
||||
s3_env_vars = self.env.remote_storage.access_env_vars()
|
||||
|
||||
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
|
||||
return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars)
|
||||
|
||||
def safekeeper_stop(self,
|
||||
@@ -1502,7 +1337,7 @@ class NeonPageserver(PgProtocol):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
self.stop(immediate=True)
|
||||
self.stop(True)
|
||||
|
||||
def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient:
|
||||
return NeonPageserverHttpClient(
|
||||
@@ -1519,7 +1354,6 @@ def append_pageserver_param_overrides(
|
||||
):
|
||||
if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None:
|
||||
remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage)
|
||||
|
||||
params_to_update.append(
|
||||
f'--pageserver-config-override=remote_storage={remote_storage_toml_table}')
|
||||
|
||||
@@ -2026,8 +1860,8 @@ class Safekeeper:
|
||||
started_at = time.time()
|
||||
while True:
|
||||
try:
|
||||
with self.http_client() as http_cli:
|
||||
http_cli.check_status()
|
||||
http_cli = self.http_client()
|
||||
http_cli.check_status()
|
||||
except Exception as e:
|
||||
elapsed = time.time() - started_at
|
||||
if elapsed > 3:
|
||||
@@ -2178,9 +2012,9 @@ class Etcd:
|
||||
return f'http://127.0.0.1:{self.port}'
|
||||
|
||||
def check_status(self):
|
||||
with requests.Session() as s:
|
||||
s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry
|
||||
s.get(f"{self.client_url()}/health").raise_for_status()
|
||||
s = requests.Session()
|
||||
s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry
|
||||
s.get(f"{self.client_url()}/health").raise_for_status()
|
||||
|
||||
def try_start(self):
|
||||
if self.handle is not None:
|
||||
|
||||
@@ -146,7 +146,7 @@ def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, durat
|
||||
record_thread.join()
|
||||
|
||||
|
||||
def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_event: threading.Event):
|
||||
def start_pgbench_intensive_initialization(env: PgCompare, scale: int):
|
||||
with env.record_duration("run_duration"):
|
||||
# Needs to increase the statement timeout (default: 120s) because the
|
||||
# initialization step can be slow with a large scale.
|
||||
@@ -155,11 +155,9 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_even
|
||||
f'-s{scale}',
|
||||
'-i',
|
||||
'-Idtg',
|
||||
env.pg.connstr(options='-cstatement_timeout=600s')
|
||||
env.pg.connstr(options='-cstatement_timeout=300s')
|
||||
])
|
||||
|
||||
done_event.set()
|
||||
|
||||
|
||||
@pytest.mark.timeout(1000)
|
||||
@pytest.mark.parametrize("scale", get_scales_matrix(1000))
|
||||
@@ -168,17 +166,15 @@ def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int):
|
||||
with env.pg.connect().cursor() as cur:
|
||||
cur.execute("CREATE TABLE foo as select generate_series(1,100000)")
|
||||
|
||||
workload_done_event = threading.Event()
|
||||
|
||||
workload_thread = threading.Thread(target=start_pgbench_intensive_initialization,
|
||||
args=(env, scale, workload_done_event))
|
||||
args=(env, scale))
|
||||
workload_thread.start()
|
||||
|
||||
record_thread = threading.Thread(target=record_lsn_write_lag,
|
||||
args=(env, lambda: not workload_done_event.is_set()))
|
||||
args=(env, lambda: workload_thread.is_alive()))
|
||||
record_thread.start()
|
||||
|
||||
record_read_latency(env, lambda: not workload_done_event.is_set(), "SELECT count(*) from foo")
|
||||
record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT count(*) from foo")
|
||||
workload_thread.join()
|
||||
record_thread.join()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user