Compare commits

..

3 Commits

Author SHA1 Message Date
Bojan Serafimov
688f68ecba Undo whitespace 2022-08-04 09:43:27 +02:00
Bojan Serafimov
fb2ffac8b9 Ignore metrics static 2022-08-04 09:42:27 +02:00
Bojan Serafimov
8173e36a1b Find all problematic statics 2022-08-04 09:30:22 +02:00
75 changed files with 590 additions and 1820 deletions

View File

@@ -27,26 +27,6 @@ inputs:
description: 'Whether to upload the performance report'
required: false
default: 'false'
run_with_real_s3:
description: 'Whether to pass real s3 credentials to the test suite'
required: false
default: 'false'
real_s3_bucket:
description: 'Bucket name for real s3 tests'
required: false
default: ''
real_s3_region:
description: 'Region name for real s3 tests'
required: false
default: ''
real_s3_access_key_id:
description: 'Access key id'
required: false
default: ''
real_s3_secret_access_key:
description: 'Secret access key'
required: false
default: ''
runs:
using: "composite"
@@ -83,8 +63,6 @@ runs:
# this variable will be embedded in perf test report
# and is needed to distinguish different environments
PLATFORM: github-actions-selfhosted
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
shell: bash -euxo pipefail {0}
run: |
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
@@ -99,14 +77,6 @@ runs:
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
echo "REAL S3 ENABLED"
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }}
export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }}
fi
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
mkdir -p "$PERF_REPORT_DIR"

View File

@@ -35,16 +35,6 @@ jobs:
GIT_VERSION: ${{ github.sha }}
steps:
- name: Fix git ownerwhip
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Checkout
uses: actions/checkout@v3
with:
@@ -219,11 +209,7 @@ jobs:
build_type: ${{ matrix.build_type }}
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: batch_others
run_with_real_s3: true
real_s3_bucket: ci-tests-s3
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data

View File

@@ -11,15 +11,17 @@ than it was before.
## Submitting changes
1. Get at least one +1 on your PR before you push.
1. Make a PR for every change.
Even seemingly trivial patches can break things in surprising ways.
Use of common sense is OK. If you're only fixing a typo in a comment,
it's probably fine to just push it. But if in doubt, open a PR.
2. Get at least one +1 on your PR before you push.
For simple patches, it will only take a minute for someone to review
it.
2. Don't force push small changes after making the PR ready for review.
Doing so will force readers to re-read your entire PR, which will delay
the review process.
3. Always keep the CI green.
Do not push, if the CI failed on your PR. Even if you think it's not

59
Cargo.lock generated
View File

@@ -154,9 +154,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "axum"
version = "0.5.13"
version = "0.5.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b9496f0c1d1afb7a2af4338bbe1d969cddfead41d87a9fb3aaa6d0bbc7af648"
checksum = "d16705af05732b7d3258ec0f7b73c03a658a28925e050d8852d5b568ee8bcf4e"
dependencies = [
"async-trait",
"axum-core",
@@ -317,6 +317,15 @@ dependencies = [
"serde",
]
[[package]]
name = "cast"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a"
dependencies = [
"rustc_version",
]
[[package]]
name = "cast"
version = "0.3.0"
@@ -495,8 +504,8 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"lazy_static",
"nix",
"once_cell",
"pageserver",
"postgres",
"regex",
@@ -570,7 +579,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f"
dependencies = [
"atty",
"cast",
"cast 0.3.0",
"clap 2.34.0",
"criterion-plot",
"csv",
@@ -591,11 +600,11 @@ dependencies = [
[[package]]
name = "criterion-plot"
version = "0.4.5"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876"
checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57"
dependencies = [
"cast",
"cast 0.2.7",
"itertools",
]
@@ -671,9 +680,9 @@ dependencies = [
[[package]]
name = "crypto-common"
version = "0.1.6"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
checksum = "2ccfd8c0ee4cce11e45b3fd6f9d5e69e0cc62912aa6a0cb1bf4617b0eba5a12f"
dependencies = [
"generic-array",
"typenum",
@@ -1107,9 +1116,9 @@ dependencies = [
[[package]]
name = "gimli"
version = "0.26.2"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d"
checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4"
[[package]]
name = "git-version"
@@ -1175,9 +1184,9 @@ dependencies = [
[[package]]
name = "hashbrown"
version = "0.12.3"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
checksum = "607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022"
[[package]]
name = "heck"
@@ -1379,7 +1388,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
dependencies = [
"autocfg",
"hashbrown 0.12.3",
"hashbrown 0.12.2",
]
[[package]]
@@ -1591,8 +1600,8 @@ dependencies = [
name = "metrics"
version = "0.1.0"
dependencies = [
"lazy_static",
"libc",
"once_cell",
"prometheus",
"workspace_hack",
]
@@ -1842,9 +1851,9 @@ dependencies = [
[[package]]
name = "os_str_bytes"
version = "6.2.0"
version = "6.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4"
checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa"
[[package]]
name = "pageserver"
@@ -1870,6 +1879,7 @@ dependencies = [
"humantime-serde",
"hyper",
"itertools",
"lazy_static",
"metrics",
"nix",
"once_cell",
@@ -2115,9 +2125,9 @@ dependencies = [
"crc32c",
"env_logger",
"hex",
"lazy_static",
"log",
"memoffset",
"once_cell",
"postgres",
"rand",
"regex",
@@ -2277,9 +2287,9 @@ dependencies = [
"hex",
"hmac 0.12.1",
"hyper",
"lazy_static",
"md5",
"metrics",
"once_cell",
"parking_lot 0.12.1",
"pin-project-lite",
"rand",
@@ -2725,9 +2735,9 @@ dependencies = [
[[package]]
name = "rustversion"
version = "1.0.8"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8"
checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf"
[[package]]
name = "ryu"
@@ -2753,6 +2763,7 @@ dependencies = [
"hex",
"humantime",
"hyper",
"lazy_static",
"metrics",
"once_cell",
"postgres",
@@ -3606,9 +3617,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"
[[package]]
name = "unicode-ident"
version = "1.0.2"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7"
checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c"
[[package]]
name = "unicode-normalization"
@@ -3669,9 +3680,9 @@ dependencies = [
"hex-literal",
"hyper",
"jsonwebtoken",
"lazy_static",
"metrics",
"nix",
"once_cell",
"pin-project-lite",
"postgres",
"postgres-protocol",

View File

@@ -9,7 +9,7 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8
serde = { version = "1.0", features = ["derive"] }
serde_with = "1.12.0"
toml = "0.5"
once_cell = "1.13.0"
lazy_static = "1.4"
regex = "1"
anyhow = "1.0"
thiserror = "1"

View File

@@ -30,14 +30,14 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_stdout_file =
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
format!(
"Failed to create etcd stout file in directory {}",
"Failed to create ectd stout file in directory {}",
etcd_data_dir.display()
)
})?;
let etcd_stderr_file =
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
format!(
"Failed to create etcd stderr file in directory {}",
"Failed to create ectd stderr file in directory {}",
etcd_data_dir.display()
)
})?;

View File

@@ -51,11 +51,7 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}

View File

@@ -5,7 +5,7 @@
/// enough to extract a few settings we need in Zenith, assuming you don't do
/// funny stuff like include-directives or funny escaping.
use anyhow::{bail, Context, Result};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
@@ -19,7 +19,9 @@ pub struct PostgresConf {
hash: HashMap<String, String>,
}
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
lazy_static! {
static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
}
impl PostgresConf {
pub fn new() -> PostgresConf {
@@ -137,10 +139,10 @@ fn escape_str(s: &str) -> String {
//
// This regex is a bit more conservative than the rules in guc-file.l, so we quote some
// strings that PostgreSQL would accept without quoting, but that's OK.
static UNQUOTED_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap());
lazy_static! {
static ref UNQUOTED_RE: Regex =
Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
}
if UNQUOTED_RE.is_match(s) {
s.to_string()
} else {

View File

@@ -247,7 +247,7 @@ impl SafekeeperNode {
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
let mut tcp_stopped = false;
for i in 0..600 {
for _ in 0..100 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
@@ -272,11 +272,9 @@ impl SafekeeperNode {
}
}
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1));
}
bail!("Failed to stop safekeeper with pid {}", pid);

View File

@@ -318,7 +318,7 @@ impl PageServerNode {
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
let mut tcp_stopped = false;
for i in 0..600 {
for _ in 0..100 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
@@ -344,11 +344,9 @@ impl PageServerNode {
}
}
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1));
}
bail!("Failed to stop pageserver with pid {}", pid);

View File

@@ -1,8 +1,6 @@
#!/bin/sh
set -eux
pageserver_id_param="${NODE_ID:-10}"
broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
if [ "$broker_endpoints_param" != "absent" ]; then
broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
@@ -10,12 +8,10 @@ else
broker_endpoints_param=''
fi
remote_storage_param="${REMOTE_STORAGE:-}"
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/tenants" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
fi
echo "Staring pageserver at 0.0.0.0:6400"
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data

View File

@@ -52,8 +52,10 @@
- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [settings.md](./settings.md)
#FIXME: move these under sourcetree.md
#- [pageserver/README.md](/pageserver/README.md)
#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md)
#- [test_runner/README.md](/test_runner/README.md)
#- [safekeeper/README.md](/safekeeper/README.md)
# RFCs
@@ -79,5 +81,4 @@
- [014-storage-lsm](rfcs/014-storage-lsm.md)
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [017-pageserver-op-atomicity](rfcs/017-pageserver-op-atomicity.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)

View File

@@ -75,7 +75,7 @@ layer's Segment and range of LSNs.
There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more.
are immutable. See pageserver/src/layered_repository/README.md for more.
### Layer file (on-disk layer)
@@ -111,7 +111,7 @@ PostgreSQL LSNs and functions to monitor them:
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information.
Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
* `RestartLSN`: position in WAL confirmed by all safekeepers.
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.

View File

@@ -68,6 +68,8 @@ There are the following implementations present:
* local filesystem — to use in tests mainly
* AWS S3 - to use in production
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md).
The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples:
@@ -116,7 +118,7 @@ implemented by the LayeredRepository object in
`layered_repository.rs`. There is only that one implementation of the
Repository trait, but it's still a useful abstraction that keeps the
interface for the low-level storage functionality clean. The layered
storage format is described in [pageserver-storage.md](./pageserver-storage.md).
storage format is described in layered_repository/README.md.
Each repository consists of multiple Timelines. Timeline is a
workhorse that accepts page changes from the WAL, and serves

View File

@@ -1,153 +0,0 @@
# Durability and atomicity of tenant/timeline operations
The pageserver has 8 tenant/timeline operations, listed below. In
addition to that, data can be appended to a timeline by WAL receiver,
pages can be requested by the compute node, and tenant/timeline status
can be queried through the mgmt API. But these are the operations that
modify state in pageserver or in S3, and need to worry about crash
safety.
To make these operations atomic and recoverable, let's introduce a new
"tenant index file", called `tenant.json`. For each tenant, there is
one tenant index file, and it contains a list of all timelines for
that tenant:
{
tenant_id: a93a94724945e95e1a0c448004ece2ec
timelines: [
{ timeline_id: "9979cd302340a058606473912651f27f",
ancestor_id: ""
ancestor_lsn: "0/0"
},
{ timeline_id: "f0a6f3372d273dd9ca3480d19e6b565c",
ancestor_id: "9979cd302340a058606473912651f27f"
ancestor_lsn: "1/1698C48"
},
]
}
The file only contains the immutable metadata of each timeline, like
the point it was branched from. The changing parts, like
disk_consistent_lsn, are still stored in the per-timeline metadata
file.
This file allows us to resolve some ambiguous situations, like
remembering that a tenant exists when it doesn't have any timelines.
It also allows us to quickly fetch the list of all timelines of a
tenant, without having to perform S3 LIST operations.
Below is a brief description of all the pageserver tenant/timeline
operations, and how the steps of creating/deleting local files or
directories and uploads to S3 are performed. The steps are listed in
such an order that each operation can be sanely recovered or aborted,
if the pageserver crashes while the operation is being performed.
## Create tenant
Create an empty tenant. It doesn't have any timelines initially.
1. Create local tenant-directory with .temp extension
2. Create tenant.json file in the directory, with a special flag
indicating that the tenant-creation is in progress
3. Rename the local tenant directory in place
4. Upload the tenant.json file to S3, without the flag
5. Update the local file, removing the flag
At pageserver startup, if we see a tenant.json file with the special
flag, check if the tenant exists in S3. If not, remove the local directory.
Otherwise remove the flag from local file.
## Create timeline
Create a timeline for a tenant, as result of running initdb.
1. create timeline directory locally, with .temp extension
2. run initdb, creating the initial set of layers
3. upload all layer files to S3
4. upload metadata file to S3
5. update tenant.json file in S3
6. Rename local directory in place
If we crash before step 5, S3 may have a timeline metadata file and some
layer files, without corresponding entry in tenant.json file. That's OK.
Whenever we see that, we can delete the leftover timeline files.
If we want to make that less scary, we could update a tenant.json file in S3
twice. First, add the new timeline ID to the file with a flag indicating
that it's being created. Do that before uploading anything else to S3. And
then in step 5, update tenant.json to indicate that the creation is complete.
## Branch timeline
Create a new timeline with an existing timeline as parent
1. create timeline directory locally, with .temp extension
2. create metadata file in the local directory
3. upload metadata file to S3
4. update tenant.json file in S3
5. Rename local directory in place
Like with Create timeline, if we crash between steps 3 and 4, we will
leave behind a timeline metadata file with no corresponding entry in
tenant.json. That's harmless.
## Delete timeline
1. rename local timeline directory to have .temp extension
2. Update tenant.json file in S3
3. delete index file from S3
4. delete layer files from S3
5. delete local directory
Like with creation, if this is interrupted, we will leave behind
timeline files in S3 with no corresponding entry in tenant.json. If we
want to make that less scary, we can update tenant.json in step 2 with
a tombstone flag for the timeline we're removing, instead of removing
the entry for it outright.
## Delete tenant
1. rename local tenant directory to have .temp extension
2. delete tenant.json file in S3
3. delete all timeline index files from S3
4. delete all layer files from S3
5. delete local directory
Like with timeline creation, this can leave behind files with no corresponding
tenant.json file. We can make it less scary by adding tombstones.
## Attach tenant
1. create local tenant directory with .temp extension
2. Download tenant.json file
3. download index files for every timeline
4. download all layer files (in the future, skip this and download them on demand)
5. rename local tenant directory in place
## Detach tenant
1. rename local tenant directory to have .temp extension
2. delete local directory
## Load tenant
This happens automatically at pageserver startup, for every tenant that is found
in the tenants-directory. I.e. for every tenant that was attached to the pageserver
before the crash or shutdown.
1. download tenant.json file
2. for every timeline that's in remote tenant.json:
1. download remote index file
2. download all layer files that are missing locally (skip in future, and download on-demand)
3. schedule upload of all files present locally, but missing remotely
4. schedule index file upload
3. delete all locally present timeline directories that's not in tenant.json
On startup, delete everything with the .temp extension
- we could skip some of the downloads if we stored the S3 etag of the object in the local file,
and compared that

View File

@@ -28,7 +28,7 @@ The pageserver has a few different duties:
- Receive WAL from the WAL service and decode it.
- Replay WAL that's applicable to the chunks that the Page Server maintains
For more detailed info, see [pageserver-services.md](./pageserver-services.md)
For more detailed info, see [/pageserver/README](/pageserver/README.md)
`/proxy`:
@@ -57,7 +57,7 @@ PostgreSQL extension that contains functions needed for testing and debugging.
The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
It acts as a holding area and redistribution center for recently generated WAL.
For more detailed info, see [walservice.md](./walservice.md)
For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
`/workspace_hack`:
The workspace_hack crate exists only to pin down some dependencies.

View File

@@ -75,8 +75,8 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only
one primary node can be actively streaming WAL to the quorum of
safekeepers.
See [this section](safekeeper-protocol.md) for a more detailed description of
the consensus protocol. spec/ contains TLA+ specification of it.
See README_PROTO.md for a more detailed description of the consensus
protocol. spec/ contains TLA+ specification of it.
# Q&A

View File

@@ -9,7 +9,7 @@
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "1.12.0"
once_cell = "1.13.0"
once_cell = "1.8.0"
utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -6,5 +6,5 @@ edition = "2021"
[dependencies]
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
libc = "0.2"
once_cell = "1.13.0"
lazy_static = "1.4"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,7 +2,7 @@
//! make sure that we use the same dep version everywhere.
//! Otherwise, we might not see all metrics registered via
//! a default registry.
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;
pub use prometheus::register;
@@ -41,22 +41,19 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
prometheus::gather()
}
static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
lazy_static! {
static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
"libmetrics_disk_io_bytes_total",
"Bytes written and read from disk, grouped by the operation (read|write)",
&["io_operation"]
)
.expect("Failed to register disk i/o bytes int gauge vec")
});
static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
.expect("Failed to register disk i/o bytes int gauge vec");
static ref MAXRSS_KB: IntGauge = register_int_gauge!(
"libmetrics_maxrss_kb",
"Memory usage (Maximum Resident Set Size)"
)
.expect("Failed to register maxrss_kb int gauge")
});
.expect("Failed to register maxrss_kb int gauge");
}
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,

View File

@@ -10,13 +10,13 @@ use std::io::{Read, Result, Write};
/// # use std::io::{Result, Read};
/// # use metrics::{register_int_counter, IntCounter};
/// # use metrics::CountedReader;
/// # use once_cell::sync::Lazy;
/// #
/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
/// # lazy_static::lazy_static! {
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
/// # "int_counter",
/// # "let's count something!"
/// # ).unwrap()
/// # });
/// # ).unwrap();
/// # }
/// #
/// fn do_some_reads(stream: impl Read, count: usize) -> Result<Vec<u8>> {
/// let mut reader = CountedReader::new(stream, |cnt| {
@@ -85,13 +85,13 @@ impl<T: Read> Read for CountedReader<'_, T> {
/// # use std::io::{Result, Write};
/// # use metrics::{register_int_counter, IntCounter};
/// # use metrics::CountedWriter;
/// # use once_cell::sync::Lazy;
/// #
/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
/// # lazy_static::lazy_static! {
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
/// # "int_counter",
/// # "let's count something!"
/// # ).unwrap()
/// # });
/// # ).unwrap();
/// # }
/// #
/// fn do_some_writes(stream: impl Write, payload: &[u8]) -> Result<()> {
/// let mut writer = CountedWriter::new(stream, |cnt| {

View File

@@ -12,7 +12,7 @@ byteorder = "1.4.3"
anyhow = "1.0"
crc32c = "0.6.0"
hex = "0.4.3"
once_cell = "1.13.0"
lazy_static = "1.4"
log = "0.4.14"
memoffset = "0.6.2"
thiserror = "1.0"

View File

@@ -2,7 +2,7 @@
//! Common utilities for dealing with PostgreSQL relation files.
//!
use crate::pg_constants;
use once_cell::sync::OnceCell;
use lazy_static::lazy_static;
use regex::Regex;
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
@@ -54,14 +54,11 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
///
pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
static RELFILE_RE: OnceCell<Regex> = OnceCell::new();
RELFILE_RE.get_or_init(|| {
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap()
});
lazy_static! {
static ref RELFILE_RE: Regex =
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
}
let caps = RELFILE_RE
.get()
.unwrap()
.captures(fname)
.ok_or(FilePathError::InvalidFileName)?;

View File

@@ -16,7 +16,7 @@ use crate::XLogRecord;
use crate::XLOG_PAGE_MAGIC;
use crate::pg_constants::WAL_SEGMENT_SIZE;
use anyhow::{anyhow, bail, ensure};
use anyhow::{bail, ensure};
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use bytes::{Buf, Bytes};
@@ -159,7 +159,7 @@ fn find_end_of_wal_segment(
let mut buf = [0u8; XLOG_BLCKSZ];
let file_name = XLogFileName(tli, segno, wal_seg_size);
let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?;
let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
file.seek(SeekFrom::Start(offs as u64))?;
// xl_crc is the last field in XLogRecord, will not be read into rec_hdr
const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
@@ -396,13 +396,10 @@ pub fn find_end_of_wal(
let mut high_tli: TimeLineID = 0;
let mut high_ispartial = false;
for entry in fs::read_dir(data_dir)?.flatten() {
for entry in fs::read_dir(data_dir).unwrap().flatten() {
let ispartial: bool;
let entry_name = entry.file_name();
let fname = entry_name
.to_str()
.ok_or_else(|| anyhow!("Invalid file name"))?;
let fname = entry_name.to_str().unwrap();
/*
* Check if the filename looks like an xlog file, or a .partial file.
*/
@@ -414,7 +411,7 @@ pub fn find_end_of_wal(
continue;
}
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 {
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
continue;
}
if segno > high_segno

View File

@@ -10,7 +10,7 @@ anyhow = "1.0"
clap = "3.0"
env_logger = "0.9"
log = "0.4"
once_cell = "1.13.0"
once_cell = "1.8.0"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres_ffi = { path = "../" }
tempfile = "3.2"

View File

@@ -7,7 +7,7 @@ edition = "2021"
anyhow = { version = "1.0", features = ["backtrace"] }
async-trait = "0.1"
metrics = { version = "0.1", path = "../metrics" }
once_cell = "1.13.0"
once_cell = "1.8.0"
rusoto_core = "0.48"
rusoto_s3 = "0.48"
serde = { version = "1.0", features = ["derive"] }

View File

@@ -66,9 +66,6 @@ pub trait RemoteStorage: Send + Sync {
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
/// so this method doesnt need to.
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,

View File

@@ -116,7 +116,7 @@ impl RemoteStorage for LocalFs {
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
let path = match prefix {
Some(prefix) => Cow::Owned(prefix),
Some(prefix) => Cow::Owned(self.storage_root.join(prefix)),
None => Cow::Borrowed(&self.storage_root),
};
get_all_files(path.as_ref(), false).await

View File

@@ -171,25 +171,17 @@ impl S3Bucket {
let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
// session token is used when authorizing through sso
// which is typically the case when testing locally on developer machine
let session_token = std::env::var("AWS_SESSION_TOKEN").ok();
let client = if access_key_id.is_none() && secret_access_key.is_none() {
debug!("Using IAM-based AWS access");
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
} else {
debug!(
"Using credentials-based AWS access. Session token is set: {}",
session_token.is_some()
);
debug!("Using credentials-based AWS access");
S3Client::new_with(
request_dispatcher,
StaticProvider::new(
StaticProvider::new_minimal(
access_key_id.unwrap_or_default(),
secret_access_key.unwrap_or_default(),
session_token,
None,
),
region,
)
@@ -312,24 +304,32 @@ impl RemoteStorage for S3Bucket {
Ok(document_keys)
}
/// See the doc for `RemoteStorage::list_prefixes`
/// Note: it wont include empty "directories"
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| p.0)
.or_else(|| self.prefix_in_bucket.clone())
.map(|mut p| {
let list_prefix = match prefix {
Some(prefix) => {
let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default();
// if there is no trailing / in default prefix and
// supplied prefix does not start with "/" insert it
if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR)
|| prefix.0.starts_with(S3_PREFIX_SEPARATOR))
{
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
}
prefix_in_bucket.push_str(&prefix.0);
// required to end with a separator
// otherwise request will return only the entry of a prefix
if !p.ends_with(S3_PREFIX_SEPARATOR) {
p.push(S3_PREFIX_SEPARATOR);
if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) {
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
}
p
});
Some(prefix_in_bucket)
}
None => self.prefix_in_bucket.clone(),
};
let mut document_keys = Vec::new();

View File

@@ -8,6 +8,7 @@ anyhow = "1.0"
bincode = "1.3"
bytes = "1.0.1"
hyper = { version = "0.14.7", features = ["full"] }
lazy_static = "1.4.0"
pin-project-lite = "0.2.7"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -27,8 +28,6 @@ rustls = "0.20.2"
rustls-split = "0.3.0"
git-version = "0.3.5"
serde_with = "1.12.0"
once_cell = "1.13.0"
metrics = { path = "../metrics" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -4,8 +4,8 @@ use crate::zid::ZTenantId;
use anyhow::anyhow;
use hyper::header::AUTHORIZATION;
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use lazy_static::lazy_static;
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::RequestInfo;
use routerify::{Middleware, Router, RouterBuilder, RouterService};
@@ -16,13 +16,13 @@ use std::net::TcpListener;
use super::error::ApiError;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
lazy_static! {
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
"libmetrics_metric_handler_requests_total",
"Number of metric requests made"
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);

View File

@@ -7,7 +7,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
@@ -19,15 +19,16 @@ fn make_tcp_pair() -> (TcpStream, TcpStream) {
(server_stream, client_stream)
}
static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
let mut cursor = Cursor::new(include_bytes!("key.pem"));
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
});
static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
});
lazy_static! {
static ref KEY: rustls::PrivateKey = {
let mut cursor = Cursor::new(include_bytes!("key.pem"));
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
};
static ref CERT: rustls::Certificate = {
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
};
}
#[test]
fn ssl() {

View File

@@ -884,7 +884,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
match sub_match.subcommand() {
Some(("start", start_match)) => {
if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
eprintln!("pageserver start failed: {e}");
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
@@ -906,19 +906,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
eprintln!("pageserver start failed: {e}");
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
exit(1);
}
},
Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
None => bail!("no pageserver subcommand provided"),
}

View File

@@ -21,6 +21,7 @@ futures = "0.3.13"
hex = "0.4.3"
hyper = "0.14"
itertools = "0.10.3"
lazy_static = "1.4.0"
clap = "3.0"
daemonize = "0.4.1"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
@@ -47,7 +48,7 @@ tracing = "0.1.27"
signal-hook = "0.3.10"
url = "2"
nix = "0.23"
once_cell = "1.13.0"
once_cell = "1.8.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
git-version = "0.3.5"

View File

@@ -37,7 +37,7 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
// TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
// Then fishing out pg_control would be unnecessary
let mut modification = tline.begin_modification(lsn);
let mut modification = tline.begin_modification();
modification.init_empty()?;
// Import all but pg_wal
@@ -56,12 +56,12 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
pg_control = Some(control_file);
}
modification.flush()?;
modification.flush(lsn)?;
}
}
// We're done importing all the data files.
modification.commit()?;
modification.commit(lsn)?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -267,7 +267,7 @@ fn import_wal<T: DatadirTimeline>(
waldecoder.feed_bytes(&buf);
let mut nrecords = 0;
let mut modification = tline.begin_modification(endpoint);
let mut modification = tline.begin_modification();
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
@@ -301,7 +301,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
base_lsn: Lsn,
) -> Result<()> {
info!("importing base at {}", base_lsn);
let mut modification = tline.begin_modification(base_lsn);
let mut modification = tline.begin_modification();
modification.init_empty()?;
let mut pg_control: Option<ControlFileData> = None;
@@ -319,7 +319,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush()?;
modification.flush(base_lsn)?;
}
tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
@@ -333,7 +333,7 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
// sanity check: ensure that pg_control is loaded
let _pg_control = pg_control.context("pg_control file not found")?;
modification.commit()?;
modification.commit(base_lsn)?;
Ok(())
}
@@ -385,7 +385,7 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
waldecoder.feed_bytes(&bytes[offset..]);
let mut modification = tline.begin_modification(end_lsn);
let mut modification = tline.begin_modification();
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {

View File

@@ -5,7 +5,7 @@
//! get/put call, walking back the timeline branching history as needed.
//!
//! The files are stored in the .neon/tenants/<tenantid>/timelines/<timelineid>
//! directory. See docs/pageserver-storage.md for how the files are managed.
//! directory. See layered_repository/README for how the files are managed.
//! In addition to the layer files, there is a metadata file in the same
//! directory that contains information about the timeline, in particular its
//! parent timeline, and the last LSN that has been written to disk.

View File

@@ -5,7 +5,7 @@
use crate::page_cache;
use crate::page_cache::{ReadBufResult, PAGE_SZ};
use bytes::Bytes;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
use std::sync::atomic::AtomicU64;
@@ -117,7 +117,9 @@ where
}
}
static NEXT_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
lazy_static! {
static ref NEXT_ID: AtomicU64 = AtomicU64::new(1);
}
/// An adapter for reading a (virtual) file using the page cache.
///

View File

@@ -8,7 +8,7 @@ use crate::page_cache;
use crate::page_cache::PAGE_SZ;
use crate::page_cache::{ReadBufResult, WriteBufResult};
use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use std::cmp::min;
use std::collections::HashMap;
use std::fs::OpenOptions;
@@ -21,15 +21,15 @@ use utils::zid::{ZTenantId, ZTimelineId};
use std::os::unix::fs::FileExt;
///
/// This is the global cache of file descriptors (File objects).
///
static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
RwLock::new(EphemeralFiles {
lazy_static! {
///
/// This is the global cache of file descriptors (File objects).
///
static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
next_file_id: 1,
files: HashMap::new(),
})
});
});
}
pub struct EphemeralFiles {
next_file_id: u64,

View File

@@ -15,18 +15,19 @@ use crate::layered_repository::storage_layer::Layer;
use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
use crate::repository::Key;
use anyhow::Result;
use lazy_static::lazy_static;
use metrics::{register_int_gauge, IntGauge};
use once_cell::sync::Lazy;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use tracing::*;
use utils::lsn::Lsn;
static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
.expect("failed to define a metric")
});
lazy_static! {
static ref NUM_ONDISK_LAYERS: IntGauge =
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
.expect("failed to define a metric");
}
///
/// LayerMap tracks what layers exist on a timeline.

View File

@@ -4,11 +4,11 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use fail::fail_point;
use itertools::Itertools;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use tracing::*;
use std::cmp::{max, min, Ordering};
use std::collections::{hash_map::Entry, HashMap, HashSet};
use std::collections::HashSet;
use std::fs;
use std::fs::{File, OpenOptions};
use std::io::Write;
@@ -38,9 +38,7 @@ use crate::layered_repository::{
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::pgdatadir_mapping::BlockNumber;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::reltag::RelTag;
use crate::tenant_config::TenantConfOpt;
use crate::DatadirTimeline;
@@ -60,102 +58,76 @@ use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{page_cache, storage_sync};
/// Prometheus histogram buckets (in seconds) that capture the majority of
/// latencies in the microsecond range but also extend far enough up to distinguish
/// "bad" from "really bad".
fn get_buckets_for_critical_operations() -> Vec<f64> {
let buckets_per_digit = 5;
let min_exponent = -6;
let max_exponent = 2;
let mut buckets = vec![];
// Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
// because it's more numerically stable and doesn't result in numbers like 9.999999
for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
}
buckets
// Metrics collected on operations on the storage repository.
lazy_static! {
pub static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
"pageserver_storage_operations_seconds",
"Time spent on storage operations",
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric");
}
// Metrics collected on operations on the storage repository.
pub static STORAGE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_storage_operations_seconds",
"Time spent on storage operations",
&["operation", "tenant_id", "timeline_id"],
get_buckets_for_critical_operations(),
)
.expect("failed to define a metric")
});
// Metrics collected on operations on the storage repository.
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value",
&["tenant_id", "timeline_id"],
get_buckets_for_critical_operations(),
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
lazy_static! {
static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!(
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
.expect("failed to define a metric");
static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!(
"pageserver_wait_lsn_seconds",
"Time spent waiting for WAL to arrive",
&["tenant_id", "timeline_id"],
get_buckets_for_critical_operations(),
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
lazy_static! {
static ref LAST_RECORD_LSN: IntGaugeVec = register_int_gauge_vec!(
"pageserver_last_record_lsn",
"Last record LSN grouped by timeline",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
// Metrics for determining timeline's physical size.
// A layered timeline's physical is defined as the total size of
// (delta/image) layer files on disk.
static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
lazy_static! {
static ref CURRENT_PHYSICAL_SIZE: UIntGaugeVec = register_uint_gauge_vec!(
"pageserver_current_physical_size",
"Current physical size grouped by timeline",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
lazy_static! {
static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!(
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
)
.expect("failed to define a metric")
});
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
.expect("failed to define a metric");
static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!(
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
#[derive(Clone)]
pub enum LayeredTimelineEntry {
@@ -323,9 +295,6 @@ pub struct LayeredTimeline {
/// or None if WAL receiver has not received anything for this timeline
/// yet.
pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
/// Relation size cache
rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
}
pub struct WalReceiverInfo {
@@ -337,42 +306,7 @@ pub struct WalReceiverInfo {
/// Inherit all the functions from DatadirTimeline, to provide the
/// functionality to store PostgreSQL relations, SLRUs, etc. in a
/// LayeredTimeline.
impl DatadirTimeline for LayeredTimeline {
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
if lsn >= *cached_lsn {
return Some(*nblocks);
}
}
None
}
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
match rel_size_cache.entry(tag) {
Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
*cached_lsn = (lsn, nblocks);
}
}
Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
}
}
}
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.insert(tag, (lsn, nblocks));
}
fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.remove(tag);
}
}
impl DatadirTimeline for LayeredTimeline {}
///
/// Information about how much history needs to be retained, needed by
@@ -443,6 +377,8 @@ impl Timeline for LayeredTimeline {
/// Look up the value with the given a key
fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes> {
debug_assert!(lsn <= self.get_last_record_lsn());
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
// The cached image can be returned directly if there is no WAL between the cached image
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
@@ -682,7 +618,6 @@ impl LayeredTimeline {
repartition_threshold: 0,
last_received_wal: Mutex::new(None),
rel_size_cache: RwLock::new(HashMap::new()),
};
result.repartition_threshold = result.get_checkpoint_distance() / 10;
result

View File

@@ -22,7 +22,7 @@ pub mod walreceiver;
pub mod walrecord;
pub mod walredo;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use tracing::info;
use crate::thread_mgr::ThreadKind;
@@ -42,14 +42,14 @@ pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
lazy_static! {
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
"pageserver_live_connections",
"Number of live network connections",
&["pageserver_connection_kind"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
pub const LOG_FILE_NAME: &str = "pageserver.log";

View File

@@ -55,6 +55,7 @@ use utils::{
use crate::layered_repository::writeback_ephemeral_file;
use crate::repository::Key;
// TODO move ownership into a new PageserverState struct
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;

View File

@@ -11,7 +11,7 @@
use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use regex::Regex;
use std::io::{self, Read};
use std::net::TcpListener;
@@ -434,15 +434,15 @@ const TIME_BUCKETS: &[f64] = &[
0.1, // 1/10 s
];
static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling",
&["smgr_query_type", "tenant_id", "timeline_id"],
TIME_BUCKETS.into()
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
impl PageServerHandler {
pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {

View File

@@ -56,16 +56,13 @@ pub trait DatadirTimeline: Timeline {
/// This provides a transaction-like interface to perform a bunch
/// of modifications atomically.
///
/// To ingest a WAL record, call begin_modification(lsn) to get a
/// To ingest a WAL record, call begin_modification() to get a
/// DatadirModification object. Use the functions in the object to
/// modify the repository state, updating all the pages and metadata
/// that the WAL record affects. When you're done, call commit() to
/// commit the changes.
/// that the WAL record affects. When you're done, call commit(lsn) to
/// commit the changes. All the changes will be stamped with the specified LSN.
///
/// Lsn stored in modification is advanced by `ingest_record` and
/// is used by `commit()` to update `last_record_lsn`.
///
/// Calling commit() will flush all the changes and reset the state,
/// Calling commit(lsn) will flush all the changes and reset the state,
/// so the `DatadirModification` struct can be reused to perform the next modification.
///
/// Note that any pending modifications you make through the
@@ -73,7 +70,7 @@ pub trait DatadirTimeline: Timeline {
/// functions of the timeline until you finish! And if you update the
/// same page twice, the last update wins.
///
fn begin_modification(&self, lsn: Lsn) -> DatadirModification<Self>
fn begin_modification(&self) -> DatadirModification<Self>
where
Self: Sized,
{
@@ -82,7 +79,6 @@ pub trait DatadirTimeline: Timeline {
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
lsn,
}
}
@@ -124,10 +120,6 @@ pub trait DatadirTimeline: Timeline {
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
ensure!(tag.relnode != 0, "invalid relnode");
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(nblocks);
}
if (tag.forknum == pg_constants::FSM_FORKNUM
|| tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, lsn)?
@@ -141,21 +133,13 @@ pub trait DatadirTimeline: Timeline {
let key = rel_size_to_key(tag);
let mut buf = self.get(key, lsn)?;
let nblocks = buf.get_u32_le();
// Update relation size cache
self.update_cached_rel_size(tag, lsn, nblocks);
Ok(nblocks)
Ok(buf.get_u32_le())
}
/// Does relation exist?
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
ensure!(tag.relnode != 0, "invalid relnode");
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(true);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = self.get(key, lsn)?;
@@ -461,18 +445,6 @@ pub trait DatadirTimeline: Timeline {
Ok(result.to_keyspace())
}
/// Get cached size of relation if it not updated after specified LSN
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber>;
/// Update cached relation size if there is no more recent update
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
/// Store cached relation size
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
/// Remove cached relation size
fn remove_cached_rel_size(&self, tag: &RelTag);
}
/// DatadirModification represents an operation to ingest an atomic set of
@@ -485,9 +457,6 @@ pub struct DatadirModification<'a, T: DatadirTimeline> {
/// in the state in 'tline' yet.
pub tline: &'a T,
/// Lsn assigned by begin_modification
pub lsn: Lsn,
// The modifications are not applied directly to the underlying key-value store.
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
@@ -697,11 +666,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
self.pending_nblocks += nblocks as isize;
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Even if nblocks > 0, we don't insert any actual blocks here. That's up to the
// caller.
Ok(())
}
@@ -717,9 +684,6 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Update logical database size.
self.pending_nblocks -= old_size as isize - nblocks as isize;
Ok(())
@@ -739,9 +703,6 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
self.pending_nblocks += nblocks as isize - old_size as isize;
}
Ok(())
@@ -767,9 +728,6 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
let old_size = self.get(size_key)?.get_u32_le();
self.pending_nblocks -= old_size as isize;
// Remove enty from relation size cache
self.tline.remove_cached_rel_size(&rel);
// Delete size entry, as well as all blocks
self.delete(rel_key_range(rel));
@@ -884,7 +842,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub fn flush(&mut self) -> Result<()> {
pub fn flush(&mut self, lsn: Lsn) -> Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -898,7 +856,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
let mut result: Result<()> = Ok(());
self.pending_updates.retain(|&key, value| {
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
result = writer.put(key, self.lsn, value);
result = writer.put(key, lsn, value);
false
} else {
true
@@ -919,9 +877,9 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
/// underlying timeline.
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub fn commit(&mut self) -> Result<()> {
pub fn commit(&mut self, lsn: Lsn) -> Result<()> {
let writer = self.tline.writer();
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
@@ -1366,9 +1324,9 @@ pub fn create_test_timeline<R: Repository>(
timeline_id: utils::zid::ZTimelineId,
) -> Result<std::sync::Arc<R::Timeline>> {
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
let mut m = tline.begin_modification(Lsn(8));
let mut m = tline.begin_modification();
m.init_empty()?;
m.commit()?;
m.commit(Lsn(8))?;
Ok(tline)
}

View File

@@ -408,7 +408,7 @@ pub trait TimelineWriter<'a> {
#[cfg(test)]
pub mod repo_harness {
use bytes::BytesMut;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
use std::{fs, path::PathBuf};
@@ -439,7 +439,9 @@ pub mod repo_harness {
buf.freeze()
}
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
lazy_static! {
static ref LOCK: RwLock<()> = RwLock::new(());
}
impl From<TenantConf> for TenantConfOpt {
fn from(tenant_conf: TenantConf) -> Self {
@@ -587,10 +589,11 @@ mod tests {
//use std::sync::Arc;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
lazy_static! {
static ref TEST_KEY: Key = Key::from_slice(&hex!("112222222233333333444444445500000001"));
}
#[test]
fn test_basic() -> Result<()> {

View File

@@ -155,7 +155,8 @@ use std::{
use anyhow::{anyhow, bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use once_cell::sync::{Lazy, OnceCell};
use lazy_static::lazy_static;
use once_cell::sync::OnceCell;
use remote_storage::{GenericRemoteStorage, RemoteStorage};
use tokio::{
fs,
@@ -183,8 +184,8 @@ use crate::{
};
use metrics::{
register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec,
IntCounterVec, IntGauge,
register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge,
HistogramVec, IntCounter, IntCounterVec, IntGauge,
};
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
@@ -192,34 +193,34 @@ use self::download::download_index_parts;
pub use self::download::gather_tenant_timelines_index_parts;
pub use self::download::TEMP_DOWNLOAD_EXTENSION;
static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
lazy_static! {
static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!(
"pageserver_remote_storage_remaining_sync_items",
"Number of storage sync items left in the queue"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge")
});
static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!(
"pageserver_remote_storage_fatal_task_failures_total",
"Number of critically failed tasks"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
"pageserver_remote_storage_image_sync_seconds",
"Time took to synchronize (download or upload) a whole pageserver image. \
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
&["tenant_id", "timeline_id", "operation_kind", "status"],
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
)
.expect("failed to register pageserver image sync time histogram vec")
});
static REMOTE_INDEX_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
.expect("failed to register pageserver image sync time histogram vec");
static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!(
"pageserver_remote_storage_remote_index_uploads_total",
"Number of remote index uploads",
&["tenant_id", "timeline_id"],
)
.expect("failed to register pageserver remote index upload vec")
});
.expect("failed to register pageserver remote index upload vec");
}
// TODO move ownership into a new PageserverState struct
static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
/// A timeline status to share with pageserver's sync counterpart,

View File

@@ -130,7 +130,6 @@ where
tenant_path.display()
)
})?;
let timelines = storage
.list_prefixes(Some(tenant_storage_path))
.await
@@ -141,13 +140,6 @@ where
)
})?;
if timelines.is_empty() {
anyhow::bail!(
"no timelines found on the remote storage for tenant {}",
tenant_id
)
}
let mut sync_ids = HashSet::new();
for timeline_remote_storage_key in timelines {

View File

@@ -4,7 +4,7 @@ use std::{fmt::Debug, path::PathBuf};
use anyhow::Context;
use futures::stream::{FuturesUnordered, StreamExt};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use remote_storage::RemoteStorage;
use tokio::fs;
use tracing::{debug, error, info, warn};
@@ -20,14 +20,14 @@ use crate::{
};
use metrics::{register_int_counter_vec, IntCounterVec};
static NO_LAYERS_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
lazy_static! {
static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!(
"pageserver_remote_storage_no_layers_uploads_total",
"Number of skipped uploads due to no layers",
&["tenant_id", "timeline_id"],
)
.expect("failed to register pageserver no layers upload vec")
});
.expect("failed to register pageserver no layers upload vec");
}
/// Serializes and uploads the given index part data to the remote storage.
pub(super) async fn upload_index_part<P, S>(

View File

@@ -25,27 +25,26 @@ use utils::lsn::Lsn;
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
// TODO move ownership into a new PageserverState struct
mod tenants_state {
use anyhow::ensure;
use once_cell::sync::Lazy;
use std::{
collections::HashMap,
sync::{RwLock, RwLockReadGuard, RwLockWriteGuard},
};
use tokio::sync::mpsc;
use tracing::{debug, error};
use utils::zid::ZTenantId;
use crate::tenant_mgr::{LocalTimelineUpdate, Tenant};
static TENANTS: Lazy<RwLock<HashMap<ZTenantId, Tenant>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
/// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
/// so that it can enable/disable corresponding processes.
static TIMELINE_UPDATE_SENDER: Lazy<
RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>>,
> = Lazy::new(|| RwLock::new(None));
lazy_static::lazy_static! {
static ref TENANTS: RwLock<HashMap<ZTenantId, Tenant>> = RwLock::new(HashMap::new());
/// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
/// so that it can enable/disable corresponding processes.
static ref TIMELINE_UPDATE_SENDER: RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>> = RwLock::new(None);
}
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<ZTenantId, Tenant>> {
TENANTS

View File

@@ -87,6 +87,7 @@ async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
);
}
// TODO move ownership into a new PageserverState struct
static START_GC_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
static START_COMPACTION_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();

View File

@@ -45,20 +45,22 @@ use tokio::sync::watch;
use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use utils::zid::{ZTenantId, ZTimelineId};
use crate::shutdown_pageserver;
/// Each thread that we track is associated with a "thread ID". It's just
/// an increasing number that we assign, not related to any system thread
/// id.
static NEXT_THREAD_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
// TODO move ownership into a new PageserverState struct
lazy_static! {
/// Each thread that we track is associated with a "thread ID". It's just
/// an increasing number that we assign, not related to any system thread
/// id.
static ref NEXT_THREAD_ID: AtomicU64 = AtomicU64::new(1);
/// Global registry of threads
static THREADS: Lazy<Mutex<HashMap<u64, Arc<PageServerThread>>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
/// Global registry of threads
static ref THREADS: Mutex<HashMap<u64, Arc<PageServerThread>>> = Mutex::new(HashMap::new());
}
// There is a Tokio watch channel for each thread, which can be used to signal the
// thread that it needs to shut down. This thread local variable holds the receiving

View File

@@ -10,7 +10,7 @@
//! This is similar to PostgreSQL's virtual file descriptor facility in
//! src/backend/storage/file/fd.c
//!
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use once_cell::sync::OnceCell;
use std::fs::{File, OpenOptions};
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
@@ -32,24 +32,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
1.0, // 1 sec
];
static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation", "tenant_id", "timeline_id"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric")
});
static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
.expect("failed to define a metric");
}
lazy_static! {
static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
.expect("failed to define a metric");
}
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally

View File

@@ -30,6 +30,8 @@ use anyhow::Result;
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use std::collections::HashMap;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::walrecord::*;
@@ -46,6 +48,8 @@ pub struct WalIngest<'a, T: DatadirTimeline> {
checkpoint: CheckPoint,
checkpoint_modified: bool,
relsize_cache: HashMap<RelTag, BlockNumber>,
}
impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
@@ -60,13 +64,13 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
timeline,
checkpoint,
checkpoint_modified: false,
relsize_cache: HashMap::new(),
})
}
///
/// Decode a PostgreSQL WAL record and store it in the repository, in the given timeline.
///
/// This function updates `lsn` field of `DatadirModification`
///
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.
@@ -78,7 +82,6 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
modification: &mut DatadirModification<T>,
decoded: &mut DecodedWALRecord,
) -> Result<()> {
modification.lsn = lsn;
decode_wal_record(recdata, decoded).context("failed decoding wal record")?;
let mut buf = decoded.record.clone();
@@ -257,7 +260,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit()?;
modification.commit(lsn)?;
Ok(())
}
@@ -405,7 +408,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = self.get_relsize(vm_rel, modification.lsn)?;
let vm_size = self.get_relsize(vm_rel)?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -877,6 +880,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
modification: &mut DatadirModification<T>,
rel: RelTag,
) -> Result<()> {
self.relsize_cache.insert(rel, 0);
modification.put_rel_creation(rel, 0)?;
Ok(())
}
@@ -912,6 +916,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
nblocks: BlockNumber,
) -> Result<()> {
modification.put_rel_truncation(rel, nblocks)?;
self.relsize_cache.insert(rel, nblocks);
Ok(())
}
@@ -921,16 +926,23 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
rel: RelTag,
) -> Result<()> {
modification.put_rel_drop(rel)?;
self.relsize_cache.remove(&rel);
Ok(())
}
fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result<BlockNumber> {
let nblocks = if !self.timeline.get_rel_exists(rel, lsn)? {
0
fn get_relsize(&mut self, rel: RelTag) -> Result<BlockNumber> {
if let Some(nblocks) = self.relsize_cache.get(&rel) {
Ok(*nblocks)
} else {
self.timeline.get_rel_size(rel, lsn)?
};
Ok(nblocks)
let last_lsn = self.timeline.get_last_record_lsn();
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
0
} else {
self.timeline.get_rel_size(rel, last_lsn)?
};
self.relsize_cache.insert(rel, nblocks);
Ok(nblocks)
}
}
fn handle_rel_extend(
@@ -940,16 +952,22 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
blknum: BlockNumber,
) -> Result<()> {
let new_nblocks = blknum + 1;
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = modification.lsn;
let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
// create it with 0 size initially, the logic below will extend it
modification.put_rel_creation(rel, 0)?;
0
let old_nblocks = if let Some(nblocks) = self.relsize_cache.get(&rel) {
*nblocks
} else {
self.timeline.get_rel_size(rel, last_lsn)?
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = self.timeline.get_last_record_lsn();
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
// create it with 0 size initially, the logic below will extend it
modification.put_rel_creation(rel, 0)?;
0
} else {
self.timeline.get_rel_size(rel, last_lsn)?
};
self.relsize_cache.insert(rel, nblocks);
nblocks
};
if new_nblocks > old_nblocks {
@@ -960,6 +978,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
for gap_blknum in old_nblocks..blknum {
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
}
self.relsize_cache.insert(rel, new_nblocks);
}
Ok(())
}
@@ -1050,10 +1069,10 @@ mod tests {
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
fn init_walingest_test<T: DatadirTimeline>(tline: &T) -> Result<WalIngest<T>> {
let mut m = tline.begin_modification(Lsn(0x10));
let mut m = tline.begin_modification();
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
m.commit()?;
m.commit(Lsn(0x10))?;
let walingest = WalIngest::new(tline, Lsn(0x10))?;
Ok(walingest)
@@ -1065,19 +1084,19 @@ mod tests {
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut m = tline.begin_modification(Lsn(0x20));
let mut m = tline.begin_modification();
walingest.put_rel_creation(&mut m, TESTREL_A)?;
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x30));
m.commit(Lsn(0x20))?;
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x40));
m.commit(Lsn(0x30))?;
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x50));
m.commit(Lsn(0x40))?;
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
m.commit()?;
m.commit(Lsn(0x50))?;
assert_current_logical_size(&*tline, Lsn(0x50));
@@ -1123,9 +1142,9 @@ mod tests {
);
// Truncate last block
let mut m = tline.begin_modification(Lsn(0x60));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?;
m.commit()?;
m.commit(Lsn(0x60))?;
assert_current_logical_size(&*tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -1147,15 +1166,15 @@ mod tests {
);
// Truncate to zero length
let mut m = tline.begin_modification(Lsn(0x68));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?;
m.commit()?;
m.commit(Lsn(0x68))?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0);
// Extend from 0 to 2 blocks, leaving a gap
let mut m = tline.begin_modification(Lsn(0x70));
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?;
m.commit()?;
m.commit(Lsn(0x70))?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2);
assert_eq!(
tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?,
@@ -1167,9 +1186,9 @@ mod tests {
);
// Extend a lot more, leaving a big gap that spans across segments
let mut m = tline.begin_modification(Lsn(0x80));
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?;
m.commit()?;
m.commit(Lsn(0x80))?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501);
for blk in 2..1500 {
assert_eq!(
@@ -1193,18 +1212,18 @@ mod tests {
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut m = tline.begin_modification(Lsn(0x20));
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
m.commit()?;
m.commit(Lsn(0x20))?;
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1);
// Drop rel
let mut m = tline.begin_modification(Lsn(0x30));
let mut m = tline.begin_modification();
walingest.put_rel_drop(&mut m, TESTREL_A)?;
m.commit()?;
m.commit(Lsn(0x30))?;
// Check that rel is not visible anymore
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
@@ -1213,9 +1232,9 @@ mod tests {
//assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none());
// Re-create it
let mut m = tline.begin_modification(Lsn(0x40));
let mut m = tline.begin_modification();
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?;
m.commit()?;
m.commit(Lsn(0x40))?;
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
@@ -1235,12 +1254,12 @@ mod tests {
// Create a 20 MB relation (the size is arbitrary)
let relsize = 20 * 1024 * 1024 / 8192;
let mut m = tline.begin_modification(Lsn(0x20));
let mut m = tline.begin_modification();
for blkno in 0..relsize {
let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
}
m.commit()?;
m.commit(Lsn(0x20))?;
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
@@ -1261,9 +1280,9 @@ mod tests {
// Truncate relation so that second segment was dropped
// - only leave one page
let mut m = tline.begin_modification(Lsn(0x60));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?;
m.commit()?;
m.commit(Lsn(0x60))?;
// Check reported size and contents after truncation
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1);
@@ -1291,12 +1310,12 @@ mod tests {
// Extend relation again.
// Add enough blocks to create second segment
let lsn = Lsn(0x80);
let mut m = tline.begin_modification(lsn);
let mut m = tline.begin_modification();
for blkno in 0..relsize {
let data = format!("foo blk {} at {}", blkno, lsn);
walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
}
m.commit()?;
m.commit(lsn)?;
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize);
@@ -1324,10 +1343,10 @@ mod tests {
let mut lsn = 0x10;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
lsn += 0x10;
let mut m = tline.begin_modification(Lsn(lsn));
let mut m = tline.begin_modification();
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?;
m.commit()?;
m.commit(Lsn(lsn))?;
}
assert_current_logical_size(&*tline, Lsn(lsn));
@@ -1339,9 +1358,9 @@ mod tests {
// Truncate one block
lsn += 0x10;
let mut m = tline.begin_modification(Lsn(lsn));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?;
m.commit()?;
m.commit(Lsn(lsn))?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE
@@ -1350,9 +1369,9 @@ mod tests {
// Truncate another block
lsn += 0x10;
let mut m = tline.begin_modification(Lsn(lsn));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?;
m.commit()?;
m.commit(Lsn(lsn))?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE - 1
@@ -1364,9 +1383,9 @@ mod tests {
let mut size: i32 = 3000;
while size >= 0 {
lsn += 0x10;
let mut m = tline.begin_modification(Lsn(lsn));
let mut m = tline.begin_modification();
walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?;
m.commit()?;
m.commit(Lsn(lsn))?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
size as BlockNumber

View File

@@ -66,7 +66,7 @@ pub fn init_wal_receiver_main_thread(
);
let broker_prefix = &conf.broker_etcd_prefix;
info!(
"Starting wal receiver main thread, etcd endpoints: {}",
"Starting wal receiver main thread, etdc endpoints: {}",
etcd_endpoints.iter().map(Url::to_string).join(", ")
);

View File

@@ -154,7 +154,7 @@ pub async fn handle_walreceiver_connection(
{
let mut decoded = DecodedWALRecord::default();
let mut modification = timeline.begin_modification(endlsn);
let mut modification = timeline.begin_modification();
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// let _enter = info_span!("processing record", lsn = %lsn).entered();

View File

@@ -20,8 +20,8 @@
//!
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use nix::poll::*;
use once_cell::sync::Lazy;
use serde::Serialize;
use std::fs;
use std::fs::OpenOptions;
@@ -105,27 +105,21 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
// We collect the time spent in actual WAL redo ('redo'), and time waiting
// for access to the postgres process ('wait') since there is only one for
// each tenant.
static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
.expect("failed to define a metric")
});
static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
lazy_static! {
static ref WAL_REDO_TIME: Histogram =
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
.expect("failed to define a metric");
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the WAL redo process"
)
.expect("failed to define a metric")
});
static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
.expect("failed to define a metric");
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
"pageserver_replayed_wal_records_total",
"Number of WAL records replayed in WAL redo process"
)
.unwrap()
});
.unwrap();
}
///
/// This is the real implementation that uses a Postgres process to

View File

@@ -14,7 +14,7 @@ hashbrown = "0.11.2"
hex = "0.4.3"
hmac = "0.12.1"
hyper = "0.14"
once_cell = "1.13.0"
lazy_static = "1.4.0"
md5 = "0.7.0"
parking_lot = "0.12"
pin-project-lite = "0.2.7"

View File

@@ -12,12 +12,13 @@ use crate::{
stream::PqStream,
waiters::{self, Waiter, Waiters},
};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
static CPLANE_WAITERS: Lazy<Waiters<mgmt::ComputeReady>> = Lazy::new(Default::default);
lazy_static! {
static ref CPLANE_WAITERS: Waiters<mgmt::ComputeReady> = Default::default();
}
/// Give caller an opportunity to wait for the cloud's reply.
pub async fn with_waiter<R, T, E>(

View File

@@ -4,8 +4,8 @@ use crate::config::{ProxyConfig, TlsConfig};
use crate::stream::{MetricsStream, PqStream, Stream};
use anyhow::{bail, Context};
use futures::TryFutureExt;
use lazy_static::lazy_static;
use metrics::{register_int_counter, IntCounter};
use once_cell::sync::Lazy;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::{BeMessage as Be, *};
@@ -13,29 +13,23 @@ use utils::pq_proto::{BeMessage as Be, *};
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
lazy_static! {
static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!(
"proxy_accepted_connections_total",
"Number of TCP client connections accepted."
)
.unwrap()
});
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
.unwrap();
static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!(
"proxy_closed_connections_total",
"Number of TCP client connections closed."
)
.unwrap()
});
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
.unwrap();
static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!(
"proxy_io_bytes_total",
"Number of bytes sent/received between any client and backend."
)
.unwrap()
});
.unwrap();
}
/// A small combinator for pluggable error logging.
async fn log_error<R, F>(future: F) -> F::Output

View File

@@ -1,8 +1,4 @@
[pytest]
filterwarnings =
error::pytest.PytestUnhandledThreadExceptionWarning
error::UserWarning
ignore:record_property is incompatible with junit_family:pytest.PytestWarning
addopts =
-m 'not remote_cluster'
markers =

View File

@@ -9,6 +9,7 @@ bytes = "1.0.1"
byteorder = "1.4.3"
hyper = "0.14"
fs2 = "0.4.3"
lazy_static = "1.4.0"
serde_json = "1"
tracing = "0.1.27"
clap = "3.0"
@@ -28,7 +29,7 @@ const_format = "0.2.21"
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
git-version = "0.3.5"
async-trait = "0.1"
once_cell = "1.13.0"
once_cell = "1.10.0"
toml_edit = { version = "0.13", features = ["easy"] }
postgres_ffi = { path = "../libs/postgres_ffi" }

View File

@@ -2,7 +2,7 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use std::fs::{self, File, OpenOptions};
use std::io::{Read, Write};
@@ -26,15 +26,15 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control";
const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
static PERSIST_CONTROL_FILE_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_persist_control_file_seconds",
"Seconds to persist and sync control file, grouped by timeline",
&["tenant_id", "timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_persist_control_file_seconds histogram vec")
});
.expect("Failed to register safekeeper_persist_control_file_seconds histogram vec");
}
/// Storage should keep actual state inside of it. It should implement Deref
/// trait to access state fields and have persist method for updating that state.

View File

@@ -4,7 +4,7 @@
use anyhow::{bail, Context, Result};
use etcd_broker::subscription_value::SkTimelineInfo;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use postgres_ffi::xlog_utils::XLogSegNo;
use serde::Serialize;
@@ -559,12 +559,12 @@ struct GlobalTimelinesState {
wal_backup_launcher_tx: Option<Sender<ZTenantTimelineId>>,
}
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
Mutex::new(GlobalTimelinesState {
lazy_static! {
static ref TIMELINES_STATE: Mutex<GlobalTimelinesState> = Mutex::new(GlobalTimelinesState {
timelines: HashMap::new(),
wal_backup_launcher_tx: None,
})
});
});
}
#[derive(Clone, Copy, Serialize)]
pub struct TimelineDeleteForceResult {

View File

@@ -12,7 +12,7 @@ use std::io::{self, Seek, SeekFrom};
use std::pin::Pin;
use tokio::io::AsyncRead;
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use postgres_ffi::xlog_utils::{
find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, PG_TLI,
};
@@ -38,44 +38,31 @@ use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECOND
use tokio::io::{AsyncReadExt, AsyncSeekExt};
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
// i64 is faster than f64, so update to u64 when available.
static WRITE_WAL_BYTES: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
lazy_static! {
// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
// i64 is faster than f64, so update to u64 when available.
static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!(
"safekeeper_write_wal_bytes",
"Bytes written to WAL in a single request, grouped by timeline",
&["tenant_id", "timeline_id"],
vec![
1.0,
10.0,
100.0,
1024.0,
8192.0,
128.0 * 1024.0,
1024.0 * 1024.0,
10.0 * 1024.0 * 1024.0
]
vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0]
)
.expect("Failed to register safekeeper_write_wal_bytes histogram vec")
});
static WRITE_WAL_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
.expect("Failed to register safekeeper_write_wal_bytes histogram vec");
static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_write_wal_seconds",
"Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline",
&["tenant_id", "timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_write_wal_seconds histogram vec")
});
static FLUSH_WAL_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
.expect("Failed to register safekeeper_write_wal_seconds histogram vec");
static ref FLUSH_WAL_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_flush_wal_seconds",
"Seconds spent syncing WAL to a disk, grouped by timeline",
&["tenant_id", "timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_flush_wal_seconds histogram vec")
});
.expect("Failed to register safekeeper_flush_wal_seconds histogram vec");
}
struct WalStorageMetrics {
write_wal_bytes: Histogram,

View File

@@ -1,708 +0,0 @@
#
# Script to export tenants from one pageserver and import them into another page server.
#
# Outline of steps:
# 1. Get `(last_lsn, prev_lsn)` from old pageserver
# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file
# 3. This tar file might be missing relation files for empty relations, if the pageserver
# is old enough (we didn't always store those). So to recreate them, we start a local
# vanilla postgres on this basebackup and ask it what relations should exist, then touch
# any missing files and re-pack the tar.
# TODO This functionality is no longer needed, so we can delete it later if we don't
# end up using the same utils for the pg 15 upgrade. Not sure.
# 4. We import the patched basebackup into a new pageserver
# 5. We export again via fullbackup, now from the new pageserver and compare the returned
# tar file with the one we imported. This confirms that we imported everything that was
# exported, but doesn't guarantee correctness (what if we didn't **export** everything
# initially?)
# 6. We wait for the new pageserver's remote_consistent_lsn to catch up
#
# For more context on how to use this, see:
# https://github.com/neondatabase/cloud/wiki/Storage-format-migration
import os
from os import path
import shutil
from pathlib import Path
import tempfile
from contextlib import closing
import psycopg2
import subprocess
import argparse
import time
import requests
import uuid
from psycopg2.extensions import connection as PgConnection
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple
###############################################
### client-side utils copied from test fixtures
###############################################
Env = Dict[str, str]
_global_counter = 0


def global_counter() -> int:
    """Return the next value of a process-wide counter (starting at 1).

    Handy for giving output files a unique number, so repeated runs of the
    same command keep their captured output separate.
    """
    global _global_counter
    _global_counter = _global_counter + 1
    return _global_counter
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
    """Run a process with stdout/stderr captured to files.

    Output goes to "<prog>_NNN.stdout" and "<prog>_NNN.stderr" in
    *capture_dir*, where <prog> is the program's basename and NNN is a
    unique incrementing counter. Pre-existing files are overwritten.

    Returns the common base path (without extension) of the capture files.
    """
    assert type(cmd) is list
    base = f"{os.path.basename(cmd[0])}_{global_counter()}"
    basepath = os.path.join(capture_dir, base)

    with open(basepath + '.stdout', 'w') as stdout_f, \
            open(basepath + '.stderr', 'w') as stderr_f:
        print(f'(capturing output to "{base}.stdout")')
        subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)

    return basepath
class PgBin:
    """A helper for executing PostgreSQL binaries from a distribution dir.

    Resolves bare program names against the distribution's bin/ directory
    and runs them with LD_LIBRARY_PATH pointing at its lib/ directory.
    """
    def __init__(self, log_dir: Path, pg_distrib_dir):
        self.log_dir = log_dir
        distrib = str(pg_distrib_dir)
        self.pg_bin_path = os.path.join(distrib, 'bin')
        self.env = os.environ.copy()
        self.env['LD_LIBRARY_PATH'] = os.path.join(distrib, 'lib')

    def _fixpath(self, command: List[str]):
        # Prefix a bare program name with the pg bin directory; anything
        # containing a '/' is treated as an explicit path and left alone.
        if '/' not in command[0]:
            command[0] = os.path.join(self.pg_bin_path, command[0])

    def _build_env(self, env_add: Optional[Env]) -> Env:
        # Overlay caller-supplied variables on a copy of the captured env;
        # with no additions, reuse the shared env object as-is.
        if env_add is None:
            return self.env
        merged = self.env.copy()
        merged.update(env_add)
        return merged

    def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
        """Run one of the postgres binaries, output passing through.

        *command* is list form, e.g. ['pgbench', '-p', '55432']; if its
        first element has no '/' it is resolved against the pg bin dir.
        Use `run_capture` to capture stdout/stderr to files instead.
        """
        self._fixpath(command)
        print('Running command "{}"'.format(' '.join(command)))
        subprocess.run(command, env=self._build_env(env), cwd=cwd, check=True)

    def run_capture(self,
                    command: List[str],
                    env: Optional[Env] = None,
                    cwd: Optional[str] = None,
                    **kwargs: Any) -> str:
        """Like `run`, but stdout/stderr are redirected to capture files.

        Returns the base path of the files holding the captured output.
        """
        self._fixpath(command)
        print('Running command "{}"'.format(' '.join(command)))
        return subprocess_capture(str(self.log_dir),
                                  command,
                                  env=self._build_env(env),
                                  cwd=cwd,
                                  check=True,
                                  **kwargs)
class PgProtocol:
""" Reusable connection logic """
def __init__(self, **kwargs):
self.default_options = kwargs
def conn_options(self, **kwargs):
conn_options = self.default_options.copy()
if 'dsn' in kwargs:
conn_options.update(parse_dsn(kwargs['dsn']))
conn_options.update(kwargs)
# Individual statement timeout in seconds. 2 minutes should be
# enough for our tests, but if you need a longer, you can
# change it by calling "SET statement_timeout" after
# connecting.
if 'options' in conn_options:
conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options']
else:
conn_options['options'] = "-cstatement_timeout=120s"
return conn_options
# autocommit=True here by default because that's what we need most of the time
def connect(self, autocommit=True, **kwargs) -> PgConnection:
"""
Connect to the node.
Returns psycopg2's connection object.
This method passes all extra params to connstr.
"""
conn = psycopg2.connect(**self.conn_options(**kwargs))
# WARNING: this setting affects *all* tests!
conn.autocommit = autocommit
return conn
def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]:
"""
Execute query against the node and return all rows.
This method passes all extra params to connstr.
"""
return self.safe_psql_many([query], **kwargs)[0]
def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
"""
Execute queries against the node and return all rows.
This method passes all extra params to connstr.
"""
result: List[List[Any]] = []
with closing(self.connect(**kwargs)) as conn:
with conn.cursor() as cur:
for query in queries:
print(f"Executing query: {query}")
cur.execute(query)
if cur.description is None:
result.append([]) # query didn't return data
else:
result.append(cast(List[Any], cur.fetchall()))
return result
class VanillaPostgres(PgProtocol):
    """A locally initdb'ed vanilla Postgres instance, managed via pg_ctl.

    Usable as a context manager: the server is stopped on exit if it is
    still running.
    """
    def __init__(self, pgdatadir: Path, pg_bin: "PgBin", port: int, init=True):
        super().__init__(host='localhost', port=port, dbname='postgres')
        self.pgdatadir = pgdatadir
        self.pg_bin = pg_bin
        self.running = False
        if init:
            # Create a fresh data directory unless the caller brings one.
            self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)])
        self.configure([f"port = {port}\n"])

    def configure(self, options: List[str]):
        """Append lines into postgresql.conf file."""
        assert not self.running
        conf_path = os.path.join(self.pgdatadir, 'postgresql.conf')
        with open(conf_path, 'a') as conf_file:
            conf_file.write("\n".join(options))

    def start(self, log_path: Optional[str] = None):
        assert not self.running
        self.running = True

        if log_path is None:
            log_path = os.path.join(self.pgdatadir, "pg.log")

        self.pg_bin.run_capture(
            ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start'])

    def stop(self):
        assert self.running
        self.running = False
        self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop'])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Best-effort shutdown when leaving the context.
        if self.running:
            self.stop()
class NeonPageserverApiException(Exception):
    """Raised when the pageserver management API returns an error response."""
class NeonPageserverHttpClient(requests.Session):
    """HTTP client for the pageserver management API (v1 endpoints)."""
    def __init__(self, host, port):
        super().__init__()
        self.host = host
        self.port = port

    def verbose_error(self, res: requests.Response):
        """Raise NeonPageserverApiException for HTTP error responses,
        carrying the server's 'msg' field when the body provides one."""
        try:
            res.raise_for_status()
        except requests.RequestException as e:
            try:
                msg = res.json()['msg']
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrow to Exception.
            except Exception:
                msg = ''
            raise NeonPageserverApiException(msg) from e

    def check_status(self):
        self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status()

    def tenant_list(self):
        res = self.get(f"http://{self.host}:{self.port}/v1/tenant")
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, list)
        return res_json

    def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
        """Create a tenant; a 409 is tolerated iff *ok_if_exists*."""
        res = self.post(
            f"http://{self.host}:{self.port}/v1/tenant",
            json={
                'new_tenant_id': new_tenant_id.hex,
            },
        )
        if res.status_code == 409:
            if ok_if_exists:
                print(f'could not create tenant: already exists for id {new_tenant_id}')
            else:
                res.raise_for_status()
        elif res.status_code == 201:
            print(f'created tenant {new_tenant_id}')
        else:
            self.verbose_error(res)

        return new_tenant_id

    def timeline_list(self, tenant_id: uuid.UUID):
        res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, list)
        return res_json

    def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]:
        # BUG FIX: this endpoint previously hard-coded "localhost" instead
        # of self.host, unlike every other method in this class.
        res = self.get(
            f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1"
        )
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json
def lsn_to_hex(num: int) -> str:
    """ Convert lsn from int to standard hex notation. """
    high, low = divmod(num, 1 << 32)
    return f"{high:X}/{low:X}"
def lsn_from_hex(lsn_hex: str) -> int:
    """ Convert lsn from hex notation to int. """
    high, low = (int(part, 16) for part in lsn_hex.split('/'))
    return (high << 32) + low
def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient,
                          tenant: uuid.UUID,
                          timeline: uuid.UUID) -> int:
    """Return the timeline's remote_consistent_lsn, or 0 if nothing has
    been uploaded to remote storage yet."""
    remote_info = pageserver_http_client.timeline_detail(tenant, timeline)['remote']
    if remote_info is None:
        # Right after timeline creation, before any part of it has been
        # uploaded, there is no remote information at all.
        return 0
    lsn_str = remote_info['remote_consistent_lsn']
    assert isinstance(lsn_str, str)
    return lsn_from_hex(lsn_str)
def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient,
                    tenant: uuid.UUID,
                    timeline: uuid.UUID,
                    lsn: int):
    """waits for local timeline upload up to specified lsn"""
    max_attempts = 10
    current_lsn = 0
    for attempt in range(1, max_attempts + 1):
        current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
        if current_lsn >= lsn:
            return
        print("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
            lsn_to_hex(lsn), lsn_to_hex(current_lsn), attempt))
        time.sleep(1)
    raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
        lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
##############
# End of utils
##############
def pack_base(log_dir, restored_dir, output_tar):
    """Create tar file from basebackup, being careful to produce relative filenames."""
    # We cd into the directory and invoke tar from there: calling tar from
    # outside would encode absolute filenames, which don't parse well on
    # import.
    tmp_tar_name = "tmp.tar"
    cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir)
    subprocess_capture(log_dir, cmd, cwd=restored_dir)
    shutil.move(os.path.join(restored_dir, tmp_tar_name), output_tar)
def reconstruct_paths(log_dir, pg_bin, base_tar):
    """Reconstruct what relation files should exist in the datadir by querying postgres.

    Generator: unpacks base_tar into a temporary directory, starts a vanilla
    postgres on it, and yields the datadir-relative path of every relation
    file postgres reports via pg_relation_filepath(). template0 cannot be
    connected to directly, so a `template0copy` database is created from it
    and its paths are rewritten to point at template0's oid.
    """
    with tempfile.TemporaryDirectory() as restored_dir:
        # Unpack the base tar
        subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir])

        # Start a vanilla postgres from the given datadir and query it to find
        # what relfiles should exist, but possibly don't.
        # NOTE(review): port is a str here although VanillaPostgres annotates
        # int; it is only interpolated into config/connection strings.
        port = "55439"  # Probably free
        with VanillaPostgres(restored_dir, pg_bin, port, init=False) as vanilla_pg:
            vanilla_pg.configure([f"port={port}"])
            vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log"))

            # Create database based on template0 because we can't connect to template0
            query = "create database template0copy template template0"
            vanilla_pg.safe_psql(query, user="cloud_admin")
            vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin")

            # Get all databases
            query = "select oid, datname from pg_database"
            oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin")
            template0_oid = [
                oid for (oid, database) in oid_dbname_pairs if database == "template0"
            ][0]

            # Get rel paths for each database
            for oid, database in oid_dbname_pairs:
                if database == "template0":
                    # We can't connect to template0
                    continue

                query = "select relname, pg_relation_filepath(oid) from pg_class"
                result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database)
                for relname, filepath in result:
                    # filepath is None for relations without storage (e.g. views)
                    if filepath is not None:
                        if database == "template0copy":
                            # Add all template0copy paths to template0
                            prefix = f"base/{oid}/"
                            if filepath.startswith(prefix):
                                suffix = filepath[len(prefix):]
                                yield f"base/{template0_oid}/{suffix}"
                            elif filepath.startswith("global"):
                                print(f"skipping {database} global file {filepath}")
                            else:
                                raise AssertionError
                        else:
                            yield filepath
def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths):
    """Create empty files for the given paths inside a basebackup tar.

    Unpacks corrupt_tar, touches every path in `paths` that is absent, and
    repackages the result into output_tar.
    """
    with tempfile.TemporaryDirectory() as restored_dir:
        # Unpack the base tar
        subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir])

        # Touch files that don't exist
        for rel_path in paths:
            absolute_path = os.path.join(restored_dir, rel_path)
            if not os.path.exists(absolute_path):
                print(f"File {absolute_path} didn't exist. Creating..")
                Path(absolute_path).touch()

        # Repackage
        pack_base(log_dir, restored_dir, output_tar)
# HACK This is a workaround for exporting from old pageservers that
# can't export empty relations. In this case we need to start
# a vanilla postgres from the exported datadir, and query it
# to see what empty relations are missing, and then create
# those empty files before importing.
def add_missing_rels(base_tar, output_tar, log_dir, pg_bin):
    """Write output_tar: base_tar plus empty files for any relations postgres expects."""
    expected_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar))
    touch_missing_rels(log_dir, base_tar, output_tar, expected_paths)
def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
    """Query the pageserver for a timeline's last record LSN.

    Returns (last_lsn, prev_lsn) as reported by the pageserver's
    `get_last_record_rlsn` command.
    """
    conn = psycopg2.connect(pageserver_connstr)
    conn.autocommit = True
    try:
        with conn.cursor() as cur:
            cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
            cur.execute(cmd)
            res = cur.fetchone()
            prev_lsn = res[0]
            last_lsn = res[1]
    finally:
        # Close even when the query raises, so we don't leak a connection.
        conn.close()

    return last_lsn, prev_lsn
def import_timeline(args,
                    psql_path,
                    pageserver_connstr,
                    pageserver_http,
                    tenant_id,
                    timeline_id,
                    last_lsn,
                    prev_lsn,
                    tar_filename):
    """Import a timeline basebackup tar into a pageserver, then wait until
    the pageserver has persisted it up to last_lsn.

    NOTE(review): the import command passes last_lsn twice and prev_lsn is
    unused here -- confirm against the pageserver's `import basebackup`
    argument list.
    """
    # Import timelines to new pageserver
    import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}"
    full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """

    stderr_filename2 = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr")
    stdout_filename = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout")

    print(f"Running: {full_cmd}")

    with open(stdout_filename, 'w') as stdout_f:
        with open(stderr_filename2, 'w') as stderr_f:
            print(f"(capturing output to {stdout_filename})")
            pg_bin = PgBin(args.work_dir, args.pg_distrib_dir)
            subprocess.run(full_cmd,
                           stdout=stdout_f,
                           stderr=stderr_f,
                           env=pg_bin._build_env(None),
                           shell=True,
                           check=True)

    print("Done import")

    # Wait until pageserver persists the files
    wait_for_upload(pageserver_http,
                    uuid.UUID(tenant_id),
                    uuid.UUID(timeline_id),
                    lsn_from_hex(last_lsn))
def export_timeline(args,
                    psql_path,
                    pageserver_connstr,
                    tenant_id,
                    timeline_id,
                    last_lsn,
                    prev_lsn,
                    tar_filename):
    """Export a timeline from a pageserver as a fullbackup tar.

    The raw export is written to `<tar_filename>.incomplete`; any relation
    files the old pageserver failed to include are then added and the final
    result is written to tar_filename.
    """
    # Choose filenames
    incomplete_filename = tar_filename + ".incomplete"
    stderr_filename = path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr")

    # Construct export command
    query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}"
    cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query]

    # One PgBin serves both steps below (it was constructed twice before).
    pg_bin = PgBin(args.work_dir, args.pg_distrib_dir)

    # Run export command
    print(f"Running: {cmd}")
    with open(incomplete_filename, 'w') as stdout_f:
        with open(stderr_filename, 'w') as stderr_f:
            print(f"(capturing output to {incomplete_filename})")
            subprocess.run(cmd,
                           stdout=stdout_f,
                           stderr=stderr_f,
                           env=pg_bin._build_env(None),
                           check=True)

    # Add missing rels
    add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin)

    # Log more info
    file_size = os.path.getsize(tar_filename)
    print(f"Done export: {tar_filename}, size {file_size}")
def main(args: argparse.Namespace):
    """Migrate tenants between pageservers via export/import of basebackups.

    For each tenant: list its timelines on the old pageserver, export each
    selected timeline (unless --only-import), import it into the new
    pageserver, then re-export from the new pageserver and compare file
    sizes as a sanity check.
    """
    psql_path = str(Path(args.pg_distrib_dir) / "bin" / "psql")

    old_pageserver_host = args.old_pageserver_host
    new_pageserver_host = args.new_pageserver_host

    old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port)
    old_http_client.check_status()
    old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}"

    new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port)
    new_http_client.check_status()
    new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}"

    for tenant_id in args.tenants:
        print(f"Tenant: {tenant_id}")
        timelines = old_http_client.timeline_list(uuid.UUID(tenant_id))
        print(f"Timelines: {timelines}")

        # Create tenant in new pageserver
        if args.only_import is False and not args.timelines:
            new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists)

        for timeline in timelines:

            # Skip timelines we don't need to export
            if args.timelines and timeline['timeline_id'] not in args.timelines:
                print(f"Skipping timeline {timeline['timeline_id']}")
                continue

            # Choose filenames
            tar_filename = path.join(args.work_dir,
                                     f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar")

            # Always query the LSNs from the old pageserver: the import step
            # needs them even when export is skipped. (Previously this was
            # done only when exporting, so --only-import crashed with a
            # NameError on last_lsn.)
            last_lsn, prev_lsn = get_rlsn(
                old_pageserver_connstr,
                timeline['tenant_id'],
                timeline['timeline_id'],
            )

            # Export timeline from old pageserver
            if args.only_import is False:
                export_timeline(
                    args,
                    psql_path,
                    old_pageserver_connstr,
                    timeline['tenant_id'],
                    timeline['timeline_id'],
                    last_lsn,
                    prev_lsn,
                    tar_filename,
                )

            # Import into new pageserver
            import_timeline(
                args,
                psql_path,
                new_pageserver_connstr,
                new_http_client,
                timeline['tenant_id'],
                timeline['timeline_id'],
                last_lsn,
                prev_lsn,
                tar_filename,
            )

            # Re-export and compare
            re_export_filename = tar_filename + ".reexport"
            export_timeline(args,
                            psql_path,
                            new_pageserver_connstr,
                            timeline['tenant_id'],
                            timeline['timeline_id'],
                            last_lsn,
                            prev_lsn,
                            re_export_filename)

            # Check the size is the same. (No trailing commas here: they
            # previously turned old_size/new_size into accidental 1-tuples.)
            old_size = os.path.getsize(tar_filename)
            new_size = os.path.getsize(re_export_filename)
            if old_size != new_size:
                raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}")
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--tenant-id',
        dest='tenants',
        required=True,
        nargs='+',
        help='Id of the tenant to migrate. You can pass multiple arguments',
    )
    parser.add_argument(
        '--timeline-id',
        dest='timelines',
        required=False,
        nargs='+',
        help='Id of the timeline to migrate. You can pass multiple arguments',
    )
    parser.add_argument(
        '--from-host',
        dest='old_pageserver_host',
        required=True,
        help='Host of the pageserver to migrate data from',
    )
    parser.add_argument(
        '--from-http-port',
        dest='old_pageserver_http_port',
        required=False,
        type=int,
        default=9898,
        help='HTTP port of the pageserver to migrate data from. Default: 9898',
    )
    parser.add_argument(
        '--from-pg-port',
        dest='old_pageserver_pg_port',
        required=False,
        type=int,
        default=6400,
        help='pg port of the pageserver to migrate data from. Default: 6400',
    )
    parser.add_argument(
        '--to-host',
        dest='new_pageserver_host',
        required=True,
        help='Host of the pageserver to migrate data to',
    )
    parser.add_argument(
        '--to-http-port',
        dest='new_pageserver_http_port',
        required=False,
        default=9898,
        type=int,
        help='HTTP port of the pageserver to migrate data to. Default: 9898',
    )
    parser.add_argument(
        '--to-pg-port',
        dest='new_pageserver_pg_port',
        required=False,
        default=6400,
        type=int,
        help='pg port of the pageserver to migrate data to. Default: 6400',
    )
    parser.add_argument(
        '--ignore-tenant-exists',
        dest='ok_if_exists',
        required=False,
        # This is used as a boolean flag; without an action, argparse would
        # demand a value after the option.
        action='store_true',
        help=
        'Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.',
    )
    parser.add_argument(
        '--pg-distrib-dir',
        dest='pg_distrib_dir',
        required=False,
        default='/usr/local/',
        help='Path where postgres binaries are installed. Default: /usr/local/',
    )
    # NOTE(review): main() derives the psql path from --pg-distrib-dir and
    # never reads args.psql_path; the option is kept for CLI compatibility.
    parser.add_argument(
        '--psql-path',
        dest='psql_path',
        required=False,
        default='/usr/local/bin/psql',
        help='Path to the psql binary. Default: /usr/local/bin/psql',
    )
    parser.add_argument(
        '--only-import',
        dest='only_import',
        required=False,
        default=False,
        action='store_true',
        help='Skip export and tenant creation part',
    )
    parser.add_argument(
        '--work-dir',
        dest='work_dir',
        required=True,
        # default removed: it is meaningless together with required=True
        help='directory where temporary tar files are stored',
    )
    args = parser.parse_args()
    main(args)

View File

@@ -1,5 +1,6 @@
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
from fixtures.utils import query_scalar

View File

@@ -167,5 +167,3 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
# The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC.
with pytest.raises(Exception, match="invalid branch start lsn"):
env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn)
thread.join()

View File

@@ -60,38 +60,17 @@ def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID):
def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv):
env = neon_simple_env
with env.pageserver.http_client() as client:
tenant_id, timeline_id = env.neon_cli.create_tenant()
client = env.pageserver.http_client()
timeline_details = client.timeline_detail(tenant_id=tenant_id,
timeline_id=timeline_id,
include_non_incremental_logical_size=True)
tenant_id, timeline_id = env.neon_cli.create_tenant()
assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
timeline_details = client.timeline_detail(tenant_id=tenant_id,
timeline_id=timeline_id,
include_non_incremental_logical_size=True)
def expect_updated_msg_lsn(client: NeonPageserverHttpClient,
tenant_id: UUID,
timeline_id: UUID,
prev_msg_lsn: Optional[int]) -> int:
timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id)
# a successful `timeline_details` response must contain the below fields
local_timeline_details = timeline_details['local']
assert "wal_source_connstr" in local_timeline_details.keys()
assert "last_received_msg_lsn" in local_timeline_details.keys()
assert "last_received_msg_ts" in local_timeline_details.keys()
assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty"
last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"])
assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \
f"the last received message's LSN {last_msg_lsn} hasn't been updated \
compared to the previous message's LSN {prev_msg_lsn}"
return last_msg_lsn
assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running'
# Test the WAL-receiver related fields in the response to `timeline_details` API call
@@ -100,29 +79,44 @@ def expect_updated_msg_lsn(client: NeonPageserverHttpClient,
# `timeline_details` now.
def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
env = neon_simple_env
with env.pageserver.http_client() as client:
tenant_id, timeline_id = env.neon_cli.create_tenant()
pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
client = env.pageserver.http_client()
# Wait to make sure that we get a latest WAL receiver data.
# We need to wait here because it's possible that we don't have access to
# the latest WAL yet, when the `timeline_detail` API is first called.
# See: https://github.com/neondatabase/neon/issues/1768.
lsn = wait_until(number_of_iterations=5,
interval=1,
func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None))
tenant_id, timeline_id = env.neon_cli.create_tenant()
pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
# Make a DB modification then expect getting a new WAL receiver's data.
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
wait_until(number_of_iterations=5,
interval=1,
func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn))
def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int:
timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id)
# a successful `timeline_details` response must contain the below fields
local_timeline_details = timeline_details['local']
assert "wal_source_connstr" in local_timeline_details.keys()
assert "last_received_msg_lsn" in local_timeline_details.keys()
assert "last_received_msg_ts" in local_timeline_details.keys()
assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty"
last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"])
assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \
f"the last received message's LSN {last_msg_lsn} hasn't been updated \
compared to the previous message's LSN {prev_msg_lsn}"
return last_msg_lsn
# Wait to make sure that we get a latest WAL receiver data.
# We need to wait here because it's possible that we don't have access to
# the latest WAL yet, when the `timeline_detail` API is first called.
# See: https://github.com/neondatabase/neon/issues/1768.
lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None))
# Make a DB modification then expect getting a new WAL receiver's data.
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn))
def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
env = neon_simple_env
with env.pageserver.http_client() as client:
check_client(client, env.initial_tenant)
client = env.pageserver.http_client()
check_client(client, env.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
@@ -131,5 +125,5 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
management_token = env.auth_keys.generate_management_token()
with env.pageserver.http_client(auth_token=management_token) as client:
check_client(client, env.initial_tenant)
client = env.pageserver.http_client(auth_token=management_token)
check_client(client, env.initial_tenant)

View File

@@ -2,10 +2,11 @@
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
import shutil, os
from contextlib import closing
from pathlib import Path
import time
from uuid import UUID
from fixtures.neon_fixtures import NeonEnvBuilder, RemoteStorageKind, assert_timeline_local, available_remote_storages, wait_until, wait_for_last_record_lsn, wait_for_upload
from fixtures.neon_fixtures import NeonEnvBuilder, assert_timeline_local, wait_until, wait_for_last_record_lsn, wait_for_upload
from fixtures.log_helper import log
from fixtures.utils import lsn_from_hex, query_scalar
import pytest
@@ -28,19 +29,18 @@ import pytest
# * queries the specific data, ensuring that it matches the one stored before
#
# The tests are done for all types of remote storage pageserver supports.
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
def test_remote_storage_backup_and_restore(
neon_env_builder: NeonEnvBuilder,
remote_storatge_kind: RemoteStorageKind,
):
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3'])
def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, storage_type: str):
# Use this test to check more realistic SK ids: some etcd key parsing bugs were related,
# and this test needs SK to write data to pageserver, so it will be visible
neon_env_builder.safekeepers_id_start = 12
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storatge_kind,
test_name='test_remote_storage_backup_and_restore',
)
if storage_type == 'local_fs':
neon_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
data_id = 1
data_secret = 'very secret secret'
@@ -110,7 +110,7 @@ def test_remote_storage_backup_and_restore(
client.tenant_attach(UUID(tenant_id))
log.info("waiting for timeline redownload")
wait_until(number_of_iterations=20,
wait_until(number_of_iterations=10,
interval=1,
func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id)))

View File

@@ -1,19 +1,10 @@
from threading import Thread
from uuid import uuid4
import uuid
import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
def do_gc_target(env: NeonEnv, tenant_id: uuid.UUID, timeline_id: uuid.UUID):
"""Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
try:
env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0')
except Exception as e:
log.error("do_gc failed: %s", e)
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
@@ -45,7 +36,8 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0')
# try to concurrently run gc and detach
gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id))
gc_thread = Thread(
target=lambda: env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0'), )
gc_thread.start()
last_error = None

View File

@@ -229,7 +229,7 @@ def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path
# basebackup and importing it into the new pageserver.
# This kind of migration can tolerate breaking changes
# to storage format
'major',
pytest.param('major', marks=pytest.mark.xfail(reason="Not implemented")),
])
@pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
@@ -345,8 +345,6 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
# Migrate either by attaching from s3 or import/export basebackup
if method == "major":
cmd = [
"poetry",
"run",
"python",
os.path.join(base_dir, "scripts/export_import_between_pageservers.py"),
"--tenant-id",
@@ -363,12 +361,12 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
str(new_pageserver_http_port),
"--to-pg-port",
str(new_pageserver_pg_port),
"--pg-distrib-dir",
pg_distrib_dir,
"--psql-path",
os.path.join(pg_distrib_dir, "bin", "psql"),
"--work-dir",
os.path.join(test_output_dir),
]
subprocess_capture(test_output_dir, cmd, check=True)
subprocess_capture(str(env.repo_dir), cmd, check=True)
elif method == "minor":
# call to attach timeline to new pageserver
new_pageserver_http.tenant_attach(tenant_id)
@@ -429,22 +427,6 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder,
post_migration_check(pg_main, 500500, old_local_path_main)
post_migration_check(pg_second, 1001000, old_local_path_second)
# ensure that we can successfully read all relations on the new pageserver
with pg_cur(pg_second) as cur:
cur.execute('''
DO $$
DECLARE
r RECORD;
BEGIN
FOR r IN
SELECT relname FROM pg_class WHERE relkind='r'
LOOP
RAISE NOTICE '%', r.relname;
EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
END LOOP;
END$$;
''')
if with_load == 'with_load':
assert load_ok_event.wait(3)
log.info('stopping load thread')

View File

@@ -13,7 +13,7 @@ from uuid import UUID
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload
from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, wait_for_last_record_lsn, wait_for_upload
from fixtures.utils import lsn_from_hex
@@ -38,7 +38,7 @@ async def tenant_workload(env: NeonEnv, pg: Postgres):
async def all_tenants_workload(env: NeonEnv, tenants_pgs):
workers = []
for _, pg in tenants_pgs:
for tenant, pg in tenants_pgs:
worker = tenant_workload(env, pg)
workers.append(asyncio.create_task(worker))
@@ -46,18 +46,23 @@ async def all_tenants_workload(env: NeonEnv, tenants_pgs):
await asyncio.gather(*workers)
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storatge_kind,
test_name='test_tenants_many',
)
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3'])
def test_tenants_many(neon_env_builder: NeonEnvBuilder, storage_type: str):
if storage_type == 'local_fs':
neon_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
neon_env_builder.enable_local_fs_remote_storage()
env = neon_env_builder.init_start()
tenants_pgs: List[Tuple[UUID, Postgres]] = []
for _ in range(1, 5):
for i in range(1, 5):
# Use a tiny checkpoint distance, to create a lot of layers quickly
tenant, _ = env.neon_cli.create_tenant(
conf={

View File

@@ -12,8 +12,9 @@ import uuid
from contextlib import closing
from dataclasses import dataclass, field
from multiprocessing import Process, Value
from pathlib import Path
from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageKind, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, available_remote_storages, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload
from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload
from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex, query_scalar
from fixtures.log_helper import log
from typing import List, Optional, Any
@@ -350,7 +351,7 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end):
if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end):
break
elapsed = time.time() - started_at
if elapsed > 30:
if elapsed > 20:
raise RuntimeError(
f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded")
time.sleep(0.5)
@@ -376,15 +377,15 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size):
time.sleep(0.5)
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs'])
def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storatge_kind,
test_name='test_safekeepers_wal_backup',
)
if storage_type == 'local_fs':
neon_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
neon_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
env = neon_env_builder.init_start()
@@ -424,15 +425,15 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Remo
wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000')
@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages())
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind):
@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs'])
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storatge_kind,
test_name='test_s3_wal_replay',
)
if storage_type == 'local_fs':
neon_env_builder.enable_local_fs_remote_storage()
elif storage_type == 'mock_s3':
neon_env_builder.enable_s3_mock_remote_storage('test_s3_wal_replay')
else:
raise RuntimeError(f'Unknown storage type: {storage_type}')
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
env = neon_env_builder.init_start()

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from dataclasses import field
from contextlib import contextmanager
from enum import Flag, auto
import enum
import textwrap
from cached_property import cached_property
import abc
@@ -222,7 +221,7 @@ def can_bind(host: str, port: int) -> bool:
# moment. If that changes, we should use start using SO_REUSEADDR here
# too, to allow reusing ports more quickly.
# See https://github.com/neondatabase/neon/issues/801
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
#sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
try:
sock.bind((host, port))
@@ -231,8 +230,6 @@ def can_bind(host: str, port: int) -> bool:
except socket.error:
log.info(f"Port {port} is in use, skipping")
return False
finally:
sock.close()
class PortDistributor:
@@ -265,11 +262,6 @@ def default_broker(request: Any, port_distributor: PortDistributor):
broker.stop()
@pytest.fixture(scope='session')
def run_id():
yield uuid.uuid4()
@pytest.fixture(scope='session')
def mock_s3_server(port_distributor: PortDistributor):
mock_s3_server = MockS3Server(port_distributor.get_port())
@@ -299,9 +291,7 @@ class PgProtocol:
# change it by calling "SET statement_timeout" after
# connecting.
options = result.get('options', '')
if "statement_timeout" not in options:
options = f'-cstatement_timeout=120s {options}'
result['options'] = options
result['options'] = f'-cstatement_timeout=120s {options}'
return result
# autocommit=True here by default because that's what we need most of the time
@@ -448,46 +438,26 @@ class MockS3Server:
def secret_key(self) -> str:
return 'test'
def access_env_vars(self) -> Dict[Any, Any]:
return {
'AWS_ACCESS_KEY_ID': self.access_key(),
'AWS_SECRET_ACCESS_KEY': self.secret_key(),
}
def kill(self):
self.subprocess.kill()
@enum.unique
class RemoteStorageKind(enum.Enum):
LOCAL_FS = "local_fs"
MOCK_S3 = "mock_s3"
REAL_S3 = "real_s3"
def available_remote_storages() -> List[RemoteStorageKind]:
remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
remote_storages.append(RemoteStorageKind.REAL_S3)
log.info("Enabling real s3 storage for tests")
else:
log.info("Using mock implementations to test remote storage")
return remote_storages
@dataclass
class LocalFsStorage:
root: Path
local_path: Path
@dataclass
class S3Storage:
bucket_name: str
bucket_region: str
access_key: str
secret_key: str
endpoint: Optional[str] = None
prefix_in_bucket: Optional[str] = None
def access_env_vars(self) -> Dict[str, str]:
return {
'AWS_ACCESS_KEY_ID': self.access_key,
'AWS_SECRET_ACCESS_KEY': self.secret_key,
}
endpoint: Optional[str]
RemoteStorage = Union[LocalFsStorage, S3Storage]
@@ -496,20 +466,16 @@ RemoteStorage = Union[LocalFsStorage, S3Storage]
# serialize as toml inline table
def remote_storage_to_toml_inline_table(remote_storage):
if isinstance(remote_storage, LocalFsStorage):
remote_storage_config = f"local_path='{remote_storage.root}'"
res = f"local_path='{remote_storage.local_path}'"
elif isinstance(remote_storage, S3Storage):
remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\
bucket_region='{remote_storage.bucket_region}'"
if remote_storage.prefix_in_bucket is not None:
remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'"
res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'"
if remote_storage.endpoint is not None:
remote_storage_config += f",endpoint='{remote_storage.endpoint}'"
res += f", endpoint='{remote_storage.endpoint}'"
else:
raise Exception(f'Unknown storage configuration {remote_storage}')
else:
raise Exception("invalid remote storage type")
return f"{{{remote_storage_config}}}"
return f"{{{res}}}"
class RemoteStorageUsers(Flag):
@@ -527,31 +493,28 @@ class NeonEnvBuilder:
cleaned up after the test has finished.
"""
def __init__(
self,
repo_dir: Path,
port_distributor: PortDistributor,
broker: Etcd,
run_id: uuid.UUID,
mock_s3_server: MockS3Server,
remote_storage: Optional[RemoteStorage] = None,
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
pageserver_config_override: Optional[str] = None,
num_safekeepers: int = 1,
# Use non-standard SK ids to check for various parsing bugs
safekeepers_id_start: int = 0,
# fsync is disabled by default to make the tests go faster
safekeepers_enable_fsync: bool = False,
auth_enabled: bool = False,
rust_log_override: Optional[str] = None,
default_branch_name=DEFAULT_BRANCH_NAME,
):
self,
repo_dir: Path,
port_distributor: PortDistributor,
broker: Etcd,
mock_s3_server: MockS3Server,
remote_storage: Optional[RemoteStorage] = None,
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
pageserver_config_override: Optional[str] = None,
num_safekeepers: int = 1,
# Use non-standard SK ids to check for various parsing bugs
safekeepers_id_start: int = 0,
# fsync is disabled by default to make the tests go faster
safekeepers_enable_fsync: bool = False,
auth_enabled: bool = False,
rust_log_override: Optional[str] = None,
default_branch_name=DEFAULT_BRANCH_NAME):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
self.port_distributor = port_distributor
self.remote_storage = remote_storage
self.remote_storage_users = remote_storage_users
self.broker = broker
self.run_id = run_id
self.mock_s3_server = mock_s3_server
self.pageserver_config_override = pageserver_config_override
self.num_safekeepers = num_safekeepers
@@ -560,8 +523,6 @@ class NeonEnvBuilder:
self.auth_enabled = auth_enabled
self.default_branch_name = default_branch_name
self.env: Optional[NeonEnv] = None
self.remote_storage_prefix: Optional[str] = None
self.keep_remote_storage_contents: bool = True
def init(self) -> NeonEnv:
# Cannot create more than one environment from one builder
@@ -577,143 +538,41 @@ class NeonEnvBuilder:
self.start()
return env
def enable_remote_storage(
self,
remote_storage_kind: RemoteStorageKind,
test_name: str,
force_enable: bool = True,
):
if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
self.enable_local_fs_remote_storage(force_enable=force_enable)
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
else:
raise RuntimeError(f'Unknown storage type: {remote_storage_kind}')
"""
Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
"""
def enable_local_fs_remote_storage(self, force_enable=True):
"""
Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
"""
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
self.remote_storage = LocalFsStorage(Path(self.repo_dir / 'local_fs_remote_storage'))
def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable=True):
"""
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
Starts up the mock server, if that does not run yet.
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
"""
"""
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
Starts up the mock server, if that does not run yet.
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
"""
def enable_s3_mock_remote_storage(self, bucket_name: str, force_enable=True):
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
mock_endpoint = self.mock_s3_server.endpoint()
mock_region = self.mock_s3_server.region()
self.remote_storage_client = boto3.client(
boto3.client(
's3',
endpoint_url=mock_endpoint,
region_name=mock_region,
aws_access_key_id=self.mock_s3_server.access_key(),
aws_secret_access_key=self.mock_s3_server.secret_key(),
)
self.remote_storage_client.create_bucket(Bucket=bucket_name)
self.remote_storage = S3Storage(
bucket_name=bucket_name,
endpoint=mock_endpoint,
bucket_region=mock_region,
access_key=self.mock_s3_server.access_key(),
secret_key=self.mock_s3_server.secret_key(),
)
def enable_real_s3_remote_storage(self, test_name: str, force_enable=True):
"""
Sets up configuration to use real s3 endpoint without mock server
"""
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
access_key = os.getenv("AWS_ACCESS_KEY_ID")
assert access_key, "no aws access key provided"
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
assert secret_key, "no aws access key provided"
# session token is needed for local runs with sso auth
session_token = os.getenv("AWS_SESSION_TOKEN")
bucket_name = os.getenv("REMOTE_STORAGE_S3_BUCKET")
assert bucket_name, "no remote storage bucket name provided"
region = os.getenv("REMOTE_STORAGE_S3_REGION")
assert region, "no remote storage region provided"
# do not leave data in real s3
self.keep_remote_storage_contents = False
# construct a prefix inside bucket for the particular test case and test run
self.remote_storage_prefix = f'{self.run_id}/{test_name}'
self.remote_storage_client = boto3.client(
's3',
region_name=region,
aws_access_key_id=access_key,
aws_secret_access_key=secret_key,
aws_session_token=session_token,
)
).create_bucket(Bucket=bucket_name)
self.remote_storage = S3Storage(bucket_name=bucket_name,
bucket_region=region,
access_key=access_key,
secret_key=secret_key,
prefix_in_bucket=self.remote_storage_prefix)
def cleanup_remote_storage(self):
# here wee check for true remote storage, no the local one
# local cleanup is not needed after test because in ci all env will be destroyed anyway
if self.remote_storage_prefix is None:
log.info("no remote storage was set up, skipping cleanup")
return
if self.keep_remote_storage_contents:
log.info("keep_remote_storage_contents skipping remote storage cleanup")
return
log.info("removing data from test s3 bucket %s by prefix %s",
self.remote_storage.bucket_name,
self.remote_storage_prefix)
paginator = self.remote_storage_client.get_paginator('list_objects_v2')
pages = paginator.paginate(
Bucket=self.remote_storage.bucket_name,
Prefix=self.remote_storage_prefix,
)
objects_to_delete = {'Objects': []}
cnt = 0
for item in pages.search('Contents'):
# weirdly when nothing is found it returns [None]
if item is None:
break
objects_to_delete['Objects'].append({'Key': item['Key']})
# flush once aws limit reached
if len(objects_to_delete['Objects']) >= 1000:
self.remote_storage_client.delete_objects(
Bucket=self.remote_storage.bucket_name,
Delete=objects_to_delete,
)
objects_to_delete = dict(Objects=[])
cnt += 1
# flush rest
if len(objects_to_delete['Objects']):
self.remote_storage_client.delete_objects(Bucket=self.remote_storage.bucket_name,
Delete=objects_to_delete)
log.info("deleted %s objects from remote storage", cnt)
endpoint=mock_endpoint,
bucket_region=mock_region)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
# Stop all the nodes.
if self.env:
log.info('Cleaning up all storage and compute nodes')
@@ -722,8 +581,6 @@ class NeonEnvBuilder:
sk.stop(immediate=True)
self.env.pageserver.stop(immediate=True)
self.cleanup_remote_storage()
class NeonEnv:
"""
@@ -856,13 +713,10 @@ class NeonEnv:
@pytest.fixture(scope=shareable_scope)
def _shared_simple_env(
request: Any,
port_distributor: PortDistributor,
mock_s3_server: MockS3Server,
default_broker: Etcd,
run_id: uuid.UUID,
) -> Iterator[NeonEnv]:
def _shared_simple_env(request: Any,
port_distributor: PortDistributor,
mock_s3_server: MockS3Server,
default_broker: Etcd) -> Iterator[NeonEnv]:
"""
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
is set, this is shared by all tests using `neon_simple_env`.
@@ -876,13 +730,8 @@ def _shared_simple_env(
repo_dir = os.path.join(str(top_output_dir), "shared_repo")
shutil.rmtree(repo_dir, ignore_errors=True)
with NeonEnvBuilder(
repo_dir=Path(repo_dir),
port_distributor=port_distributor,
broker=default_broker,
mock_s3_server=mock_s3_server,
run_id=run_id,
) as builder:
with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker,
mock_s3_server) as builder:
env = builder.init_start()
# For convenience in tests, create a branch from the freshly-initialized cluster.
@@ -907,13 +756,10 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
@pytest.fixture(scope='function')
def neon_env_builder(
test_output_dir,
port_distributor: PortDistributor,
mock_s3_server: MockS3Server,
default_broker: Etcd,
run_id: uuid.UUID,
) -> Iterator[NeonEnvBuilder]:
def neon_env_builder(test_output_dir,
port_distributor: PortDistributor,
mock_s3_server: MockS3Server,
default_broker: Etcd) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
@@ -931,13 +777,8 @@ def neon_env_builder(
repo_dir = os.path.join(test_output_dir, "repo")
# Return the builder to the caller
with NeonEnvBuilder(
repo_dir=Path(repo_dir),
port_distributor=port_distributor,
mock_s3_server=mock_s3_server,
broker=default_broker,
run_id=run_id,
) as builder:
with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker,
mock_s3_server) as builder:
yield builder
@@ -1342,10 +1183,7 @@ class NeonCli(AbstractNeonCli):
remote_storage_users=self.env.remote_storage_users,
pageserver_config_override=self.env.pageserver.config_override)
s3_env_vars = None
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
s3_env_vars = self.env.remote_storage.access_env_vars()
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]':
@@ -1357,10 +1195,7 @@ class NeonCli(AbstractNeonCli):
return self.raw_cli(cmd)
def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]':
s3_env_vars = None
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
s3_env_vars = self.env.remote_storage.access_env_vars()
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars)
def safekeeper_stop(self,
@@ -1502,7 +1337,7 @@ class NeonPageserver(PgProtocol):
return self
def __exit__(self, exc_type, exc, tb):
self.stop(immediate=True)
self.stop(True)
def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient:
return NeonPageserverHttpClient(
@@ -1519,7 +1354,6 @@ def append_pageserver_param_overrides(
):
if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None:
remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage)
params_to_update.append(
f'--pageserver-config-override=remote_storage={remote_storage_toml_table}')
@@ -2026,8 +1860,8 @@ class Safekeeper:
started_at = time.time()
while True:
try:
with self.http_client() as http_cli:
http_cli.check_status()
http_cli = self.http_client()
http_cli.check_status()
except Exception as e:
elapsed = time.time() - started_at
if elapsed > 3:
@@ -2178,9 +2012,9 @@ class Etcd:
return f'http://127.0.0.1:{self.port}'
def check_status(self):
with requests.Session() as s:
s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry
s.get(f"{self.client_url()}/health").raise_for_status()
s = requests.Session()
s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry
s.get(f"{self.client_url()}/health").raise_for_status()
def try_start(self):
if self.handle is not None:

View File

@@ -146,7 +146,7 @@ def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, durat
record_thread.join()
def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_event: threading.Event):
def start_pgbench_intensive_initialization(env: PgCompare, scale: int):
with env.record_duration("run_duration"):
# Needs to increase the statement timeout (default: 120s) because the
# initialization step can be slow with a large scale.
@@ -155,11 +155,9 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_even
f'-s{scale}',
'-i',
'-Idtg',
env.pg.connstr(options='-cstatement_timeout=600s')
env.pg.connstr(options='-cstatement_timeout=300s')
])
done_event.set()
@pytest.mark.timeout(1000)
@pytest.mark.parametrize("scale", get_scales_matrix(1000))
@@ -168,17 +166,15 @@ def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int):
with env.pg.connect().cursor() as cur:
cur.execute("CREATE TABLE foo as select generate_series(1,100000)")
workload_done_event = threading.Event()
workload_thread = threading.Thread(target=start_pgbench_intensive_initialization,
args=(env, scale, workload_done_event))
args=(env, scale))
workload_thread.start()
record_thread = threading.Thread(target=record_lsn_write_lag,
args=(env, lambda: not workload_done_event.is_set()))
args=(env, lambda: workload_thread.is_alive()))
record_thread.start()
record_read_latency(env, lambda: not workload_done_event.is_set(), "SELECT count(*) from foo")
record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT count(*) from foo")
workload_thread.join()
record_thread.join()