Compare commits

..

9 Commits

Author SHA1 Message Date
Alek Westover
4af6a4d5e8 hopefully unbroken wip 2023-07-18 08:45:39 -04:00
Alek Westover
b27fa34c00 pass aws creds via cli 2023-07-17 08:31:12 -04:00
Alek Westover
ca22453627 Merge branch 'alek_targz' of github.com:neondatabase/neon into alek_targz_default_on 2023-07-17 07:59:30 -04:00
Alek Westover
0a00869615 this should pass github tests, but will fail with my local cloud repo 2023-07-14 13:55:14 -04:00
Alek Westover
87eead5220 Update rfc 2023-07-14 10:54:16 -04:00
Alek Westover
3cf83014d4 patch rfc 2023-07-14 09:21:46 -04:00
Alek Westover
353a735acb @arpad-m suggested using as_slice instead of creating a cursor 2023-07-14 07:58:05 -04:00
Alek Westover
107ebd3d21 turn remote extensions on by default 2023-07-13 17:05:52 -04:00
Alek Westover
89c93457f3 Add support for remote extensions. When requested, downloads a tar.gz file for the extension and then organizes the contained files. For instance, placing .so files in sharelib. 2023-07-13 16:15:18 -04:00
168 changed files with 2283 additions and 2529 deletions

View File

@@ -12,11 +12,6 @@ opt-level = 3
# Turn on a small amount of optimization in Development mode.
opt-level = 1
[build]
# This is only present for local builds, as it will be overridden
# by the RUSTDOCFLAGS env var in CI.
rustdocflags = ["-Arustdoc::private_intra_doc_links"]
[alias]
build_testing = ["build", "--features", "testing"]
neon = ["run", "--bin", "neon_local"]

View File

@@ -150,14 +150,6 @@ runs:
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
fi
# We use pytest-split plugin to run benchmarks in parallel on different CI runners
if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
fi
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then

View File

@@ -1,55 +0,0 @@
name: Handle `approved-for-ci-run` label
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
on:
pull_request:
types:
# Default types that triggers a workflow ([1]):
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
- opened
- synchronize
- reopened
# Types that we wand to handle in addition to keep labels tidy:
- closed
# Actual magic happens here:
- labeled
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
jobs:
remove-label:
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
# The PR should be reviewed and labelled manually again.
runs-on: [ ubuntu-latest ]
if: |
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
create-branch:
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
runs-on: [ ubuntu-latest ]
if: |
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
- uses: actions/checkout@v3
with:
ref: main
- run: gh pr checkout "${PR_NUMBER}"
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"

View File

@@ -5,7 +5,6 @@ on:
branches:
- main
- release
- ci-run/pr-*
pull_request:
defaults:
@@ -128,11 +127,6 @@ jobs:
- name: Run cargo clippy (release)
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() }}
@@ -396,11 +390,13 @@ jobs:
strategy:
fail-fast: false
matrix:
pytest_split_group: [ 1, 2, 3, 4 ]
build_type: [ release ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Pytest benchmarks
uses: ./.github/actions/run-python-test-set
@@ -409,11 +405,9 @@ jobs:
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ github.ref_name == 'main' }}
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -794,7 +788,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.13.1
VM_BUILDER_VERSION: v0.12.1
steps:
- name: Checkout
@@ -1007,8 +1001,6 @@ jobs:
done
- name: Upload postgres-extensions to S3
# TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json)
if: false
run: |
for BUCKET in $(echo ${S3_BUCKETS}); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}

View File

@@ -3,8 +3,7 @@ name: Check neon with extra platform builds
on:
push:
branches:
- main
- ci-run/pr-*
- main
pull_request:
defaults:

84
Cargo.lock generated
View File

@@ -907,12 +907,14 @@ dependencies = [
"opentelemetry",
"postgres",
"regex",
"remote_storage",
"reqwest",
"serde",
"serde_json",
"tar",
"tokio",
"tokio-postgres",
"toml_edit",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -980,6 +982,7 @@ dependencies = [
"tar",
"thiserror",
"toml",
"tracing",
"url",
"utils",
"workspace_hack",
@@ -2379,9 +2382,9 @@ dependencies = [
[[package]]
name = "opentelemetry"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
@@ -2389,9 +2392,9 @@ dependencies = [
[[package]]
name = "opentelemetry-http"
version = "0.8.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
dependencies = [
"async-trait",
"bytes",
@@ -2402,9 +2405,9 @@ dependencies = [
[[package]]
name = "opentelemetry-otlp"
version = "0.12.0"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
dependencies = [
"async-trait",
"futures",
@@ -2420,47 +2423,48 @@ dependencies = [
[[package]]
name = "opentelemetry-proto"
version = "0.2.0"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
dependencies = [
"futures",
"futures-util",
"opentelemetry",
"prost",
"tonic 0.8.3",
"tonic-build 0.8.4",
]
[[package]]
name = "opentelemetry-semantic-conventions"
version = "0.11.0"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
dependencies = [
"opentelemetry",
]
[[package]]
name = "opentelemetry_api"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
dependencies = [
"fnv",
"futures-channel",
"futures-util",
"indexmap",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror",
"urlencoding",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
dependencies = [
"async-trait",
"crossbeam-channel",
@@ -2936,9 +2940,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
[[package]]
name = "proc-macro2"
version = "1.0.64"
version = "1.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
dependencies = [
"unicode-ident",
]
@@ -3327,9 +3331,9 @@ dependencies = [
[[package]]
name = "reqwest-tracing"
version = "0.4.5"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
dependencies = [
"anyhow",
"async-trait",
@@ -3854,8 +3858,7 @@ dependencies = [
[[package]]
name = "sharded-slab"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
dependencies = [
"lazy_static",
]
@@ -3998,7 +4001,7 @@ dependencies = [
"tokio",
"tokio-stream",
"tonic 0.9.2",
"tonic-build",
"tonic-build 0.9.2",
"tracing",
"utils",
"workspace_hack",
@@ -4099,7 +4102,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
dependencies = [
"filetime",
"libc",
"xattr 0.2.3",
"xattr",
]
[[package]]
@@ -4380,17 +4383,16 @@ dependencies = [
[[package]]
name = "tokio-tar"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
version = "0.3.0"
source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
dependencies = [
"filetime",
"futures-core",
"libc",
"redox_syscall 0.3.5",
"redox_syscall 0.2.16",
"tokio",
"tokio-stream",
"xattr 1.0.0",
"xattr",
]
[[package]]
@@ -4517,6 +4519,19 @@ dependencies = [
"tracing",
]
[[package]]
name = "tonic-build"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
dependencies = [
"prettyplease 0.1.25",
"proc-macro2",
"prost-build",
"quote",
"syn 1.0.109",
]
[[package]]
name = "tonic-build"
version = "0.9.2"
@@ -4640,9 +4655,9 @@ dependencies = [
[[package]]
name = "tracing-opentelemetry"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
dependencies = [
"once_cell",
"opentelemetry",
@@ -5364,15 +5379,6 @@ dependencies = [
"libc",
]
[[package]]
name = "xattr"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
dependencies = [
"libc",
]
[[package]]
name = "xmlparser"
version = "0.13.5"

View File

@@ -84,9 +84,9 @@ notify = "5.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.19.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0"
opentelemetry = "0.18.0"
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.10.0"
parking_lot = "0.12"
pbkdf2 = "0.12.1"
pin-project-lite = "0.2"
@@ -95,7 +95,7 @@ prost = "0.11"
rand = "0.8"
regex = "1.4"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
@@ -124,14 +124,13 @@ tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.9.0"
tokio-rustls = "0.23"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7", features = ["io"] }
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.19.0"
tracing-opentelemetry = "0.18.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
@@ -149,6 +148,7 @@ postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -185,6 +185,11 @@ tonic-build = "0.9"
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
# Changes the MAX_THREADS limit from 4096 to 32768.
# This is a temporary workaround for using tracing from many threads in safekeepers code,
# until async safekeepers patch is merged to the main.
sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
################# Binary contents sections
[profile.release]

View File

@@ -535,10 +535,10 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
# There is no release tag yet
RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \

View File

@@ -32,3 +32,5 @@ url.workspace = true
compute_api.workspace = true
utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }

View File

@@ -5,6 +5,8 @@
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
//! - Every start is a fresh start, so the data directory is removed and
//! initialized again on each run.
//! - If remote_extension_config is provided, it will be used to fetch extensions list
//! and download `shared_preload_libraries` from the remote storage.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get commit LSN.
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -27,7 +29,8 @@
//! compute_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres
//! -b /usr/local/bin/postgres \
//! -r {"bucket": "my-bucket", "region": "eu-central-1", "endpoint": "http:://localhost:9000"} \
//! ```
//!
use std::collections::HashMap;
@@ -35,7 +38,7 @@ use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{mpsc, Arc, Condvar, Mutex};
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
@@ -48,6 +51,8 @@ use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::launch_download_extensions;
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
@@ -55,15 +60,42 @@ use compute_tools::params::*;
use compute_tools::spec::*;
const BUILD_TAG_DEFAULT: &str = "local";
const DEFAULT_REMOTE_EXT_CONFIG: &str = r#"{"bucket": "neon-dev-extensions", "region": "eu-central-1", "endpoint": null, "prefix": "5555"}"#;
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
info!("build_tag: {build_tag}");
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let remote_ext_config = matches
.get_one::<String>("remote-ext-config")
.map(|x| x.to_string());
// let remote_ext_config =
// Some(remote_ext_config.unwrap_or(DEFAULT_REMOTE_EXT_CONFIG.to_string()));
let ext_remote_storage = remote_ext_config.map(|x| {
init_remote_storage(&x, build_tag)
.expect("cannot initialize remote extension storage from config")
});
// creds used to connect to remote extensions bucket
// let aws_creds = matches.get_one::<String>("awscreds");
// if let Some(aws_creds) = aws_creds {
// // not sure if this is a bad idea?
// let aws_creds_dict: serde_json::Value = serde_json::from_str(aws_creds)?;
// std::env::set_var(
// "AWS_ACCESS_KEY_ID",
// aws_creds_dict["ID"].as_str().expect("config parse error"),
// );
// std::env::set_var(
// "AWS_SECRET_ACCESS_KEY",
// aws_creds_dict["key"].as_str().expect("config parse error"),
// );
// }
let http_port = *matches
.get_one::<u16>("http-port")
@@ -128,9 +160,6 @@ fn main() -> Result<()> {
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let spec;
let mut live_config_allowed = false;
match spec_json {
@@ -168,6 +197,7 @@ fn main() -> Result<()> {
let mut new_state = ComputeState::new();
let spec_set;
if let Some(spec) = spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
new_state.pspec = Some(pspec);
@@ -179,9 +209,12 @@ fn main() -> Result<()> {
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
pgversion: get_pg_version(pgbin),
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage,
available_extensions: OnceLock::new(),
};
let compute = Arc::new(compute_node);
@@ -190,6 +223,8 @@ fn main() -> Result<()> {
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
@@ -222,10 +257,18 @@ fn main() -> Result<()> {
compute.state_changed.notify_all();
drop(state);
// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
let _configurator_handle =
launch_configurator(&compute).expect("cannot launch configurator thread");
let _download_extensions_handle =
launch_download_extensions(&compute).expect("cannot launch download extensions thread");
// Start Postgres
let mut delay_exit = false;
let mut exit_code = None;
let pg = match compute.start_compute() {
let pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:?}", err);
@@ -238,14 +281,6 @@ fn main() -> Result<()> {
}
};
// Launch remaining service threads
//
// NOTE we do this after starting postgres so that these two extra threads
// don't blow the cpu budget and throttle the startup process.
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
let _configurator_handle =
launch_configurator(&compute).expect("cannot launch configurator thread");
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
if let Some(mut pg) = pg {
@@ -362,6 +397,18 @@ fn cli() -> clap::Command {
.long("control-plane-uri")
.value_name("CONTROL_PLANE_API_BASE_URI"),
)
.arg(
Arg::new("remote-ext-config")
.short('r')
.long("remote-ext-config")
.value_name("REMOTE_EXT_CONFIG"),
)
.arg(
Arg::new("awscreds")
.short('k')
.long("awscreds")
.value_name("AWS_CREDENTIALS"),
)
}
#[test]

View File

@@ -1,14 +1,17 @@
use std::collections::HashSet;
use std::fs;
use std::io::BufRead;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::{Condvar, Mutex};
use std::sync::{Condvar, Mutex, OnceLock};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use futures::future::join_all;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{info, instrument, warn};
use utils::id::{TenantId, TimelineId};
@@ -18,9 +21,11 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use crate::config;
use remote_storage::GenericRemoteStorage;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::{config, extension_server};
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
@@ -28,6 +33,7 @@ pub struct ComputeNode {
pub connstr: url::Url,
pub pgdata: String,
pub pgbin: String,
pub pgversion: String,
/// We should only allow live re- / configuration of the compute node if
/// it uses 'pull model', i.e. it can go to control-plane and fetch
/// the latest configuration. Otherwise, there could be a case:
@@ -47,6 +53,10 @@ pub struct ComputeNode {
pub state: Mutex<ComputeState>,
/// `Condvar` to allow notifying waiters about state changes.
pub state_changed: Condvar,
/// the S3 bucket that we search for extensions in
pub ext_remote_storage: Option<GenericRemoteStorage>,
// cached lists of available extensions and libraries
pub available_extensions: OnceLock<HashSet<String>>,
}
#[derive(Clone, Debug)]
@@ -357,14 +367,22 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip_all)]
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
pub fn prepare_pgdata(
&self,
compute_state: &ComputeState,
extension_server_port: u16,
) -> Result<()> {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
let pgdata_path = Path::new(&self.pgdata);
// Remove/create an empty pgdata directory and put configuration there.
self.create_pgdata()?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
config::write_postgres_conf(
&pgdata_path.join("postgresql.conf"),
&pspec.spec,
Some(extension_server_port),
)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
// is already connected it will be kicked out, so a secondary (standby)
@@ -506,7 +524,7 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.pgdata);
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
self.pg_reload_conf(&mut client)?;
@@ -536,7 +554,7 @@ impl ComputeNode {
}
#[instrument(skip_all)]
pub fn start_compute(&self) -> Result<std::process::Child> {
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
@@ -547,7 +565,26 @@ impl ComputeNode {
pspec.timeline_id,
);
self.prepare_pgdata(&compute_state)?;
// This part is sync, because we need to download
// remote shared_preload_libraries before postgres start (if any)
{
let library_load_start_time = Utc::now();
self.prepare_preload_libraries(&compute_state)?;
let library_load_time = Utc::now()
.signed_duration_since(library_load_start_time)
.to_std()
.unwrap()
.as_millis() as u64;
let mut state = self.state.lock().unwrap();
state.metrics.load_libraries_ms = library_load_time;
info!(
"Loading shared_preload_libraries took {:?}ms",
library_load_time
);
}
self.prepare_pgdata(&compute_state, extension_server_port)?;
let start_time = Utc::now();
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -695,4 +732,92 @@ LIMIT 100",
"{{\"pg_stat_statements\": []}}".to_string()
}
}
// If remote extension storage is configured,
// download extension control files
#[tokio::main]
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
let custom_ext_prefixes = spec.custom_extensions.clone().unwrap_or(Vec::new());
info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
let available_extensions = extension_server::get_available_extensions(
ext_remote_storage,
&self.pgbin,
&self.pgversion,
&custom_ext_prefixes,
)
.await?;
self.available_extensions
.set(available_extensions)
.expect("available_extensions.set error");
}
Ok(())
}
pub async fn download_extension(&self, ext_name: &str) -> Result<()> {
match &self.ext_remote_storage {
None => anyhow::bail!("No remote extension storage"),
Some(remote_storage) => {
extension_server::download_extension(
ext_name,
remote_storage,
&self.pgbin,
&self.pgversion,
)
.await
}
}
}
#[tokio::main]
pub async fn prepare_preload_libraries(&self, compute_state: &ComputeState) -> Result<()> {
if self.ext_remote_storage.is_none() {
return Ok(());
}
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
info!("parse shared_preload_libraries from spec.cluster.settings");
let mut libs_vec = Vec::new();
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
libs_vec = libs
.split(&[',', '\'', ' '])
.filter(|s| *s != "neon" && !s.is_empty())
.map(str::to_string)
.collect();
}
info!("parse shared_preload_libraries from provided postgresql.conf");
// that is used in neon_local and python tests
if let Some(conf) = &spec.cluster.postgresql_conf {
let conf_lines = conf.split('\n').collect::<Vec<&str>>();
let mut shared_preload_libraries_line = "";
for line in conf_lines {
if line.starts_with("shared_preload_libraries") {
shared_preload_libraries_line = line;
}
}
let mut preload_libs_vec = Vec::new();
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
preload_libs_vec = libs
.split(&[',', '\'', ' '])
.filter(|s| *s != "neon" && !s.is_empty())
.map(str::to_string)
.collect();
}
libs_vec.extend(preload_libs_vec);
}
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
let mut download_tasks = Vec::new();
for library in &libs_vec {
download_tasks.push(self.download_extension(library));
}
let results = join_all(download_tasks).await;
for result in results {
result?; // propogate any errors
}
Ok(())
}
}

View File

@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
}
/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
pub fn write_postgres_conf(
path: &Path,
spec: &ComputeSpec,
extension_server_port: Option<u16>,
) -> Result<()> {
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
@@ -87,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
writeln!(file, "# Managed by compute_ctl: end")?;
}
if let Some(port) = extension_server_port {
writeln!(file, "neon.extension_server_port={}", port)?;
}
Ok(())
}

View File

@@ -42,13 +42,15 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
}
}
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
pub fn launch_configurator(
compute: &Arc<ComputeNode>,
) -> Result<thread::JoinHandle<()>, std::io::Error> {
let compute = Arc::clone(compute);
Ok(thread::Builder::new()
thread::Builder::new()
.name("compute-configurator".into())
.spawn(move || {
configurator_main_loop(&compute);
info!("configurator thread is exited");
})?)
})
}

View File

@@ -0,0 +1,237 @@
// Download extension files from the extension store
// and put them in the right place in the postgres directory
/*
The layout of the S3 bucket is as follows:
v14/ext_index.json
-- this contains information necessary to create control files
v14/extensions/test_ext1.tar.gz
-- this contains the library files and sql files necessary to create this extension
v14/extensions/custom_ext1.tar.gz
The difference between a private and public extensions is determined by who can
load the extension this is specified in ext_index.json
Speicially, ext_index.json has a list of public extensions, and a list of
extensions enabled for specific tenant-ids.
*/
use crate::compute::ComputeNode;
use anyhow::Context;
use anyhow::{self, Result};
use flate2::read::GzDecoder;
use remote_storage::*;
use serde_json::{self, Value};
use std::collections::HashSet;
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::Path;
use std::str;
use std::sync::Arc;
use std::thread;
use tar::Archive;
use tokio::io::AsyncReadExt;
use tracing::info;
fn get_pg_config(argument: &str, pgbin: &str) -> String {
// gives the result of `pg_config [argument]`
// where argument is a flag like `--version` or `--sharedir`
let pgconfig = pgbin
.strip_suffix("postgres")
.expect("bad pgbin")
.to_owned()
+ "/pg_config";
let config_output = std::process::Command::new(pgconfig)
.arg(argument)
.output()
.expect("pg_config error");
std::str::from_utf8(&config_output.stdout)
.expect("pg_config error")
.trim()
.to_string()
}
pub fn get_pg_version(pgbin: &str) -> String {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15
let human_version = get_pg_config("--version", pgbin);
if human_version.contains("15") {
return "v15".to_string();
} else if human_version.contains("14") {
return "v14".to_string();
}
panic!("Unsuported postgres version {human_version}");
}
// download extension control files
// if custom_ext_prefixes is provided - search also in custom extension paths
pub async fn get_available_extensions(
remote_storage: &GenericRemoteStorage,
pgbin: &str,
pg_version: &str,
custom_ext_prefixes: &[String],
) -> Result<HashSet<String>> {
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
let index_path = pg_version.to_owned() + "/ext_index.json";
let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
info!("download ext_index.json: {:?}", &index_path);
// TODO: potential optimization: cache ext_index.json
let mut download = remote_storage.download(&index_path).await?;
let mut write_data_buffer = Vec::new();
download
.download_stream
.read_to_end(&mut write_data_buffer)
.await?;
let ext_index_str = match str::from_utf8(&write_data_buffer) {
Ok(v) => v,
Err(e) => panic!("Invalid UTF-8 sequence: {}", e),
};
let ext_index_full: Value = serde_json::from_str(ext_index_str)?;
let ext_index_full = ext_index_full.as_object().context("error parsing json")?;
let control_data = ext_index_full["control_data"]
.as_object()
.context("json parse error")?;
let enabled_extensions = ext_index_full["enabled_extensions"]
.as_object()
.context("json parse error")?;
info!("{:?}", control_data.clone());
info!("{:?}", enabled_extensions.clone());
let mut prefixes = vec!["public".to_string()];
prefixes.extend(custom_ext_prefixes.to_owned());
info!("{:?}", &prefixes);
let mut all_extensions = HashSet::new();
for prefix in prefixes {
let prefix_extensions = match enabled_extensions.get(&prefix) {
Some(Value::Array(ext_name)) => ext_name,
_ => {
info!("prefix {} has no extensions", prefix);
continue;
}
};
info!("{:?}", prefix_extensions);
for ext_name in prefix_extensions {
all_extensions.insert(ext_name.as_str().context("json parse error")?.to_string());
}
}
for prefix in &all_extensions {
let control_contents = control_data[prefix].as_str().context("json parse error")?;
let control_path = local_sharedir.join(prefix.to_owned() + ".control");
info!("WRITING FILE {:?}{:?}", control_path, control_contents);
std::fs::write(control_path, control_contents)?;
}
Ok(all_extensions.into_iter().collect())
}
// download all sqlfiles (and possibly data files) for a given extension name
pub async fn download_extension(
ext_name: &str,
remote_storage: &GenericRemoteStorage,
pgbin: &str,
pg_version: &str,
) -> Result<()> {
// TODO: potential optimization: only download the extension if it doesn't exist
// problem: how would we tell if it exists?
let ext_name = ext_name.replace(".so", "");
let ext_name_targz = ext_name.to_owned() + ".tar.gz";
if Path::new(&ext_name_targz).exists() {
info!("extension {:?} already exists", ext_name_targz);
return Ok(());
}
let ext_path = RemotePath::new(
&Path::new(pg_version)
.join("extensions")
.join(ext_name_targz.clone()),
)?;
info!(
"Start downloading extension {:?} from {:?}",
ext_name, ext_path
);
let mut download = remote_storage.download(&ext_path).await?;
let mut write_data_buffer = Vec::new();
download
.download_stream
.read_to_end(&mut write_data_buffer)
.await?;
let unzip_dest = pgbin.strip_suffix("/bin/postgres").expect("bad pgbin");
let tar = GzDecoder::new(write_data_buffer.as_slice());
let mut archive = Archive::new(tar);
archive.unpack(unzip_dest)?;
info!("Download + unzip {:?} completed successfully", &ext_path);
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
let zip_sharedir = format!("{unzip_dest}/extensions/{ext_name}/share/extension");
info!("mv {zip_sharedir:?}/* {local_sharedir:?}");
for file in std::fs::read_dir(zip_sharedir)? {
let old_file = file?.path();
let new_file =
Path::new(&local_sharedir).join(old_file.file_name().context("error parsing file")?);
std::fs::rename(old_file, new_file)?;
}
let local_libdir = Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql");
let zip_libdir = format!("{unzip_dest}/extensions/{ext_name}/lib");
info!("mv {zip_libdir:?}/* {local_libdir:?}");
for file in std::fs::read_dir(zip_libdir)? {
let old_file = file?.path();
let new_file =
Path::new(&local_libdir).join(old_file.file_name().context("error parsing file")?);
std::fs::rename(old_file, new_file)?;
}
Ok(())
}
// This function initializes the necessary structs to use remmote storage (should be fairly cheap)
pub fn init_remote_storage(
remote_ext_config: &str,
default_prefix: &str,
) -> anyhow::Result<GenericRemoteStorage> {
let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
let remote_ext_bucket = remote_ext_config["bucket"]
.as_str()
.context("config parse error")?;
let remote_ext_region = remote_ext_config["region"]
.as_str()
.context("config parse error")?;
let remote_ext_endpoint = remote_ext_config["endpoint"].as_str();
let remote_ext_prefix = remote_ext_config["prefix"]
.as_str()
.unwrap_or(default_prefix)
.to_string();
// TODO: potentially allow modification of other parameters
// however, default values should be fine for now
let config = S3Config {
bucket_name: remote_ext_bucket.to_string(),
bucket_region: remote_ext_region.to_string(),
prefix_in_bucket: Some(remote_ext_prefix),
endpoint: remote_ext_endpoint.map(|x| x.to_string()),
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
max_keys_per_list_response: None,
};
let config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
storage: RemoteStorageKind::AwsS3(config),
};
GenericRemoteStorage::from_config(&config)
}
pub fn launch_download_extensions(
compute: &Arc<ComputeNode>,
) -> Result<thread::JoinHandle<()>, std::io::Error> {
let compute = Arc::clone(compute);
thread::Builder::new()
.name("download-extensions".into())
.spawn(move || {
info!("start download_extension_files");
let compute_state = compute.state.lock().expect("error unlocking compute.state");
compute
.prepare_external_extensions(&compute_state)
.expect("error preparing extensions");
info!("download_extension_files done, exiting thread");
})
}

View File

@@ -121,6 +121,27 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
// download extension files from S3 on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
info!("req.uri {:?}", req.uri());
let filename = route.split('/').last().unwrap().to_string();
info!(
"serving /extension_server POST request, filename: {:?}",
&filename
);
match compute.download_extension(&filename).await {
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));

View File

@@ -139,6 +139,34 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/extension_server:
post:
tags:
- Extension
summary: Download extension from S3 to local folder.
description: ""
operationId: downloadExtension
responses:
200:
description: Extension downloaded
content:
text/plain:
schema:
type: string
description: Error text or 'OK' if download succeeded.
example: "OK"
400:
description: Request is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: Extension download request failed.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:

View File

@@ -9,6 +9,7 @@ pub mod http;
#[macro_use]
pub mod logger;
pub mod compute;
pub mod extension_server;
pub mod monitor;
pub mod params;
pub mod pg_helpers;

View File

@@ -105,10 +105,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
/// Launch a separate compute monitor thread and return its `JoinHandle`.
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>, std::io::Error> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
thread::Builder::new()
.name("compute-monitor".into())
.spawn(move || watch_compute_activity(&state))?)
.spawn(move || watch_compute_activity(&state))
}

View File

@@ -19,7 +19,7 @@ const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // mil
/// Escape a string for including it in a SQL literal. Wrapping the result
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
/// See https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47
/// for the original implementation.
pub fn escape_literal(s: &str) -> String {
let res = s.replace('\'', "''").replace('\\', "\\\\");

View File

@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
// File `postgresql.conf` is no longer included into `basebackup`, so just
// always write all config into it creating new file.
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
update_pg_hba(pgdata_path)?;

View File

@@ -32,3 +32,4 @@ utils.workspace = true
compute_api.workspace = true
workspace_hack.workspace = true
tracing.workspace = true

View File

@@ -10,7 +10,7 @@
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
//! The pid stored in the file is later used to stop the service.
//!
//! See the [`lock_file`](utils::lock_file) module for more info.
//! See [`lock_file`] module for more info.
use std::ffi::OsStr;
use std::io::Write;

View File

@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
// If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
_ => {}
}
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers)?;
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
} else {
let branch_name = sub_args
.get_one::<String>("branch-name")
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
pg_version,
mode,
)?;
ep.start(&auth_token, safekeepers)?;
ep.start(&auth_token, safekeepers, remote_ext_config)?;
}
}
"stop" => {
@@ -1003,6 +1005,12 @@ fn cli() -> Command {
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config")
.num_args(1)
.help("Configure the S3 bucket that we search for extensions in.")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1161,6 +1169,7 @@ fn cli() -> Command {
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
)
.subcommand(
Command::new("stop")

View File

@@ -2,9 +2,8 @@
//!
//! In the local test environment, the data for each safekeeper is stored in
//!
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! ```
//!
use anyhow::Context;
use std::path::PathBuf;

View File

@@ -2,9 +2,7 @@
//!
//! In the local test environment, the data for each endpoint is stored in
//!
//! ```text
//! .neon/endpoints/<endpoint id>
//! ```
//!
//! Some basic information about the endpoint, like the tenant and timeline IDs,
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
@@ -24,7 +22,7 @@
//!
//! Directory contents:
//!
//! ```text
//! ```ignore
//! .neon/endpoints/main/
//! compute.log - log output of `compute_ctl` and `postgres`
//! endpoint.json - serialized `EndpointConf` struct
@@ -289,7 +287,7 @@ impl Endpoint {
.env
.safekeepers
.iter()
.map(|sk| format!("localhost:{}", sk.get_compute_port()))
.map(|sk| format!("localhost:{}", sk.pg_port))
.collect::<Vec<String>>()
.join(",");
conf.append("neon.safekeepers", &safekeepers);
@@ -313,12 +311,12 @@ impl Endpoint {
// TODO: use future host field from safekeeper spec
// Pass the list of safekeepers to the replica so that it can connect to any of them,
// whichever is availiable.
// whichever is available.
let sk_ports = self
.env
.safekeepers
.iter()
.map(|x| x.get_compute_port().to_string())
.map(|x| x.pg_port.to_string())
.collect::<Vec<_>>()
.join(",");
let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
@@ -420,7 +418,12 @@ impl Endpoint {
Ok(())
}
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
pub fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
remote_ext_config: Option<&String>,
) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
}
@@ -463,7 +466,7 @@ impl Endpoint {
.iter()
.find(|node| node.id == sk_id)
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
}
}
@@ -488,6 +491,15 @@ impl Endpoint {
pageserver_connstring: Some(pageserver_connstring),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
// TODO FIXME: This is a hack to test custom extensions locally.
// In test_download_extensions, we assume that the custom extension
// prefix is the tenant ID. So we set it here.
//
// The proper way to implement this is to pass the custom extension
// in spec, but we don't have a way to do that yet in the python tests.
// NEW HACK: we enable the anon custom extension for everyone! this is of course just for testing
// how will we do it for real?
custom_extensions: Some(vec!["123454321".to_string(), self.tenant_id.to_string()]),
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -519,6 +531,11 @@ impl Endpoint {
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
if let Some(remote_ext_config) = remote_ext_config {
cmd.args(["--remote-ext-config", remote_ext_config]);
}
let child = cmd.spawn()?;
// Write down the pid so we can wait for it when we want to stop

View File

@@ -137,7 +137,6 @@ impl Default for PageServerConf {
pub struct SafekeeperConf {
pub id: NodeId,
pub pg_port: u16,
pub pg_tenant_only_port: Option<u16>,
pub http_port: u16,
pub sync: bool,
pub remote_storage: Option<String>,
@@ -150,7 +149,6 @@ impl Default for SafekeeperConf {
Self {
id: NodeId(0),
pg_port: 0,
pg_tenant_only_port: None,
http_port: 0,
sync: true,
remote_storage: None,
@@ -160,14 +158,6 @@ impl Default for SafekeeperConf {
}
}
impl SafekeeperConf {
/// Compute is served by port on which only tenant scoped tokens allowed, if
/// it is configured.
pub fn get_compute_port(&self) -> u16 {
self.pg_tenant_only_port.unwrap_or(self.pg_port)
}
}
impl LocalEnv {
pub fn pg_distrib_dir_raw(&self) -> PathBuf {
self.pg_distrib_dir.clone()

View File

@@ -2,9 +2,8 @@
//!
//! In the local test environment, the data for each safekeeper is stored in
//!
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! ```
//!
use std::io::Write;
use std::path::PathBuf;
use std::process::Child;
@@ -120,55 +119,45 @@ impl SafekeeperNode {
let availability_zone = format!("sk-{}", id_string);
let mut args = vec![
"-D".to_owned(),
datadir
.to_str()
.with_context(|| {
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?
.to_owned(),
"--id".to_owned(),
id_string,
"--listen-pg".to_owned(),
listen_pg,
"--listen-http".to_owned(),
listen_http,
"--availability-zone".to_owned(),
availability_zone,
"-D",
datadir.to_str().with_context(|| {
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
"--id",
&id_string,
"--listen-pg",
&listen_pg,
"--listen-http",
&listen_http,
"--availability-zone",
&availability_zone,
];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
}
if !self.conf.sync {
args.push("--no-sync".to_owned());
args.push("--no-sync");
}
let broker_endpoint = format!("{}", self.env.broker.client_url());
args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);
args.extend(["--broker-endpoint", &broker_endpoint]);
let mut backup_threads = String::new();
if let Some(threads) = self.conf.backup_threads {
backup_threads = threads.to_string();
args.extend(["--backup-threads".to_owned(), backup_threads]);
args.extend(["--backup-threads", &backup_threads]);
} else {
drop(backup_threads);
}
if let Some(ref remote_storage) = self.conf.remote_storage {
args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
args.extend(["--remote-storage", remote_storage]);
}
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
if self.conf.auth_enabled {
args.extend([
"--auth-validation-public-key-path".to_owned(),
key_path
.to_str()
.with_context(|| {
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?
.to_owned(),
"--auth-validation-public-key-path",
key_path.to_str().with_context(|| {
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?,
]);
}

View File

@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.
In async Rust, futures can be "cancelled" at any await point, by
dropping the Future. For example, `tokio::select!` returns as soon as
one of the Futures returns, and drops the others. `tokio::time::timeout`
is another example. In the Rust ecosystem, some functions are
one of the Futures returns, and drops the others. `tokio::timeout!` is
another example. In the Rust ecosystem, some functions are
cancellation-safe, meaning they can be safely dropped without
side-effects, while others are not. See documentation of
`tokio::select!` for examples.
@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
cancellation-safe, and must be polled to completion.
The downside of non-cancellation safe code is that you have to be very
careful when using `tokio::select!`, `tokio::time::timeout`, and other
such functions that can cause a Future to be dropped. They can only be
used with functions that are explicitly documented to be cancellation-safe,
careful when using `tokio::select!`, `tokio::timeout!`, and other such
functions that can cause a Future to be dropped. They can only be used
with functions that are explicitly documented to be cancellation-safe,
or you need to spawn a separate task to shield from the cancellation.
At the entry points to the code, we also take care to poll futures to

View File

@@ -0,0 +1,187 @@
# Supporting custom user Extensions (Dynamic Extension Loading)
Created 2023-05-03
## Motivation
There are many extensions in the PostgreSQL ecosystem, and not all extensions
are of a quality that we can confidently support them. Additionally, our
current extension inclusion mechanism has several problems because we build all
extensions into the primary Compute image: We build the extensions every time
we build the compute image regardless of whether we actually need to rebuild
the image, and the inclusion of these extensions in the image adds a hard
dependency on all supported extensions - thus increasing the image size, and
with it the time it takes to download that image - increasing first start
latency.
This RFC proposes a dynamic loading mechanism that solves most of these
problems.
## Summary
`compute_ctl` is made responsible for loading extensions on-demand into
the container's file system for dynamically loaded extensions, and will also
make sure that the extensions in `shared_preload_libraries` are downloaded
before the compute node starts.
## Components
compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
## Requirements
Compute nodes with no extra extensions should not be negatively impacted by
the existence of support for many extensions.
Installing an extension into PostgreSQL should be easy.
Non-preloaded extensions shouldn't impact startup latency.
Uninstalled extensions shouldn't impact query latency.
A small latency penalty for dynamically loaded extensions is acceptable in
the first seconds of compute startup, but not in steady-state operations.
## Proposed implementation
### On-demand, JIT-loading of extensions
Before postgres starts we download
- control files for all extensions available to that compute node;
- all `shared_preload_libraries`;
After postgres is running, `compute_ctl` listens for requests to load files.
When PostgreSQL requests a file, `compute_ctl` downloads it.
PostgreSQL requests files in the following cases:
- When loading a preload library set in `local_preload_libraries`
- When explicitly loading a library with `LOAD`
- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
#### Summary
Pros:
- Startup is only as slow as it takes to load all (shared_)preload_libraries
- Supports BYO Extension
Cons:
- O(sizeof(extensions)) IO requirement for loading all extensions.
### Alternative solutions
1. Allow users to add their extensions to the base image
Pros:
- Easy to deploy
Cons:
- Doesn't scale - first start size is dependent on image size;
- All extensions are shared across all users: It doesn't allow users to
bring their own restrictive-licensed extensions
2. Bring Your Own compute image
Pros:
- Still easy to deploy
- User can bring own patched version of PostgreSQL
Cons:
- First start latency is O(sizeof(extensions image))
- Warm instance pool for skipping pod schedule latency is not feasible with
O(n) custom images
- Support channels are difficult to manage
3. Download all user extensions in bulk on compute start
Pros:
- Easy to deploy
- No startup latency issues for "clean" users.
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- Downloading all extensions in advance takes a lot of time, thus startup
latency issues
4. Store user's extensions in persistent storage
Pros:
- Easy to deploy
- No startup latency issues
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- EC2 instances have only limited number of attachments shared between EBS
volumes, direct-attached NVMe drives, and ENIs.
- Compute instance migration isn't trivially solved for EBS mounts (e.g.
the device is unavailable whilst moving the mount between instances).
- EBS can only mount on one instance at a time (except the expensive IO2
device type).
5. Store user's extensions in network drive
Pros:
- Easy to deploy
- Few startup latency issues
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- We'd need networked drives, and a lot of them, which would store many
duplicate extensions.
- **UNCHECKED:** Compute instance migration may not work nicely with
networked IOs
### Idea extensions
The extension store does not have to be S3 directly, but could be a Node-local
caching service on top of S3. This would reduce the load on the network for
popular extensions.
## Extension Storage implementation
The layout of the S3 bucket is as follows:
```
v14/ext_index.json
-- this contains information necessary to create control files
v14/extensions/test_ext1.tar.gz
-- this contains the library files and sql files necessary to create this extension
v14/extensions/custom_ext1.tar.gz
```
The difference between private and public extensions is determined by who can
load the extension. This is specified in `ext_index.json`.
Speicially, `ext_index.json` has a list of public extensions, and a list of
extensions enabled for specific tenant-ids. Here is an example `ext_index.json`:
```
{
"enabled_extensions": {
"123454321": [
"anon"
],
"public": [
"embedding"
]
},
"control_data": {
"embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true",
"anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
}
}
```
### How to add new extension to the Extension Storage?
Simply upload build artifacts to the S3 bucket.
Implement a CI step for that. Splitting it from ompute-node-image build.
### How do we deal with extension versions and updates?
Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
This is needed to ensure that `/share` and `/lib` files are in sync.
For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
### Alternatives
For extensions written on trusted languages we can also adopt
`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
This will increase the amount supported extensions and decrease the amount of work required to support them.

View File

@@ -1,22 +0,0 @@
# Useful development tools
This readme contains some hints on how to set up some optional development tools.
## ccls
[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
to work well. There are different ways to do it but here's what works for me:
1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
2. Go to `vendor/postgres-v15`
3. Run `make clean && ./configure`
4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
Some additional tips for various IDEs:
### Emacs
To improve performance: `(setq lsp-lens-enable nil)`

View File

@@ -75,6 +75,7 @@ pub struct ComputeMetrics {
pub start_postgres_ms: u64,
pub config_ms: u64,
pub total_startup_ms: u64,
pub load_libraries_ms: u64,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.

View File

@@ -60,6 +60,9 @@ pub struct ComputeSpec {
/// If set, 'storage_auth_token' is used as the password to authenticate to
/// the pageserver and safekeepers.
pub storage_auth_token: Option<String>,
// list of prefixes to search for custom extensions in remote extension storage
pub custom_extensions: Option<Vec<String>>,
}
#[serde_as]

View File

@@ -1,4 +1,4 @@
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
use std::{future::Future, time::Instant};

View File

@@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
@@ -77,12 +76,7 @@ pub enum TenantState {
/// system is being shut down.
///
/// Transitions out of this state are possible through `set_broken()`.
Stopping {
// Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
// otherwise it will not be skipped during deserialization
#[serde(skip)]
progress: completion::Barrier,
},
Stopping,
/// The tenant is recognized by the pageserver, but can no longer be used for
/// any operations.
///
@@ -124,7 +118,7 @@ impl TenantState {
// Why is Stopping a Maybe case? Because, during pageserver shutdown,
// we set the Stopping state irrespective of whether the tenant
// has finished attaching or not.
Self::Stopping { .. } => Maybe,
Self::Stopping => Maybe,
}
}
@@ -417,16 +411,12 @@ pub struct LayerResidenceEvent {
pub reason: LayerResidenceEventReason,
}
/// The reason for recording a given [`LayerResidenceEvent`].
/// The reason for recording a given [`ResidenceEvent`].
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum LayerResidenceEventReason {
/// The layer map is being populated, e.g. during timeline load or attach.
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
/// We need to record such events because there is no persistent storage for the events.
///
// https://github.com/rust-lang/rust/issues/74481
/// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
/// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
LayerLoad,
/// We just created the layer (e.g., freeze_and_flush or compaction).
/// Such layers are always [`LayerResidenceStatus::Resident`].
@@ -934,13 +924,7 @@ mod tests {
"Activating",
),
(line!(), TenantState::Active, "Active"),
(
line!(),
TenantState::Stopping {
progress: utils::completion::Barrier::default(),
},
"Stopping",
),
(line!(), TenantState::Stopping, "Stopping"),
(
line!(),
TenantState::Broken {

View File

@@ -60,9 +60,8 @@ impl Ord for RelTag {
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
///
/// ```text
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
/// ```
///
impl fmt::Display for RelTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(forkname) = forknumber_to_name(self.forknum) {

View File

@@ -49,16 +49,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
}
}
///
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
///
/// Formats:
///
/// ```text
/// <oid>
/// <oid>_<fork name>
/// <oid>.<segment number>
/// <oid>_<fork name>.<segment number>
/// ```
///
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
///

View File

@@ -5,11 +5,11 @@
//! It is similar to what tokio_util::codec::Framed with appropriate codec
//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
//! separately without using split from futures::stream::StreamExt (which
//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
//! allocates box[1] in polling internally). tokio::io::split is used for splitting
//! instead. Plus we customize error messages more than a single type for all io
//! calls.
//!
//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
use bytes::{Buf, BytesMut};
use std::{
future::Future,
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
/// Split into owned read and write parts. Beware of potential issues with
/// using halves in different tasks on TLS stream:
/// <https://github.com/tokio-rs/tls/issues/40>
/// https://github.com/tokio-rs/tls/issues/40
pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
let (read_half, write_half) = tokio::io::split(self.stream);
let reader = FramedReader {

View File

@@ -934,15 +934,6 @@ impl<'a> BeMessage<'a> {
}
}
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
let mut terminated = [0; 6];
for (i, &elem) in code.iter().enumerate() {
terminated[i] = elem;
}
terminated
}
#[cfg(test)]
mod tests {
use super::*;
@@ -974,3 +965,12 @@ mod tests {
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
}
}
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
let mut terminated = [0; 6];
for (i, &elem) in code.iter().enumerate() {
terminated[i] = elem;
}
terminated
}

View File

@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
/// ~200 RPS for IAM services
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// No limits on the client side, which currenltly means 1000 for AWS S3.
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -50,12 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);
impl std::fmt::Display for RemotePath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.display())
}
}
impl RemotePath {
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
anyhow::ensure!(
@@ -190,6 +184,20 @@ pub enum GenericRemoteStorage {
}
impl GenericRemoteStorage {
// A function for listing all the files in a "directory"
// Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
match self {
Self::LocalFs(s) => s.list_files(folder).await,
Self::AwsS3(s) => s.list_files(folder).await,
Self::Unreliable(s) => s.list_files(folder).await,
}
}
// lists common *prefixes*, if any of files
// Example:
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
@@ -201,14 +209,6 @@ impl GenericRemoteStorage {
}
}
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
match self {
Self::LocalFs(s) => s.list_files(folder).await,
Self::AwsS3(s) => s.list_files(folder).await,
Self::Unreliable(s) => s.list_files(folder).await,
}
}
pub async fn upload(
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,

View File

@@ -151,7 +151,10 @@ impl RemoteStorage for LocalFs {
let mut files = vec![];
let mut directory_queue = vec![full_path.clone()];
while let Some(cur_folder) = directory_queue.pop() {
while !directory_queue.is_empty() {
let cur_folder = directory_queue
.pop()
.expect("queue cannot be empty: we just checked");
let mut entries = fs::read_dir(cur_folder.clone()).await?;
while let Some(entry) = entries.next_entry().await? {
let file_name: PathBuf = entry.file_name().into();

View File

@@ -349,10 +349,17 @@ impl RemoteStorage for S3Bucket {
/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let folder_name = folder
let mut folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
// remove leading "/" if one exists
if let Some(folder_name_slash) = folder_name.clone() {
if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
folder_name = Some(folder_name_slash[1..].to_string());
}
}
// AWS may need to break the response into several parts
let mut continuation_token = None;
let mut all_files = vec![];

View File

@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
// 2. D+C+a+b
// 3. D+A+B
/// `Segment` which has had its size calculated.
/// [`Segment`] which has had it's size calculated.
#[derive(Clone, Debug)]
struct SegmentSize {
method: SegmentMethod,

View File

@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
/// directly into HTTP servers. However, I couldn't find one for Hyper,
/// so I had to write our own. OpenTelemetry website has a registry of
/// instrumentation libraries at:
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
/// If a Hyper crate appears, consider switching to that.
pub async fn tracing_handler<F, R>(
req: Request<Body>,

View File

@@ -16,7 +16,7 @@ use crate::id::TenantId;
/// Algorithm to use. We require EdDSA.
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum Scope {
// Provides access to all data for a specific tenant (specified in `struct Claims` below)

View File

@@ -12,13 +12,6 @@ pub struct Completion(mpsc::Sender<()>);
#[derive(Clone)]
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
impl Default for Barrier {
fn default() -> Self {
let (_, rx) = channel();
rx
}
}
impl Barrier {
pub async fn wait(self) {
self.0.lock().await.recv().await;
@@ -31,15 +24,6 @@ impl Barrier {
}
}
impl PartialEq for Barrier {
fn eq(&self, other: &Self) -> bool {
// we don't use dyn so this is good
Arc::ptr_eq(&self.0, &other.0)
}
}
impl Eq for Barrier {}
/// Create new Guard and Barrier pair.
pub fn channel() -> (Completion, Barrier) {
let (tx, rx) = mpsc::channel::<()>(1);

View File

@@ -1,111 +0,0 @@
/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
///
/// It can be used with `anyhow::Error` as well.
///
/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
/// formatting.
///
/// ## Usage
///
/// ```rust
/// #[derive(Debug, thiserror::Error)]
/// enum MyCoolError {
/// #[error("should never happen")]
/// Bad(#[source] std::io::Error),
/// }
///
/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
///
/// # fn main() {
/// use utils::error::report_compact_sources;
///
/// if let Err(e) = failing_call() {
/// let e = report_compact_sources(&e);
/// assert_eq!(format!("{e}"), "should never happen: permission denied");
/// }
/// # }
/// ```
///
/// ## TODO
///
/// When we are able to describe return position impl trait in traits, this should of course be an
/// extension trait. Until then avoid boxing with this more ackward interface.
pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)?;
// why is E a generic parameter here? hope that rustc will see through a default
// Error::source implementation and leave the following out if there cannot be any
// sources:
Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
}
}
struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
impl<'a> Iterator for Sources<'a> {
type Item = &'a (dyn std::error::Error + 'static);
fn next(&mut self) -> Option<Self::Item> {
let rem = self.0;
let next = self.0.and_then(|x| x.source());
self.0 = next;
rem
}
}
AnyhowDisplayAlternateAlike(e)
}
#[cfg(test)]
mod tests {
use super::report_compact_sources;
#[test]
fn report_compact_sources_examples() {
use std::fmt::Write;
#[derive(Debug, thiserror::Error)]
enum EvictionError {
#[error("cannot evict a remote layer")]
CannotEvictRemoteLayer,
#[error("stat failed")]
StatFailed(#[source] std::io::Error),
#[error("layer was no longer part of LayerMap")]
LayerNotFound(#[source] anyhow::Error),
}
let examples = [
(
line!(),
EvictionError::CannotEvictRemoteLayer,
"cannot evict a remote layer",
),
(
line!(),
EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
"stat failed: permission denied",
),
(
line!(),
EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
"layer was no longer part of LayerMap: foobar",
),
];
let mut s = String::new();
for (line, example, expected) in examples {
s.clear();
write!(s, "{}", report_compact_sources(&example)).expect("string grows");
assert_eq!(s, expected, "example on line {line}");
}
}
}

View File

@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
.map_err(ApiError::BadRequest)
}
/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
request: &mut Request<Body>,
) -> Result<Option<T>, ApiError> {

View File

@@ -63,9 +63,6 @@ pub mod rate_limit;
/// Simple once-barrier and a guard which keeps barrier awaiting.
pub mod completion;
/// Reporting utilities
pub mod error;
mod failpoint_macro_helpers {
/// use with fail::cfg("$name", "return(2000)")
@@ -133,8 +130,8 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
///
/// #############################################################################################
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
/// The problem needs further investigation and regular `const` declaration instead of a macro.

View File

@@ -1,10 +1,9 @@
//! A module to create and read lock files.
//!
//! File locking is done using [`fcntl::flock`] exclusive locks.
//! The only consumer of this module is currently
//! [`pid_file`](crate::pid_file). See the module-level comment
//! there for potential pitfalls with lock files that are used
//! to store PIDs (pidfiles).
//! The only consumer of this module is currently [`pid_file`].
//! See the module-level comment there for potential pitfalls
//! with lock files that are used to store PIDs (pidfiles).
use std::{
fs,
@@ -82,7 +81,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
}
/// Returned by [`read_and_hold_lock_file`].
/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
/// Check out the [`pid_file`] module for what the variants mean
/// and potential caveats if the lock files that are used to store PIDs.
pub enum LockFileRead {
/// No file exists at the given path.

View File

@@ -112,7 +112,7 @@ pub fn init(
///
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
/// If the assumptions about the initialization order are not held, use
/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
/// lost.
#[must_use]
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {

View File

@@ -23,9 +23,9 @@ pub enum SeqWaitError {
/// Monotonically increasing value
///
/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
/// It is handy to store some other fields under the same mutex in SeqWait<S>
/// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
/// any type that can expose counter. `V` is the type of exposed counter.
/// any type that can expose counter. <V> is the type of exposed counter.
pub trait MonotonicCounter<V> {
/// Bump counter value and check that it goes forward
/// N.B.: new_val is an actual new value, not a difference.
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
/// [`wait_for`]: SeqWait::wait_for
/// [`advance`]: SeqWait::advance
///
/// `S` means Storage, `V` is type of counter that this storage exposes.
/// <S> means Storage, <V> is type of counter that this storage exposes.
///
pub struct SeqWait<S, V>
where

View File

@@ -1,15 +1,8 @@
//! Assert that the current [`tracing::Span`] has a given set of fields.
//!
//! Can only produce meaningful positive results when tracing has been configured as in example.
//! Absence of `tracing_error::ErrorLayer` is not detected yet.
//!
//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
//! is completly unconfigured.
//!
//! # Usage
//!
//! ```rust
//! # fn main() {
//! ```
//! use tracing_subscriber::prelude::*;
//! let registry = tracing_subscriber::registry()
//! .with(tracing_error::ErrorLayer::default());
@@ -27,18 +20,23 @@
//!
//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
//! if let Err(missing) = check_fields_present!([&extractor]) {
//! // if you copypaste this to a custom assert method, remember to add #[track_caller]
//! // to get the "user" code location for the panic.
//! panic!("Missing fields: {missing:?}");
//! match check_fields_present([&extractor]) {
//! Ok(()) => {},
//! Err(missing) => {
//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
//! }
//! }
//! # }
//! ```
//!
//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
//!
#[derive(Debug)]
use std::{
collections::HashSet,
fmt::{self},
hash::{Hash, Hasher},
};
pub enum ExtractionResult {
Present,
Absent,
@@ -73,101 +71,49 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
}
}
/// Checks that the given extractors are satisfied with the current span hierarchy.
///
/// This should not be called directly, but used through [`check_fields_present`] which allows
/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
#[doc(hidden)]
pub fn check_fields_present0<const L: usize>(
must_be_present: [&dyn Extractor; L],
) -> Result<Summary, Vec<&dyn Extractor>> {
let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
let trace = tracing_error::SpanTrace::capture();
trace.with_spans(|md, _formatted_fields| {
// when trying to understand the inner workings of how does the matching work, note that
// this closure might be called zero times if the span is disabled. normally it is called
// once per span hierarchy level.
missing.retain(|extractor| match extractor.extract(md.fields()) {
ExtractionResult::Present => false,
ExtractionResult::Absent => true,
});
struct MemoryIdentity<'a>(&'a dyn Extractor);
// continue walking up until we've found all missing
!missing.is_empty()
});
if missing.is_empty() {
Ok(Summary::FoundEverything)
} else if !tracing_subscriber_configured() {
Ok(Summary::Unconfigured)
} else {
// we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
// missing, which can be annoying. for this case, we could probably use
// SpanTrace::status().
//
// another way to end up here is with RUST_LOG=pageserver=off while configuring the
// logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
// this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
Err(missing)
impl<'a> MemoryIdentity<'a> {
fn as_ptr(&self) -> *const () {
self.0 as *const _ as *const ()
}
}
impl<'a> PartialEq for MemoryIdentity<'a> {
fn eq(&self, other: &Self) -> bool {
self.as_ptr() == other.as_ptr()
}
}
impl<'a> Eq for MemoryIdentity<'a> {}
impl<'a> Hash for MemoryIdentity<'a> {
fn hash<H: Hasher>(&self, state: &mut H) {
self.as_ptr().hash(state);
}
}
impl<'a> fmt::Debug for MemoryIdentity<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
}
}
/// Checks that the given extractors are satisfied with the current span hierarchy.
///
/// The macro is the preferred way of checking if fields exist while passing checks if a test does
/// not have tracing configured.
///
/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
/// However we can game a module namespaced macro for `use` purposes by re-exporting the
/// #[macro_export] exported name with an alias (below).
#[doc(hidden)]
#[macro_export]
macro_rules! __check_fields_present {
($extractors:expr) => {{
{
use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
match check_fields_present0($extractors) {
Ok(FoundEverything) => Ok(()),
Ok(Unconfigured) if cfg!(test) => {
// allow unconfigured in tests
Ok(())
},
Ok(Unconfigured) => {
panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
},
Err(missing) => Err(missing)
}
}
}}
}
pub use crate::__check_fields_present as check_fields_present;
/// Explanation for why the check was deemed ok.
///
/// Mainly useful for testing, or configuring per-crate behaviour as in with
/// [`check_fields_present`].
#[derive(Debug)]
pub enum Summary {
/// All extractors were found.
///
/// Should only happen when tracing is properly configured.
FoundEverything,
/// Tracing has not been configured at all. This is ok for tests running without tracing set
/// up.
Unconfigured,
}
fn tracing_subscriber_configured() -> bool {
let mut noop_configured = false;
tracing::dispatcher::get_default(|d| {
// it is possible that this closure will not be invoked, but the current implementation
// always invokes it
noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
/// The extractor names passed as keys to [`new`].
pub fn check_fields_present<const L: usize>(
must_be_present: [&dyn Extractor; L],
) -> Result<(), Vec<&dyn Extractor>> {
let mut missing: HashSet<MemoryIdentity> =
HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
let trace = tracing_error::SpanTrace::capture();
trace.with_spans(|md, _formatted_fields| {
missing.retain(|extractor| match extractor.0.extract(md.fields()) {
ExtractionResult::Present => false,
ExtractionResult::Absent => true,
});
!missing.is_empty() // continue walking up until we've found all missing
});
!noop_configured
if missing.is_empty() {
Ok(())
} else {
Err(missing.into_iter().map(|mi| mi.0).collect())
}
}
#[cfg(test)]
@@ -177,36 +123,6 @@ mod tests {
use super::*;
use std::{
collections::HashSet,
fmt::{self},
hash::{Hash, Hasher},
};
struct MemoryIdentity<'a>(&'a dyn Extractor);
impl<'a> MemoryIdentity<'a> {
fn as_ptr(&self) -> *const () {
self.0 as *const _ as *const ()
}
}
impl<'a> PartialEq for MemoryIdentity<'a> {
fn eq(&self, other: &Self) -> bool {
self.as_ptr() == other.as_ptr()
}
}
impl<'a> Eq for MemoryIdentity<'a> {}
impl<'a> Hash for MemoryIdentity<'a> {
fn hash<H: Hasher>(&self, state: &mut H) {
self.as_ptr().hash(state);
}
}
impl<'a> fmt::Debug for MemoryIdentity<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
}
}
struct Setup {
_current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
tenant_extractor: MultiNameExtractor<2>,
@@ -243,8 +159,7 @@ mod tests {
let setup = setup_current_thread();
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
let _guard = span.enter();
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
}
#[test]
@@ -252,8 +167,8 @@ mod tests {
let setup = setup_current_thread();
let span = tracing::info_span!("root", timeline_id = "timeline-1");
let _guard = span.enter();
let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
.unwrap_err();
let missing =
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
assert_missing(missing, vec![&setup.tenant_extractor]);
}
@@ -270,8 +185,7 @@ mod tests {
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
let _guard = span.enter();
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
}
#[test]
@@ -284,7 +198,7 @@ mod tests {
let span = tracing::info_span!("child", timeline_id = "timeline-1");
let _guard = span.enter();
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
assert_missing(missing, vec![&setup.tenant_extractor]);
}
@@ -293,8 +207,7 @@ mod tests {
let setup = setup_current_thread();
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
let _guard = span.enter();
let res = check_fields_present0([&setup.tenant_extractor]);
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
check_fields_present([&setup.tenant_extractor]).unwrap();
}
#[test]
@@ -310,8 +223,7 @@ mod tests {
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
let _guard = span.enter();
let res = check_fields_present0([&setup.tenant_extractor]);
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
check_fields_present([&setup.tenant_extractor]).unwrap();
}
#[test]
@@ -319,7 +231,7 @@ mod tests {
let setup = setup_current_thread();
let span = tracing::info_span!("root", timeline_id = "timeline-1");
let _guard = span.enter();
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
assert_missing(missing, vec![&setup.tenant_extractor]);
}
@@ -333,107 +245,43 @@ mod tests {
let span = tracing::info_span!("child", timeline_id = "timeline-1");
let _guard = span.enter();
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
assert_missing(missing, vec![&setup.tenant_extractor]);
}
#[test]
fn tracing_error_subscriber_not_set_up_straight_line() {
fn tracing_error_subscriber_not_set_up() {
// no setup
let span = tracing::info_span!("foo", e = "some value");
let _guard = span.enter();
let extractor = MultiNameExtractor::new("E", ["e"]);
let res = check_fields_present0([&extractor]);
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
// similarly for a not found key
let extractor = MultiNameExtractor::new("F", ["foobar"]);
let res = check_fields_present0([&extractor]);
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
let missing = check_fields_present([&extractor]).unwrap_err();
assert_missing(missing, vec![&extractor]);
}
#[test]
fn tracing_error_subscriber_not_set_up_with_instrument() {
// no setup
// demo a case where span entering is used to establish a parent child connection, but
// when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
let span = tracing::info_span!("foo", e = "some value");
let _guard = span.enter();
let subspan = tracing::info_span!("bar", f = "foobar");
drop(_guard);
// normally this would work, but without any tracing-subscriber configured, both
// check_field_present find nothing
let _guard = subspan.enter();
let extractors: [&dyn Extractor; 2] = [
&MultiNameExtractor::new("E", ["e"]),
&MultiNameExtractor::new("F", ["f"]),
];
let res = check_fields_present0(extractors);
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
// similarly for a not found key
let extractor = MultiNameExtractor::new("G", ["g"]);
let res = check_fields_present0([&extractor]);
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
}
#[test]
fn tracing_subscriber_configured() {
// this will fail if any utils::logging::init callers appear, but let's hope they do not
// appear.
assert!(!super::tracing_subscriber_configured());
let _g = setup_current_thread();
assert!(super::tracing_subscriber_configured());
}
#[test]
fn not_found_when_disabled_by_filter() {
#[should_panic]
fn panics_if_tracing_error_subscriber_has_wrong_filter() {
let r = tracing_subscriber::registry().with({
tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
|md| !(md.is_span() && *md.level() == tracing::Level::INFO),
))
tracing_error::ErrorLayer::default().with_filter(
tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
if md.is_span() && *md.level() == tracing::Level::INFO {
return false;
}
true
}),
)
});
let _guard = tracing::subscriber::set_default(r);
// this test is a rather tricky one, it has a number of possible outcomes depending on the
// execution order when executed with other tests even if no test sets the global default
// subscriber.
let span = tracing::info_span!("foo", e = "some value");
let _guard = span.enter();
let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
if span.is_disabled() {
// the tests are running single threaded, or we got lucky and no other tests subscriber
// was got to register their per-CALLSITE::META interest between `set_default` and
// creation of the span, thus the filter got to apply and registered interest of Never,
// so the span was never created.
//
// as the span is disabled, no keys were recorded to it, leading check_fields_present0
// to find an error.
let missing = check_fields_present0(extractors).unwrap_err();
assert_missing(missing, vec![extractors[0]]);
} else {
// when the span is enabled, it is because some other test is running at the same time,
// and that tests registry has filters which are interested in our above span.
//
// because the span is now enabled, all keys will be found for it. the
// tracing_error::SpanTrace does not consider layer filters during the span hierarchy
// walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
// this test-induced issue.
let res = check_fields_present0(extractors);
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
}
let extractor = MultiNameExtractor::new("E", ["e"]);
let missing = check_fields_present([&extractor]).unwrap_err();
assert_missing(missing, vec![&extractor]);
}
}

View File

@@ -82,7 +82,6 @@ strum_macros.workspace = true
criterion.workspace = true
hex-literal.workspace = true
tempfile.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
[[bench]]
name = "bench_layer_map"

View File

@@ -7,10 +7,10 @@
//! - The y axis represents LSN, growing upwards.
//!
//! Coordinates in both axis are compressed for better readability.
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
//!
//! Example use:
//! ```bash
//! ```
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
//! $ firefox out.svg
@@ -20,7 +20,7 @@
//! or from pageserver log files.
//!
//! TODO Consider shipping this as a grafana panel plugin:
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
//! https://grafana.com/tutorials/build-a-panel-plugin/
use anyhow::Result;
use pageserver::repository::Key;
use std::cmp::Ordering;

View File

@@ -19,6 +19,12 @@ use tokio::io;
use tokio::io::AsyncWrite;
use tracing::*;
/// NB: This relies on a modified version of tokio_tar that does *not* write the
/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
/// without explicitly calling 'finish' or 'into_inner'!
///
/// See https://github.com/neondatabase/tokio-tar/pull/1
///
use tokio_tar::{Builder, EntryType, Header};
use crate::context::RequestContext;

View File

@@ -396,8 +396,8 @@ fn start_pageserver(
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
Ok(_) => {
let init_sizes_done = tokio::select! {
_ = &mut init_sizes_done => {
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
@@ -406,7 +406,7 @@ fn start_pageserver(
);
None
}
Err(_) => {
_ = tokio::time::sleep(timeout) => {
tracing::info!(
timeout_millis = timeout.as_millis(),
"Initial logical size timeout elapsed; starting background jobs"

View File

@@ -171,13 +171,11 @@ pub struct PageServerConf {
pub log_format: LogFormat,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
/// See the comment in `eviction_task` for details.
///
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
// How often to collect metrics and send them to the metrics endpoint.
@@ -995,8 +993,6 @@ impl ConfigurableSemaphore {
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
/// behave like [`futures::future::pending`], just waiting until new permits are added.
///
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
pub fn new(initial_permits: NonZeroUsize) -> Self {
ConfigurableSemaphore {
initial_permits,

View File

@@ -179,9 +179,6 @@ impl RequestContext {
/// a context and you are unwilling to change all callers to provide one.
///
/// Before we add cancellation, we should get rid of this method.
///
/// [`attached_child`]: Self::attached_child
/// [`detached_child`]: Self::detached_child
pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
Self::new(task_kind, download_behavior)
}

View File

@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
tenant::{self, storage_layer::PersistentLayer, Timeline},
};
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -166,11 +166,11 @@ async fn disk_usage_eviction_task(
.await;
let sleep_until = start + task_config.period;
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
.await
.is_ok()
{
break;
tokio::select! {
_ = tokio::time::sleep_until(sleep_until) => {},
_ = cancel.cancelled() => {
break
}
}
}
}
@@ -390,22 +390,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
match result {
Some(Ok(())) => {
Some(Ok(true)) => {
usage_assumed.add_available_bytes(layer.file_size());
}
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
}
Some(Err(EvictionError::FileNotFound)) => {
evictions_failed.file_sizes += layer.file_size();
evictions_failed.count += 1;
}
Some(Err(
e @ EvictionError::LayerNotFound(_)
| e @ EvictionError::StatFailed(_),
)) => {
let e = utils::error::report_compact_sources(&e);
warn!(%layer, "failed to evict layer: {e}");
Some(Ok(false)) => {
// this is:
// - Replacement::{NotFound, Unexpected}
// - it cannot be is_remote_layer, filtered already
evictions_failed.file_sizes += layer.file_size();
evictions_failed.count += 1;
}
@@ -413,6 +404,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
assert!(cancel.is_cancelled());
return;
}
Some(Err(e)) => {
// we really shouldn't be getting this, precondition failure
error!("failed to evict layer: {:#}", e);
}
}
}
}

View File

@@ -1143,7 +1143,7 @@ async fn disk_usage_eviction_run(
let Some(storage) = state.remote_storage.clone() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)));
)))
};
let state = state.disk_usage_eviction_state.clone();

View File

@@ -385,7 +385,7 @@ pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub struct EvictionsWithLowResidenceDuration {
data_source: &'static str,
@@ -541,17 +541,6 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
// keep in sync with control plane Go code so that we can validate
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
// Go code uses milliseconds. Variable is called `computeStartupBuckets`
[
5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
]
.map(|ms| (ms as f64) / 1000.0)
});
pub struct BasebackupQueryTime(HistogramVec);
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
BasebackupQueryTime({
@@ -559,7 +548,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
"pageserver_basebackup_query_seconds",
"Histogram of basebackup queries durations, by result type",
&["result"],
COMPUTE_STARTUP_BUCKETS.to_vec(),
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
})
@@ -829,7 +818,7 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
pub struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
start: Instant,
@@ -887,7 +876,7 @@ impl StorageTimeMetrics {
/// Starts timing a new operation.
///
/// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
/// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
pub fn start_timer(&self) -> StorageTimeMetricsTimer {
StorageTimeMetricsTimer::new(self.clone())
}
@@ -1267,7 +1256,7 @@ impl RemoteTimelineClientMetrics {
/// Update the metrics that change when a call to the remote timeline client instance starts.
///
/// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
/// is more suitable.
/// Never do both.
pub(crate) fn call_begin(
@@ -1300,7 +1289,7 @@ impl RemoteTimelineClientMetrics {
/// Manually udpate the metrics that track completions, instead of using the guard object.
/// Using the guard object is generally preferable.
/// See [`call_begin`](Self::call_begin) for more context.
/// See [`call_begin`] for more context.
pub(crate) fn call_end(
&self,
file_kind: &RemoteOpFileKind,

View File

@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
/// context, breaking the atomicity is OK. If the import is interrupted, the
/// whole import fails and the timeline will be deleted anyway.
/// (Or to be precise, it will be left behind for debugging purposes and
/// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
/// ignored, see https://github.com/neondatabase/neon/pull/1809)
///
/// Note: A consequence of flushing the pending operations is that they
/// won't be visible to subsequent operations until `commit`. The function

View File

@@ -205,7 +205,7 @@ pub enum TaskKind {
///
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
/// That abstraction doesn't use `task_mgr`.
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
/// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
///
/// Once the connection is established, the `TaskHandle` task creates a
@@ -213,21 +213,16 @@ pub enum TaskKind {
/// the `Connection` object.
/// A `CancellationToken` created by the `TaskHandle` task ensures
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
///
/// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler
/// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller
WalReceiverManager,
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
/// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
/// See the comment on [`WalReceiverManager`].
///
/// [`WalReceiverManager`]: Self::WalReceiverManager
WalReceiverConnectionHandler,
/// The task that polls the `tokio-postgres::Connection` object.
/// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
/// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
/// Spawned by task [`WalReceiverConnectionHandler`].
/// See the comment on [`WalReceiverManager`].
WalReceiverConnectionPoller,
// Garbage collection worker. One per tenant
@@ -511,13 +506,17 @@ pub async fn shutdown_tasks(
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
.await
.is_err()
{
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for {} to shut down", task.name);
let join_handle = tokio::select! {
biased;
_ = &mut join_handle => { None },
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for {} to shut down", task.name);
Some(join_handle)
}
};
if let Some(join_handle) = join_handle {
// we never handled this return value, but:
// - we don't deschedule which would lead to is_cancelled
// - panics are already logged (is_panicked)

View File

@@ -121,7 +121,7 @@ pub mod mgr;
pub mod tasks;
pub mod upload_queue;
pub(crate) mod timeline;
mod timeline;
pub mod size;
@@ -133,7 +133,7 @@ pub use timeline::{
// re-export this function so that page_cache.rs can use it.
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
// re-export for use in remote_timeline_client.rs
// re-export for use in storage_sync.rs
pub use crate::tenant::metadata::save_metadata;
// re-export for use in walreceiver
@@ -281,7 +281,7 @@ pub enum DeleteTimelineError {
}
pub enum SetStoppingError {
AlreadyStopping(completion::Barrier),
AlreadyStopping,
Broken,
}
@@ -318,6 +318,10 @@ impl std::fmt::Display for WaitToBecomeActiveError {
}
}
pub(crate) enum ShutdownError {
AlreadyStopping,
}
struct DeletionGuard(OwnedMutexGuard<bool>);
impl DeletionGuard {
@@ -1168,7 +1172,7 @@ impl Tenant {
)
}
/// Helper for unit tests to create an empty timeline.
/// Helper for unit tests to create an emtpy timeline.
///
/// The timeline is has state value `Active` but its background loops are not running.
// This makes the various functions which anyhow::ensure! for Active state work in tests.
@@ -1455,7 +1459,7 @@ impl Tenant {
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
info!("got layer_removal_cs.lock(), deleting layer files");
// NB: remote_timeline_client upload tasks that reference these layers have been cancelled
// NB: storage_sync upload tasks that reference these layers have been cancelled
// by the caller.
let local_timeline_directory = self
@@ -1717,7 +1721,7 @@ impl Tenant {
self.state.send_modify(|current_state| {
use pageserver_api::models::ActivatingFrom;
match &*current_state {
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping => {
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
}
TenantState::Loading => {
@@ -1781,16 +1785,7 @@ impl Tenant {
/// - detach + ignore (freeze_and_flush == false)
///
/// This will attempt to shutdown even if tenant is broken.
///
/// `shutdown_progress` is a [`completion::Barrier`] for the shutdown initiated by this call.
/// If the tenant is already shutting down, we return a clone of the first shutdown call's
/// `Barrier` as an `Err`. This not-first caller can use the returned barrier to join with
/// the ongoing shutdown.
async fn shutdown(
&self,
shutdown_progress: completion::Barrier,
freeze_and_flush: bool,
) -> Result<(), completion::Barrier> {
pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
span::debug_assert_current_span_has_tenant_id();
// Set tenant (and its timlines) to Stoppping state.
//
@@ -1809,16 +1804,12 @@ impl Tenant {
// But the tenant background loops are joined-on in our caller.
// It's mesed up.
// we just ignore the failure to stop
match self.set_stopping(shutdown_progress).await {
match self.set_stopping().await {
Ok(()) => {}
Err(SetStoppingError::Broken) => {
// assume that this is acceptable
}
Err(SetStoppingError::AlreadyStopping(other)) => {
// give caller the option to wait for this this shutdown
return Err(other);
}
Err(SetStoppingError::AlreadyStopping) => return Err(ShutdownError::AlreadyStopping),
};
if freeze_and_flush {
@@ -1850,7 +1841,7 @@ impl Tenant {
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
///
/// This function is not cancel-safe!
async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
async fn set_stopping(&self) -> Result<(), SetStoppingError> {
let mut rx = self.state.subscribe();
// cannot stop before we're done activating, so wait out until we're done activating
@@ -1862,7 +1853,7 @@ impl Tenant {
);
false
}
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
})
.await
.expect("cannot drop self.state while on a &self method");
@@ -1877,7 +1868,7 @@ impl Tenant {
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
// are created after the transition to Stopping. That's harmless, as the Timelines
// won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
*current_state = TenantState::Stopping { progress };
*current_state = TenantState::Stopping;
// Continue stopping outside the closure. We need to grab timelines.lock()
// and we plan to turn it into a tokio::sync::Mutex in a future patch.
true
@@ -1889,9 +1880,9 @@ impl Tenant {
err = Some(SetStoppingError::Broken);
false
}
TenantState::Stopping { progress } => {
TenantState::Stopping => {
info!("Tenant is already in Stopping state");
err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
err = Some(SetStoppingError::AlreadyStopping);
false
}
});
@@ -1935,7 +1926,7 @@ impl Tenant {
);
false
}
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
})
.await
.expect("cannot drop self.state while on a &self method");
@@ -1958,7 +1949,7 @@ impl Tenant {
warn!("Tenant is already in Broken state");
}
// This is the only "expected" path, any other path is a bug.
TenantState::Stopping { .. } => {
TenantState::Stopping => {
warn!(
"Marking Stopping tenant as Broken state, reason: {}",
reason
@@ -1991,7 +1982,7 @@ impl Tenant {
TenantState::Active { .. } => {
return Ok(());
}
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
TenantState::Broken { .. } | TenantState::Stopping => {
// There's no chance the tenant can transition back into ::Active
return Err(WaitToBecomeActiveError::WillNotBecomeActive {
tenant_id: self.tenant_id,
@@ -3359,18 +3350,14 @@ pub mod harness {
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
(
self.try_load(&ctx, None)
self.try_load(&ctx)
.await
.expect("failed to load test tenant"),
ctx,
)
}
pub async fn try_load(
&self,
ctx: &RequestContext,
remote_storage: Option<remote_storage::GenericRemoteStorage>,
) -> anyhow::Result<Arc<Tenant>> {
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
let walredo_mgr = Arc::new(TestRedoManager);
let tenant = Arc::new(Tenant::new(
@@ -3379,7 +3366,7 @@ pub mod harness {
TenantConfOpt::from(self.tenant_conf),
walredo_mgr,
self.tenant_id,
remote_storage,
None,
));
tenant
.load(None, ctx)
@@ -3917,11 +3904,7 @@ mod tests {
metadata_bytes[8] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;
let err = harness
.try_load(&ctx, None)
.await
.err()
.expect("should fail");
let err = harness.try_load(&ctx).await.err().expect("should fail");
// get all the stack with all .context, not tonly the last one
let message = format!("{err:#}");
let expected = "Failed to parse metadata bytes from path";
@@ -4352,13 +4335,13 @@ mod tests {
// assert freeze_and_flush exercised the initdb optimization
{
let state = tline.flush_loop_state.lock().unwrap();
let timeline::FlushLoopState::Running {
expect_initdb_optimization,
initdb_optimization_count,
} = *state
else {
panic!("unexpected state: {:?}", *state);
};
let
timeline::FlushLoopState::Running {
expect_initdb_optimization,
initdb_optimization_count,
} = *state else {
panic!("unexpected state: {:?}", *state);
};
assert!(expect_initdb_optimization);
assert!(initdb_optimization_count > 0);
}

View File

@@ -442,7 +442,7 @@ where
writer: W,
///
/// `stack[0]` is the current root page, `stack.last()` is the leaf.
/// stack[0] is the current root page, stack.last() is the leaf.
///
/// We maintain the length of the stack to be always greater than zero.
/// Two exceptions are:

View File

@@ -16,7 +16,7 @@
//! Other read methods are less critical but still impact performance of background tasks.
//!
//! This data structure relies on a persistent/immutable binary search tree. See the
//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
//! Summary: A persistent/immutable BST (and persistent data structures in general) allows
//! you to modify the tree in such a way that each modification creates a new "version"
//! of the tree. When you modify it, you get a new version, but all previous versions are
@@ -40,7 +40,7 @@
//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
//! to throw away most of the persistent BST and build a new one, starting from the oldest
//! LSN. See [`LayerMap::flush_updates()`].
//! LSN. See `LayerMap::flush_updates()`.
//!
mod historic_layer_coverage;

View File

@@ -122,7 +122,8 @@ impl<Value: Clone> HistoricLayerCoverage<Value> {
self.head = self
.historic
.iter()
.next_back()
.rev()
.next()
.map(|(_, v)| v.clone())
.unwrap_or_default();
}
@@ -411,7 +412,7 @@ fn test_persistent_overlapping() {
/// still be more critical.
///
/// See this for more on persistent and retroactive techniques:
/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
pub struct BufferedHistoricLayerCoverage<Value> {
/// A persistent layer map that we rebuild when we need to retroactively update
historic_coverage: HistoricLayerCoverage<Value>,

View File

@@ -2,7 +2,7 @@ use std::ops::Range;
// NOTE the `im` crate has 20x more downloads and also has
// persistent/immutable BTree. But it's bugged so rpds is a
// better choice <https://github.com/neondatabase/neon/issues/3395>
// better choice https://github.com/neondatabase/neon/issues/3395
use rpds::RedBlackTreeMapSync;
/// Data structure that can efficiently:
@@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync;
/// - insert layers in non-decreasing lsn.start order
///
/// For a detailed explanation and justification of this approach, see:
/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
///
/// NOTE The struct is parameterized over Value for easier
/// testing, but in practice it's some sort of layer.
@@ -113,7 +113,8 @@ impl<Value: Clone> LayerCoverage<Value> {
pub fn query(&self, key: i128) -> Option<Value> {
self.nodes
.range(..=key)
.next_back()?
.rev()
.next()?
.1
.as_ref()
.map(|(_, v)| v.clone())

View File

@@ -24,7 +24,7 @@
//! Currently, this is not used in the system. Future refactors will ensure
//! the storage state will be recorded in this file, and the system can be
//! recovered from this file. This is tracked in
//! <https://github.com/neondatabase/neon/issues/4418>
//! https://github.com/neondatabase/neon/issues/4418
use std::io::{self, Read, Write};

View File

@@ -1,12 +1,10 @@
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
//! has a metadata that needs to be stored persistently.
//!
//! Later, the file gets used in [`remote_timeline_client`] as a part of
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
//! external storage import and export operations.
//!
//! The module contains all structs and related helper methods related to timeline metadata.
//!
//! [`remote_timeline_client`]: super::remote_timeline_client
use std::fs::{File, OpenOptions};
use std::io::Write;

View File

@@ -233,17 +233,11 @@ pub fn schedule_local_tenant_processing(
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
#[instrument]
pub async fn shutdown_all_tenants() {
shutdown_all_tenants0(&TENANTS).await
}
async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
use utils::completion;
// Prevent new tenants from being created.
let tenants_to_shut_down = {
let mut m = tenants.write().await;
let mut m = TENANTS.write().await;
match &mut *m {
TenantsMap::Initializing => {
*m = TenantsMap::ShuttingDown(HashMap::default());
@@ -268,41 +262,14 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
for (tenant_id, tenant) in tenants_to_shut_down {
join_set.spawn(
async move {
// ordering shouldn't matter for this, either we store true right away or never
let ordering = std::sync::atomic::Ordering::Relaxed;
let joined_other = std::sync::atomic::AtomicBool::new(false);
let freeze_and_flush = true;
let mut shutdown = std::pin::pin!(async {
let freeze_and_flush = true;
let res = {
let (_guard, shutdown_progress) = completion::channel();
tenant.shutdown(shutdown_progress, freeze_and_flush).await
};
if let Err(other_progress) = res {
// join the another shutdown in progress
joined_other.store(true, ordering);
other_progress.wait().await;
match tenant.shutdown(freeze_and_flush).await {
Ok(()) => debug!("tenant successfully stopped"),
Err(super::ShutdownError::AlreadyStopping) => {
warn!("tenant was already shutting down")
}
});
// in practice we might not have a lot time to go, since systemd is going to
// SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
// a warning.
let warning = std::time::Duration::from_secs(5);
let mut warning = std::pin::pin!(tokio::time::sleep(warning));
tokio::select! {
_ = &mut shutdown => {},
_ = &mut warning => {
let joined_other = joined_other.load(ordering);
warn!(%joined_other, "waiting for the shutdown to complete");
shutdown.await;
}
};
debug!("tenant successfully stopped");
}
}
.instrument(info_span!("shutdown", %tenant_id)),
);
@@ -446,15 +413,6 @@ pub async fn detach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
detach_ignored: bool,
) -> Result<(), TenantStateError> {
detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
}
async fn detach_tenant0(
conf: &'static PageServerConf,
tenants: &tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
detach_ignored: bool,
) -> Result<(), TenantStateError> {
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
@@ -467,8 +425,7 @@ async fn detach_tenant0(
};
let removal_result =
remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
.await;
remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
// Ignored tenants are not present in memory and will bail the removal from memory operation.
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
@@ -515,15 +472,7 @@ pub async fn ignore_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> Result<(), TenantStateError> {
ignore_tenant0(conf, &TENANTS, tenant_id).await
}
async fn ignore_tenant0(
conf: &'static PageServerConf,
tenants: &tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
) -> Result<(), TenantStateError> {
remove_tenant_from_memory(tenants, tenant_id, async {
remove_tenant_from_memory(tenant_id, async {
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
fs::File::create(&ignore_mark_file)
.await
@@ -648,21 +597,18 @@ where
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
/// operation would be needed to remove it.
async fn remove_tenant_from_memory<V, F>(
tenants: &tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
tenant_cleanup: F,
) -> Result<V, TenantStateError>
where
F: std::future::Future<Output = anyhow::Result<V>>,
{
use utils::completion;
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
// avoid holding the lock for the entire process.
let tenant = {
tenants
TENANTS
.write()
.await
.get(&tenant_id)
@@ -670,20 +616,14 @@ where
.ok_or(TenantStateError::NotFound(tenant_id))?
};
// allow pageserver shutdown to await for our completion
let (_guard, progress) = completion::channel();
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(progress, freeze_and_flush).await {
match tenant.shutdown(freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
// wait for it but return an error right away because these are distinct requests.
return Err(TenantStateError::IsStopping(tenant_id));
Err(super::ShutdownError::AlreadyStopping) => {
return Err(TenantStateError::IsStopping(tenant_id))
}
}
@@ -692,14 +632,14 @@ where
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
{
Ok(hook_value) => {
let mut tenants_accessor = tenants.write().await;
let mut tenants_accessor = TENANTS.write().await;
if tenants_accessor.remove(&tenant_id).is_none() {
warn!("Tenant {tenant_id} got removed from memory before operation finished");
}
Ok(hook_value)
}
Err(e) => {
let tenants_accessor = tenants.read().await;
let tenants_accessor = TENANTS.read().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => {
tenant.set_broken(e.to_string()).await;
@@ -816,109 +756,3 @@ pub async fn immediate_compact(
Ok(wait_task_done)
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::sync::Arc;
use tracing::{info_span, Instrument};
use super::{super::harness::TenantHarness, TenantsMap};
#[tokio::test(start_paused = true)]
async fn shutdown_joins_remove_tenant_from_memory() {
// the test is a bit ugly with the lockstep together with spawned tasks. the aim is to make
// sure `shutdown_all_tenants0` per-tenant processing joins in any active
// remove_tenant_from_memory calls, which is enforced by making the operation last until
// we've ran `shutdown_all_tenants0` for a long time.
let (t, _ctx) = TenantHarness::create("shutdown_joins_detach")
.unwrap()
.load()
.await;
// harness loads it to active, which is forced and nothing is running on the tenant
let id = t.tenant_id();
// tenant harness configures the logging and we cannot escape it
let _e = info_span!("testing", tenant_id = %id).entered();
let tenants = HashMap::from([(id, t.clone())]);
let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
let (until_cleanup_started, cleanup_started) = utils::completion::channel();
// start a "detaching operation", which will take a while, until can_complete_cleanup
let cleanup_task = {
let jh = tokio::spawn({
let tenants = tenants.clone();
async move {
let cleanup = async move {
drop(until_cleanup_started);
can_complete_cleanup.wait().await;
anyhow::Ok(())
};
super::remove_tenant_from_memory(&tenants, id, cleanup).await
}
.instrument(info_span!("foobar", tenant_id = %id))
});
// now the long cleanup should be in place, with the stopping state
cleanup_started.wait().await;
jh
};
let mut cleanup_progress = std::pin::pin!(t
.shutdown(utils::completion::Barrier::default(), false)
.await
.unwrap_err()
.wait());
let mut shutdown_task = {
let (until_shutdown_started, shutdown_started) = utils::completion::channel();
let shutdown_task = tokio::spawn(async move {
drop(until_shutdown_started);
super::shutdown_all_tenants0(&tenants).await;
});
shutdown_started.wait().await;
shutdown_task
};
// if the joining in is removed from shutdown_all_tenants0, the shutdown_task should always
// get to complete within timeout and fail the test. it is expected to continue awaiting
// until completion or SIGKILL during normal shutdown.
//
// the timeout is long to cover anything that shutdown_task could be doing, but it is
// handled instantly because we use tokio's time pausing in this test. 100s is much more than
// what we get from systemd on shutdown (10s).
let long_time = std::time::Duration::from_secs(100);
tokio::select! {
_ = &mut shutdown_task => unreachable!("shutdown must continue, until_cleanup_completed is not dropped"),
_ = &mut cleanup_progress => unreachable!("cleanup progress must continue, until_cleanup_completed is not dropped"),
_ = tokio::time::sleep(long_time) => {},
}
// allow the remove_tenant_from_memory and thus eventually the shutdown to continue
drop(until_cleanup_completed);
let (je, ()) = tokio::join!(shutdown_task, cleanup_progress);
je.expect("Tenant::shutdown shutdown not have panicked");
cleanup_task
.await
.expect("no panicking")
.expect("remove_tenant_from_memory failed");
futures::future::poll_immediate(
t.shutdown(utils::completion::Barrier::default(), false)
.await
.unwrap_err()
.wait(),
)
.await
.expect("the stopping progress must still be complete");
}
}

View File

@@ -135,7 +135,7 @@
//! - Initiate upload queue with that [`IndexPart`].
//! - Reschedule all lost operations by comparing the local filesystem state
//! and remote state as per [`IndexPart`]. This is done in
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
//! [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
//!
//! Note that if we crash during file deletion between the index update
//! that removes the file from the list of files, and deleting the remote file,
@@ -163,8 +163,8 @@
//! - download their remote [`IndexPart`]s
//! - create `Timeline` struct and a `RemoteTimelineClient`
//! - initialize the client's upload queue with its `IndexPart`
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
//! for layers that are referenced by `IndexPart` but not present locally
//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
//! but not present locally
//! - schedule uploads for layers that are only present locally.
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
//! the local filesystem, write the remote metadata to the local filesystem
@@ -198,8 +198,6 @@
//! in remote storage.
//! But note that we don't test any of this right now.
//!
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
mod delete;
mod download;
@@ -842,16 +840,14 @@ impl RemoteTimelineClient {
let remaining: Vec<RemotePath> = remaining
.into_iter()
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
.inspect(|path| {
if let Some(name) = path.object_name() {
info!(%name, "deleting a file not referenced from index_part.json");
} else {
warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json");
}
})
.collect();
if !remaining.is_empty() {
warn!(
"Found {} files not bound to index_file.json, proceeding with their deletion",
remaining.len()
);
warn!("About to remove {} files", remaining.len());
self.storage_impl.delete_objects(&remaining).await?;
}
@@ -860,7 +856,7 @@ impl RemoteTimelineClient {
debug!("deleting index part");
self.storage_impl.delete(&index_file_path).await?;
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
info!(deletions_queued, "done deleting, including index_part.json");
Ok(())
}

View File

@@ -62,11 +62,12 @@ pub(super) async fn upload_timeline_layer<'a>(
let source_file = match source_file_res {
Ok(source_file) => source_file,
Err(e) if e.kind() == ErrorKind::NotFound => {
// If we encounter this arm, it wasn't intended, but it's also not
// a big problem, if it's because the file was deleted before an
// upload. However, a nonexistent file can also be indicative of
// something worse, like when a file is scheduled for upload before
// it has been written to disk yet.
// In some situations we might run into the underlying file being deleted by
// e.g. compaction before the uploader gets to it. In that instance, we don't
// want to retry the error: a deleted file won't come back. In theory, the
// file might not have been written in the first place, which also indicates
// a bug. Still log the situation so that we can keep an eye on it.
// See https://github.com/neondatabase/neon/issues/4526
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
return Ok(());
}

View File

@@ -110,11 +110,11 @@ pub struct TimelineInputs {
///
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
/// is updated on-demand, during the start of this calculation and separate from the
/// [`TimelineInputs::latest_gc_cutoff`].
/// [`Timeline::latest_gc_cutoff`].
///
/// For timelines in general:
///
/// ```text
/// ```ignore
/// 0-----|---------|----|------------| · · · · · |·> lsn
/// initdb_lsn branchpoints* next_gc_cutoff latest
/// ```

View File

@@ -11,7 +11,10 @@ pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<
#[cfg(debug_assertions)]
#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_id() {
if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
panic!("missing extractors: {missing:?}")
if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
panic!(
"missing extractors: {:?}",
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
)
}
}

View File

@@ -162,9 +162,6 @@ impl LayerAccessStats {
/// The caller is responsible for recording a residence event
/// using [`record_residence_event`] before calling `latest_activity`.
/// If they don't, [`latest_activity`] will return `None`.
///
/// [`record_residence_event`]: Self::record_residence_event
/// [`latest_activity`]: Self::latest_activity
pub(crate) fn empty_will_record_residence_event_later() -> Self {
LayerAccessStats(Mutex::default())
}
@@ -172,9 +169,6 @@ impl LayerAccessStats {
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
///
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn for_loading_layer(
layer_map_lock_held_witness: &LayerManager,
status: LayerResidenceStatus,
@@ -193,8 +187,6 @@ impl LayerAccessStats {
/// The `new_status` is not recorded in `self`.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
///
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn clone_for_residence_change(
&self,
layer_map_lock_held_witness: &LayerManager,
@@ -302,13 +294,11 @@ impl LayerAccessStats {
/// implementation error. This function logs a rate-limited warning in that case.
///
/// TODO: use type system to avoid the need for `fallback`.
/// The approach in <https://github.com/neondatabase/neon/pull/3775>
/// The approach in https://github.com/neondatabase/neon/pull/3775
/// could be used to enforce that a residence event is recorded
/// before a layer is added to the layer map. We could also have
/// a layer wrapper type that holds the LayerAccessStats, and ensure
/// that that type can only be produced by inserting into the layer map.
///
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
let locked = self.0.lock().unwrap();
let inner = &locked.for_eviction_policy;
@@ -333,7 +323,7 @@ impl LayerAccessStats {
}
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
/// required by [`LayerMap`](super::layer_map::LayerMap).
/// required by [`LayerMap`].
///
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
/// timeline names, because those are known in the context of which the layers
@@ -380,10 +370,10 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
}
/// Returned by [`PersistentLayer::iter`]
/// Returned by [`Layer::iter`]
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
/// Returned by [`PersistentLayer::key_iter`]
/// Returned by [`Layer::key_iter`]
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
/// Get a layer descriptor from a layer.
@@ -442,10 +432,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
None
}
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
None
}
fn is_remote_layer(&self) -> bool {
false
}

View File

@@ -7,18 +7,14 @@
//! must be page images or WAL records with the 'will_init' flag set, so that
//! they can be replayed without referring to an older page version.
//!
//! The delta files are stored in `timelines/<timeline_id>` directory. Currently,
//! The delta files are stored in timelines/<timeline_id> directory. Currently,
//! there are no subdirectories, and each delta file is named like this:
//!
//! ```text
//! <key start>-<key end>__<start LSN>-<end LSN>
//! ```
//!
//! For example:
//!
//! ```text
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
//! ```
//!
//! Every delta file consists of three parts: "summary", "index", and
//! "values". The summary is a fixed size header at the beginning of the file,
@@ -51,7 +47,6 @@ use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tracing::*;
use utils::{
@@ -415,10 +410,6 @@ impl AsLayerDesc for DeltaLayer {
}
impl PersistentLayer for DeltaLayer {
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
Some(self)
}
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
@@ -809,7 +800,7 @@ impl DeltaLayerWriterInner {
///
/// # Note
///
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to drop before `finish` is actually called. So this
/// could lead to odd temporary files in the directory, exhausting file system.
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`

View File

@@ -57,9 +57,8 @@ impl Ord for DeltaFileName {
/// Represents the filename of a DeltaLayer
///
/// ```text
/// <key start>-<key end>__<LSN start>-<LSN end>
/// ```
///
impl DeltaFileName {
///
/// Parse a string as a delta file name. Returns None if the filename does not
@@ -163,9 +162,7 @@ impl ImageFileName {
///
/// Represents the filename of an ImageLayer
///
/// ```text
/// <key start>-<key end>__<LSN>
/// ```
impl ImageFileName {
///
/// Parse a string as an image file name. Returns None if the filename does not

View File

@@ -7,15 +7,11 @@
//! timelines/<timeline_id> directory. Currently, there are no
//! subdirectories, and each image layer file is named like this:
//!
//! ```text
//! <key start>-<key end>__<LSN>
//! ```
//!
//! For example:
//!
//! ```text
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
//! ```
//!
//! Every image layer file consists of three parts: "summary",
//! "index", and "values". The summary is a fixed size header at the
@@ -664,7 +660,7 @@ impl ImageLayerWriterInner {
///
/// # Note
///
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to drop before `finish` is actually called. So this
/// could lead to odd temporary files in the directory, exhausting file system.
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`

View File

@@ -25,7 +25,7 @@ use super::{
};
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
/// [`DeltaLayer`](super::DeltaLayer).
/// [`crate::storage_layer::DeltaLayer`].
///
/// RemoteLayer might be downloaded on-demand during operations which are
/// allowed download remote layers and during which, it gets replaced with a
@@ -50,8 +50,6 @@ pub struct RemoteLayer {
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
/// a possible fast loop between `Timeline::get_reconstruct_data` and
/// `Timeline::download_remote_layer`, which also logs.
///
/// [`ongoing_download`]: Self::ongoing_download
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
}

View File

@@ -122,12 +122,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
.is_ok()
{
info!("received cancellation request during idling");
break;
tokio::select! {
_ = cancel.cancelled() => {
info!("received cancellation request during idling");
break;
},
_ = tokio::time::sleep(sleep_duration) => {},
}
}
}
@@ -196,12 +196,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
warn_when_period_overrun(started_at.elapsed(), period, "gc");
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
.is_ok()
{
info!("received cancellation request during idling");
break;
tokio::select! {
_ = cancel.cancelled() => {
info!("received cancellation request during idling");
break;
},
_ = tokio::time::sleep(sleep_duration) => {},
}
}
}
@@ -263,9 +263,9 @@ pub(crate) async fn random_init_delay(
rng.gen_range(Duration::ZERO..=period)
};
match tokio::time::timeout(d, cancel.cancelled()).await {
Ok(_) => Err(Cancelled),
Err(_) => Ok(()),
tokio::select! {
_ = cancel.cancelled() => Err(Cancelled),
_ = tokio::time::sleep(d) => Ok(()),
}
}

View File

@@ -24,7 +24,7 @@ use tracing::*;
use utils::id::TenantTimelineId;
use std::cmp::{max, min, Ordering};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::collections::{BinaryHeap, HashMap};
use std::fs;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
@@ -183,7 +183,7 @@ pub struct Timeline {
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
/// Remote storage client.
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
/// See [`storage_sync`] module comment for details.
pub remote_client: Option<Arc<RemoteTimelineClient>>,
// What page versions do we hold in the repository? If we get a
@@ -240,8 +240,6 @@ pub struct Timeline {
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
///
/// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
@@ -981,12 +979,8 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await else {
return Ok(None);
};
let Some(remote_layer) = layer.downcast_remote_layer() else {
return Ok(Some(false));
};
let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) };
if self.remote_client.is_none() {
return Ok(Some(false));
}
@@ -995,12 +989,10 @@ impl Timeline {
Ok(Some(true))
}
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
/// Like [`evict_layer_batch`], but for just one layer.
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let Some(local_layer) = self.find_layer(layer_file_name).await else {
return Ok(None);
};
let Some(local_layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
let remote_client = self
.remote_client
.as_ref()
@@ -1011,25 +1003,25 @@ impl Timeline {
.evict_layer_batch(remote_client, &[local_layer], cancel)
.await?;
assert_eq!(results.len(), 1);
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
let result: Option<anyhow::Result<bool>> = results.into_iter().next().unwrap();
match result {
None => anyhow::bail!("task_mgr shutdown requested"),
Some(Ok(())) => Ok(Some(true)),
Some(Err(e)) => Err(anyhow::Error::new(e)),
Some(Ok(b)) => Ok(Some(b)),
Some(Err(e)) => Err(e),
}
}
/// Evict a batch of layers.
///
/// GenericRemoteStorage reference is required as a (witness)[witness_article] for "remote storage is configured."
/// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured."
///
/// [witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
pub(crate) async fn evict_layers(
/// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
pub async fn evict_layers(
&self,
_: &GenericRemoteStorage,
layers_to_evict: &[Arc<dyn PersistentLayer>],
cancel: CancellationToken,
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
let remote_client = self.remote_client.clone().expect(
"GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
);
@@ -1064,7 +1056,7 @@ impl Timeline {
remote_client: &Arc<RemoteTimelineClient>,
layers_to_evict: &[Arc<dyn PersistentLayer>],
cancel: CancellationToken,
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
// ensure that the layers have finished uploading
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
remote_client
@@ -1110,9 +1102,11 @@ impl Timeline {
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
local_layer: &Arc<dyn PersistentLayer>,
layer_mgr: &mut LayerManager,
) -> Result<(), EvictionError> {
) -> anyhow::Result<bool> {
if local_layer.is_remote_layer() {
return Err(EvictionError::CannotEvictRemoteLayer);
// TODO(issue #3851): consider returning an err here instead of false,
// which is the same out the match later
return Ok(false);
}
let layer_file_size = local_layer.file_size();
@@ -1121,22 +1115,13 @@ impl Timeline {
.local_path()
.expect("local layer should have a local path")
.metadata()
// when the eviction fails because we have already deleted the layer in compaction for
// example, a NotFound error bubbles up from here.
.map_err(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
EvictionError::FileNotFound
} else {
EvictionError::StatFailed(e)
}
})?
.context("get local layer file stat")?
.modified()
.map_err(EvictionError::StatFailed)?;
.context("get mtime of layer file")?;
let local_layer_residence_duration =
match SystemTime::now().duration_since(local_layer_mtime) {
Err(e) => {
warn!(layer = %local_layer, "layer mtime is in the future: {}", e);
warn!("layer mtime is in the future: {}", e);
None
}
Ok(delta) => Some(delta),
@@ -1167,65 +1152,54 @@ impl Timeline {
assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());
layer_mgr
.replace_and_verify(local_layer.clone(), new_remote_layer)
.map_err(EvictionError::LayerNotFound)?;
let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) {
Ok(()) => {
if let Err(e) = local_layer.delete_resident_layer_file() {
error!("failed to remove layer file on evict after replacement: {e:#?}");
}
// Always decrement the physical size gauge, even if we failed to delete the file.
// Rationale: we already replaced the layer with a remote layer in the layer map,
// and any subsequent download_remote_layer will
// 1. overwrite the file on disk and
// 2. add the downloaded size to the resident size gauge.
//
// If there is no re-download, and we restart the pageserver, then load_layer_map
// will treat the file as a local layer again, count it towards resident size,
// and it'll be like the layer removal never happened.
// The bump in resident size is perhaps unexpected but overall a robust behavior.
self.metrics
.resident_physical_size_gauge
.sub(layer_file_size);
if let Err(e) = local_layer.delete_resident_layer_file() {
// this should never happen, because of layer_removal_cs usage and above stat
// access for mtime
error!("failed to remove layer file on evict after replacement: {e:#?}");
}
// Always decrement the physical size gauge, even if we failed to delete the file.
// Rationale: we already replaced the layer with a remote layer in the layer map,
// and any subsequent download_remote_layer will
// 1. overwrite the file on disk and
// 2. add the downloaded size to the resident size gauge.
//
// If there is no re-download, and we restart the pageserver, then load_layer_map
// will treat the file as a local layer again, count it towards resident size,
// and it'll be like the layer removal never happened.
// The bump in resident size is perhaps unexpected but overall a robust behavior.
self.metrics
.resident_physical_size_gauge
.sub(layer_file_size);
self.metrics.evictions.inc();
self.metrics.evictions.inc();
if let Some(delta) = local_layer_residence_duration {
self.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(delta);
info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
} else {
info!(layer=%local_layer, "evicted layer after unknown residence period");
}
if let Some(delta) = local_layer_residence_duration {
self.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(delta);
info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
} else {
info!(layer=%local_layer, "evicted layer after unknown residence period");
}
true
}
Err(err) => {
if cfg!(debug_assertions) {
panic!("failed to replace: {err}, evicted: {local_layer:?}");
} else {
error!(evicted=?local_layer, "failed to replace: {err}");
}
false
}
};
Ok(())
Ok(succeed)
}
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum EvictionError {
#[error("cannot evict a remote layer")]
CannotEvictRemoteLayer,
/// Most likely the to-be evicted layer has been deleted by compaction or gc which use the same
/// locks, so they got to execute before the eviction.
#[error("file backing the layer has been removed already")]
FileNotFound,
#[error("stat failed")]
StatFailed(#[source] std::io::Error),
/// In practice, this can be a number of things, but lets assume it means only this.
///
/// This case includes situations such as the Layer was evicted and redownloaded in between,
/// because the file existed before an replacement attempt was made but now the Layers are
/// different objects in memory.
#[error("layer was no longer part of LayerMap")]
LayerNotFound(#[source] anyhow::Error),
}
/// Number of times we will compute partition within a checkpoint distance.
const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
@@ -1795,7 +1769,7 @@ impl Timeline {
/// 3. Schedule upload of local-only layer files (which will then also update the remote
/// IndexPart to include the new layer files).
///
/// Refer to the [`remote_timeline_client`] module comment for more context.
/// Refer to the `storage_sync` module comment for more context.
///
/// # TODO
/// May be a bit cleaner to do things based on populated remote client,
@@ -2628,9 +2602,7 @@ impl Timeline {
guard.layer_map().frozen_layers.front().cloned()
// drop 'layers' lock to allow concurrent reads and writes
};
let Some(layer_to_flush) = layer_to_flush else {
break Ok(());
};
let Some(layer_to_flush) = layer_to_flush else { break Ok(()) };
if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
error!("could not flush frozen layer: {err:?}");
break Err(err);
@@ -2703,7 +2675,7 @@ impl Timeline {
// files instead. This is possible as long as *all* the data imported into the
// repository have the same LSN.
let lsn_range = frozen_layer.get_lsn_range();
let (layer_paths_to_upload, delta_layer_to_add) =
let layer_paths_to_upload =
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
@@ -2722,12 +2694,8 @@ impl Timeline {
let (partitioning, _lsn) = self
.repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
.await?;
// For image layers, we add them immediately into the layer map.
(
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
.await?,
None,
)
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
.await?
} else {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
@@ -2741,50 +2709,29 @@ impl Timeline {
assert!(!*expect_initdb_optimization, "expected initdb optimization");
}
}
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let layer = self.create_delta_layer(&frozen_layer).await?;
(
HashMap::from([(layer.filename(), LayerFileMetadata::new(layer.file_size()))]),
Some(layer),
)
// normal case, write out a L0 delta layer file.
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer).await?;
HashMap::from([(delta_path, metadata)])
};
pausable_failpoint!("flush-frozen-before-sync");
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now. The flushed layer is stored in
// the mapping in `create_delta_layer`.
{
let mut guard = self.layers.write().await;
let l = guard.layer_map_mut().frozen_layers.pop_front();
if let Some(ref l) = delta_layer_to_add {
// TODO: move access stats, metrics update, etc. into layer manager.
l.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
// Only one thread may call this function at a time (for this
// timeline). If two threads tried to flush the same frozen
// layer to disk at the same time, that would not work.
assert!(compare_arced_layers(&l.unwrap(), &frozen_layer));
// update metrics
let sz = l.file_size();
self.metrics.resident_physical_size_gauge.add(sz);
self.metrics.num_persistent_files_created.inc_by(1);
self.metrics.persistent_bytes_written.inc_by(sz);
}
guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
// release lock on 'layers'
}
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
// a compaction can delete the file and then it won't be available for uploads any more.
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
// race situation.
// See https://github.com/neondatabase/neon/issues/4526
pausable_failpoint!("flush-frozen-pausable");
// This failpoint is used by another test case `test_pageserver_recovery`.
fail_point!("flush-frozen-exit");
fail_point!("checkpoint-after-sync");
// Update the metadata file, with new 'disk_consistent_lsn'
//
@@ -2866,12 +2813,11 @@ impl Timeline {
Ok(())
}
// Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
// in layer map immediately. The caller is responsible to put it into the layer map.
// Write out the given frozen in-memory layer as a new L0 delta file
async fn create_delta_layer(
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
) -> anyhow::Result<DeltaLayer> {
) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
let span = tracing::info_span!("blocking");
let new_delta: DeltaLayer = tokio::task::spawn_blocking({
let _g = span.entered();
@@ -2908,8 +2854,25 @@ impl Timeline {
})
.await
.context("spawn_blocking")??;
let new_delta_name = new_delta.filename();
let sz = new_delta.desc.file_size;
Ok(new_delta)
// Add it to the layer map
let l = Arc::new(new_delta);
let mut guard = self.layers.write().await;
l.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
guard.track_new_l0_delta_layer(l);
// update metrics
self.metrics.resident_physical_size_gauge.add(sz);
self.metrics.num_persistent_files_created.inc_by(1);
self.metrics.persistent_bytes_written.inc_by(sz);
Ok((new_delta_name, LayerFileMetadata::new(sz)))
}
async fn repartition(
@@ -3161,7 +3124,7 @@ impl Timeline {
#[derive(Default)]
struct CompactLevel0Phase1Result {
new_layers: Vec<Arc<DeltaLayer>>,
new_layers: Vec<DeltaLayer>,
deltas_to_compact: Vec<Arc<PersistentLayerDesc>>,
}
@@ -3310,8 +3273,6 @@ impl Timeline {
/// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
/// start of level0 files compaction, the on-demand download should be revisited as well.
///
/// [`compact_inner`]: Self::compact_inner
fn compact_level0_phase1(
self: Arc<Self>,
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
@@ -3339,37 +3300,6 @@ impl Timeline {
return Ok(CompactLevel0Phase1Result::default());
}
// This failpoint is used together with `test_duplicate_layers` integration test.
// It returns the compaction result exactly the same layers as input to compaction.
// We want to ensure that this will not cause any problem when updating the layer map
// after the compaction is finished.
//
// Currently, there are two rare edge cases that will cause duplicated layers being
// inserted.
// 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
// is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
// map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
// point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
// and this causes an overwrite. This is acceptable because the content is the same, and we should do a
// layer replace instead of the normal remove / upload process.
// 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
// size length. Compaction will likely create the same set of n files afterwards.
//
// This failpoint is a superset of both of the cases.
fail_point!("compact-level0-phase1-return-same", |_| {
println!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
Ok(CompactLevel0Phase1Result {
new_layers: level0_deltas
.iter()
.map(|x| x.clone().downcast_delta_layer().unwrap())
.collect(),
deltas_to_compact: level0_deltas
.iter()
.map(|x| x.layer_desc().clone().into())
.collect(),
})
});
// Gather the files to compact in this iteration.
//
// Start with the oldest Level 0 delta file, and collect any other
@@ -3628,9 +3558,7 @@ impl Timeline {
|| contains_hole
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(Arc::new(
writer.take().unwrap().finish(prev_key.unwrap().next())?,
));
new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
writer = None;
if contains_hole {
@@ -3668,7 +3596,7 @@ impl Timeline {
prev_key = Some(key);
}
if let Some(writer) = writer {
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
new_layers.push(writer.finish(prev_key.unwrap().next())?);
}
// Sync layers
@@ -3764,7 +3692,7 @@ impl Timeline {
}
// Before deleting any layers, we need to wait for their upload ops to finish.
// See remote_timeline_client module level comment on consistency.
// See storage_sync module level comment on consistency.
// Do it here because we don't want to hold self.layers.write() while waiting.
if let Some(remote_client) = &self.remote_client {
debug!("waiting for upload ops to complete");
@@ -3777,11 +3705,6 @@ impl Timeline {
let mut guard = self.layers.write().await;
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
// In some rare cases, we may generate a file with exactly the same key range / LSN as before the compaction.
// We should move to numbering the layer files instead of naming them using key range / LSN some day. But for
// now, we just skip the file to avoid unintentional modification to files on the disk and in the layer map.
let mut duplicated_layers = HashSet::new();
let mut insert_layers = Vec::new();
let mut remove_layers = Vec::new();
@@ -3808,33 +3731,21 @@ impl Timeline {
.add(metadata.len());
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
l.access_stats().record_residence_event(
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
x.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
let l = l as Arc<dyn PersistentLayer>;
if guard.contains(&l) {
duplicated_layers.insert(l.layer_desc().key());
} else {
if LayerMap::is_l0(l.layer_desc()) {
return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
}
insert_layers.push(l);
}
insert_layers.push(x);
}
// Now that we have reshuffled the data to set of new delta layers, we can
// delete the old ones
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
for ldesc in deltas_to_compact {
if duplicated_layers.contains(&ldesc.key()) {
// skip duplicated layers, they will not be removed; we have already overwritten them
// with new layers in the compaction phase 1.
continue;
}
layer_names_to_delete.push(ldesc.filename());
remove_layers.push(guard.get_from_desc(&ldesc));
for l in deltas_to_compact {
layer_names_to_delete.push(l.filename());
remove_layers.push(guard.get_from_desc(&l));
}
guard.finish_compact_l0(
@@ -4593,7 +4504,6 @@ impl LocalLayerInfoForDiskUsageEviction {
}
impl Timeline {
/// Returns non-remote layers for eviction.
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
let guard = self.layers.read().await;
let layers = guard.layer_map();
@@ -4763,179 +4673,3 @@ pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
left == right
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use utils::{id::TimelineId, lsn::Lsn};
use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
use super::{EvictionError, Timeline};
#[tokio::test]
async fn two_layer_eviction_attempts_at_the_same_time() {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let ctx = any_context();
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let rc = timeline
.remote_client
.clone()
.expect("just configured this");
let layer = find_some_layer(&timeline).await;
let cancel = tokio_util::sync::CancellationToken::new();
let batch = [layer];
let first = {
let cancel = cancel.clone();
async {
timeline
.evict_layer_batch(&rc, &batch, cancel)
.await
.unwrap()
}
};
let second = async {
timeline
.evict_layer_batch(&rc, &batch, cancel)
.await
.unwrap()
};
let (first, second) = tokio::join!(first, second);
let (first, second) = (only_one(first), only_one(second));
match (first, second) {
(Ok(()), Err(EvictionError::FileNotFound))
| (Err(EvictionError::FileNotFound), Ok(())) => {
// one of the evictions gets to do it,
// other one gets FileNotFound. all is good.
}
other => unreachable!("unexpected {:?}", other),
}
}
#[tokio::test]
async fn layer_eviction_aba_fails() {
let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let ctx = any_context();
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let _e = tracing::info_span!("foobar", tenant_id = %tenant.tenant_id, timeline_id = %timeline.timeline_id).entered();
let rc = timeline.remote_client.clone().unwrap();
// TenantHarness allows uploads to happen given GenericRemoteStorage is configured
let layer = find_some_layer(&timeline).await;
let cancel = tokio_util::sync::CancellationToken::new();
let batch = [layer];
let first = {
let cancel = cancel.clone();
async {
timeline
.evict_layer_batch(&rc, &batch, cancel)
.await
.unwrap()
}
};
// lets imagine this is stuck somehow, still referencing the original `Arc<dyn PersistentLayer>`
let second = {
let cancel = cancel.clone();
async {
timeline
.evict_layer_batch(&rc, &batch, cancel)
.await
.unwrap()
}
};
// while it's stuck, we evict and end up redownloading it
only_one(first.await).expect("eviction succeeded");
let layer = find_some_layer(&timeline).await;
let layer = layer.downcast_remote_layer().unwrap();
timeline.download_remote_layer(layer).await.unwrap();
let res = only_one(second.await);
assert!(
matches!(res, Err(EvictionError::LayerNotFound(_))),
"{res:?}"
);
// no more specific asserting, outside of preconds this is the only valid replacement
// failure
}
fn any_context() -> crate::context::RequestContext {
use crate::context::*;
use crate::task_mgr::*;
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
fn only_one<T>(mut input: Vec<Option<T>>) -> T {
assert_eq!(1, input.len());
input
.pop()
.expect("length just checked")
.expect("no cancellation")
}
async fn find_some_layer(timeline: &Timeline) -> Arc<dyn PersistentLayer> {
let layers = timeline.layers.read().await;
let desc = layers
.layer_map()
.iter_historic_layers()
.next()
.expect("must find one layer to evict");
layers.get_from_desc(&desc)
}
}

View File

@@ -30,7 +30,6 @@ use crate::{
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
storage_layer::PersistentLayer,
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
},
};
@@ -101,11 +100,11 @@ impl Timeline {
match cf {
ControlFlow::Break(()) => break,
ControlFlow::Continue(sleep_until) => {
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
.await
.is_ok()
{
break;
tokio::select! {
_ = cancel.cancelled() => {
break;
}
_ = tokio::time::sleep_until(sleep_until) => { }
}
}
}
@@ -271,22 +270,20 @@ impl Timeline {
None => {
stats.skipped_for_shutdown += 1;
}
Some(Ok(())) => {
Some(Ok(true)) => {
debug!("evicted layer {l:?}");
stats.evicted += 1;
}
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
Some(Ok(false)) => {
debug!("layer is not evictable: {l:?}");
stats.not_evictable += 1;
}
Some(Err(EvictionError::FileNotFound)) => {
// compaction/gc removed the file while we were waiting on layer_removal_cs
stats.not_evictable += 1;
}
Some(Err(
e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
)) => {
let e = utils::error::report_compact_sources(&e);
warn!(layer = %l, "failed to evict layer: {e}");
stats.not_evictable += 1;
Some(Err(e)) => {
// This variant is the case where an unexpected error happened during eviction.
// Expected errors that result in non-eviction are `Some(Ok(false))`.
// So, dump Debug here to gather as much info as possible in this rare case.
warn!("failed to evict layer {l:?}: {e:?}");
stats.errors += 1;
}
}
}

View File

@@ -194,23 +194,10 @@ impl LayerManager {
updates.flush();
}
/// Flush a frozen layer and add the written delta layer to the layer map.
pub fn finish_flush_l0_layer(
&mut self,
delta_layer: Option<DeltaLayer>,
frozen_layer_for_check: &Arc<InMemoryLayer>,
) {
let l = self.layer_map.frozen_layers.pop_front();
/// Insert into the layer map when a new delta layer is created, called from `create_delta_layer`.
pub fn track_new_l0_delta_layer(&mut self, delta_layer: Arc<DeltaLayer>) {
let mut updates = self.layer_map.batch_update();
// Only one thread may call this function at a time (for this
// timeline). If two threads tried to flush the same frozen
// layer to disk at the same time, that would not work.
assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
if let Some(delta_layer) = delta_layer {
Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
}
Self::insert_historic_layer(delta_layer, &mut updates, &mut self.layer_fmgr);
updates.flush();
}
@@ -308,10 +295,6 @@ impl LayerManager {
Ok(())
}
pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
self.layer_fmgr.contains(layer)
}
}
pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
@@ -336,10 +319,6 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
}
}
pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
self.0.contains_key(&layer.layer_desc().key())
}
pub(crate) fn new() -> Self {
Self(HashMap::new())
}

View File

@@ -14,7 +14,10 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
&*crate::tenant::span::TENANT_ID_EXTRACTOR,
&*TIMELINE_ID_EXTRACTOR,
];
if let Err(missing) = check_fields_present!(fields) {
panic!("missing extractors: {missing:?}")
if let Err(missing) = check_fields_present(fields) {
panic!(
"missing extractors: {:?}",
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
)
}
}

View File

@@ -6,7 +6,7 @@
//! Current connection state is tracked too, to ensure it's not getting stale.
//!
//! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
//! then a (re)connection happens, if necessary.
//! then a [re]connection happens, if necessary.
//! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};

View File

@@ -1,4 +1,4 @@
comment = '** Deprecated ** Please use pg_embedding instead'
comment = 'hnsw index'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true

View File

@@ -4,6 +4,7 @@
MODULE_big = neon
OBJS = \
$(WIN32RES) \
extension_server.o \
file_cache.o \
libpagestore.o \
libpqwalproposer.o \

View File

@@ -0,0 +1,104 @@
/*-------------------------------------------------------------------------
*
* extension_server.c
* Request compute_ctl to download extension files.
*
* IDENTIFICATION
* contrib/neon/extension_server.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tcop/pquery.h"
#include "tcop/utility.h"
#include "access/xact.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "commands/defrem.h"
#include "miscadmin.h"
#include "utils/acl.h"
#include "fmgr.h"
#include "utils/guc.h"
#include "port.h"
#include "fmgr.h"
#include <curl/curl.h>
static int extension_server_port = 0;
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
// to download all SQL (and data) files for an extension:
// curl -X POST http://localhost:8080/extension_server/postgis
// it covers two possible extension files layouts:
// 1. extension_name--version--platform.sql
// 2. extension_name/extension_name--version.sql
// extension_name/extra_files.csv
//
// to download specific library file:
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
static bool
neon_download_extension_file_http(const char *filename, bool is_library)
{
CURL *curl;
CURLcode res;
char *compute_ctl_url;
char *postdata;
bool ret = false;
if ((curl = curl_easy_init()) == NULL)
{
elog(ERROR, "Failed to initialize curl handle");
}
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
extension_server_port, filename, is_library ? "?is_library=true" : "");
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
// NOTE: 15L may be insufficient time for large extensions like postgis
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L /* seconds */);
if (curl)
{
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if (res == CURLE_OK)
{
ret = true;
}
else
{
// Don't error here because postgres will try to find the file
// and will fail with some proper error message if it's not found.
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
}
/* always cleanup */
curl_easy_cleanup(curl);
}
return ret;
}
void pg_init_extension_server()
{
// Port to connect to compute_ctl on localhost
// to request extension files.
DefineCustomIntVariable("neon.extension_server_port",
"connection string to the compute_ctl",
NULL,
&extension_server_port,
0, 0, INT_MAX,
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
// set download_extension_file_hook
prev_download_extension_file_hook = download_extension_file_hook;
download_extension_file_hook = neon_download_extension_file_http;
}

View File

@@ -0,0 +1 @@

View File

@@ -35,8 +35,11 @@ _PG_init(void)
{
pg_init_libpagestore();
pg_init_walproposer();
InitControlPlaneConnector();
pg_init_extension_server();
// Important: This must happen after other parts of the extension
// are loaded, otherwise any settings to GUCs that were set before
// the extension was loaded will be removed.

View File

@@ -21,6 +21,8 @@ extern char *neon_tenant;
extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);
extern void pg_init_extension_server(void);
/*
* Returns true if we shouldn't do REDO on that block in record indicated by
* block_id; false otherwise.

406
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -5,6 +5,7 @@ use proxy::http;
use proxy::metrics;
use anyhow::bail;
use clap::{self, Arg};
use proxy::config::{self, ProxyConfig};
use std::pin::pin;
use std::{borrow::Cow, net::SocketAddr};
@@ -17,70 +18,6 @@ use utils::{project_git_version, sentry_init::init_sentry};
project_git_version!(GIT_VERSION);
use clap::{Parser, ValueEnum};
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackend {
Console,
Postgres,
Link,
}
/// Neon proxy/router
#[derive(Parser)]
#[command(version = GIT_VERSION, about)]
struct ProxyCliArgs {
/// listen for incoming client connections on ip:port
#[clap(short, long, default_value = "127.0.0.1:4432")]
proxy: String,
#[clap(value_enum, long, default_value_t = AuthBackend::Link)]
auth_backend: AuthBackend,
/// listen for management callback connection on ip:port
#[clap(short, long, default_value = "127.0.0.1:7000")]
mgmt: String,
/// listen for incoming http connections (metrics, etc) on ip:port
#[clap(long, default_value = "127.0.0.1:7001")]
http: String,
/// listen for incoming wss connections on ip:port
#[clap(long)]
wss: Option<String>,
/// redirect unauthenticated users to the given uri in case of link auth
#[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
uri: String,
/// cloud API endpoint for authenticating users
#[clap(
short,
long,
default_value = "http://localhost:3000/authenticate_proxy_request/"
)]
auth_endpoint: String,
/// path to TLS key for client postgres connections
///
/// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
#[clap(short = 'k', long, alias = "ssl-key")]
tls_key: Option<String>,
/// path to TLS cert for client postgres connections
///
/// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
#[clap(short = 'c', long, alias = "ssl-cert")]
tls_cert: Option<String>,
/// path to directory with TLS certificates for client postgres connections
#[clap(long)]
certs_dir: Option<String>,
/// http endpoint to receive periodic metric updates
#[clap(long)]
metric_collection_endpoint: Option<String>,
/// how often metrics should be sent to a collection endpoint
#[clap(long)]
metric_collection_interval: Option<String>,
/// cache for `wake_compute` api method (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
wake_compute_cache: String,
/// Allow self-signed certificates for compute nodes (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
allow_self_signed_compute: bool,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let _logging_guard = proxy::logging::init().await?;
@@ -90,21 +27,21 @@ async fn main() -> anyhow::Result<()> {
info!("Version: {GIT_VERSION}");
::metrics::set_build_info_metric(GIT_VERSION);
let args = ProxyCliArgs::parse();
let args = cli().get_matches();
let config = build_config(&args)?;
info!("Authentication backend: {}", config.auth_backend);
// Check that we can bind to address before further initialization
let http_address: SocketAddr = args.http.parse()?;
let http_address: SocketAddr = args.get_one::<String>("http").unwrap().parse()?;
info!("Starting http on {http_address}");
let http_listener = TcpListener::bind(http_address).await?.into_std()?;
let mgmt_address: SocketAddr = args.mgmt.parse()?;
let mgmt_address: SocketAddr = args.get_one::<String>("mgmt").unwrap().parse()?;
info!("Starting mgmt on {mgmt_address}");
let mgmt_listener = TcpListener::bind(mgmt_address).await?;
let proxy_address: SocketAddr = args.proxy.parse()?;
let proxy_address: SocketAddr = args.get_one::<String>("proxy").unwrap().parse()?;
info!("Starting proxy on {proxy_address}");
let proxy_listener = TcpListener::bind(proxy_address).await?;
let cancellation_token = CancellationToken::new();
@@ -118,7 +55,7 @@ async fn main() -> anyhow::Result<()> {
cancellation_token.clone(),
));
if let Some(wss_address) = args.wss {
if let Some(wss_address) = args.get_one::<String>("wss") {
let wss_address: SocketAddr = wss_address.parse()?;
info!("Starting wss on {wss_address}");
let wss_listener = TcpListener::bind(wss_address).await?;
@@ -165,24 +102,31 @@ async fn main() -> anyhow::Result<()> {
}
/// ProxyConfig is created at proxy startup, and lives forever.
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let tls_config = match (&args.tls_key, &args.tls_cert) {
fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> {
let tls_config = match (
args.get_one::<String>("tls-key"),
args.get_one::<String>("tls-cert"),
) {
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
key_path,
cert_path,
args.certs_dir.as_ref(),
args.get_one::<String>("certs-dir"),
)?),
(None, None) => None,
_ => bail!("either both or neither tls-key and tls-cert must be specified"),
};
if args.allow_self_signed_compute {
let allow_self_signed_compute: bool = args
.get_one::<String>("allow-self-signed-compute")
.unwrap()
.parse()?;
if allow_self_signed_compute {
warn!("allowing self-signed compute certificates");
}
let metric_collection = match (
&args.metric_collection_endpoint,
&args.metric_collection_interval,
args.get_one::<String>("metric-collection-endpoint"),
args.get_one::<String>("metric-collection-interval"),
) {
(Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
endpoint: endpoint.parse()?,
@@ -195,38 +139,145 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
),
};
let auth_backend = match &args.auth_backend {
AuthBackend::Console => {
let config::CacheOptions { size, ttl } = args.wake_compute_cache.parse()?;
let auth_backend = match args.get_one::<String>("auth-backend").unwrap().as_str() {
"console" => {
let config::CacheOptions { size, ttl } = args
.get_one::<String>("wake-compute-cache")
.unwrap()
.parse()?;
info!("Using NodeInfoCache (wake_compute) with size={size} ttl={ttl:?}");
let caches = Box::leak(Box::new(console::caches::ApiCaches {
node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
}));
let url = args.auth_endpoint.parse()?;
let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
let endpoint = http::Endpoint::new(url, http::new_client());
let api = console::provider::neon::Api::new(endpoint, caches);
auth::BackendType::Console(Cow::Owned(api), ())
}
AuthBackend::Postgres => {
let url = args.auth_endpoint.parse()?;
"postgres" => {
let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
let api = console::provider::mock::Api::new(url);
auth::BackendType::Postgres(Cow::Owned(api), ())
}
AuthBackend::Link => {
let url = args.uri.parse()?;
"link" => {
let url = args.get_one::<String>("uri").unwrap().parse()?;
auth::BackendType::Link(Cow::Owned(url))
}
other => bail!("unsupported auth backend: {other}"),
};
let config = Box::leak(Box::new(ProxyConfig {
tls_config,
auth_backend,
metric_collection,
allow_self_signed_compute: args.allow_self_signed_compute,
allow_self_signed_compute,
}));
Ok(config)
}
fn cli() -> clap::Command {
clap::Command::new("Neon proxy/router")
.disable_help_flag(true)
.version(GIT_VERSION)
.arg(
Arg::new("proxy")
.short('p')
.long("proxy")
.help("listen for incoming client connections on ip:port")
.default_value("127.0.0.1:4432"),
)
.arg(
Arg::new("auth-backend")
.long("auth-backend")
.value_parser(["console", "postgres", "link"])
.default_value("link"),
)
.arg(
Arg::new("mgmt")
.short('m')
.long("mgmt")
.help("listen for management callback connection on ip:port")
.default_value("127.0.0.1:7000"),
)
.arg(
Arg::new("http")
.long("http")
.help("listen for incoming http connections (metrics, etc) on ip:port")
.default_value("127.0.0.1:7001"),
)
.arg(
Arg::new("wss")
.long("wss")
.help("listen for incoming wss connections on ip:port"),
)
.arg(
Arg::new("uri")
.short('u')
.long("uri")
.help("redirect unauthenticated users to the given uri in case of link auth")
.default_value("http://localhost:3000/psql_session/"),
)
.arg(
Arg::new("auth-endpoint")
.short('a')
.long("auth-endpoint")
.help("cloud API endpoint for authenticating users")
.default_value("http://localhost:3000/authenticate_proxy_request/"),
)
.arg(
Arg::new("tls-key")
.short('k')
.long("tls-key")
.alias("ssl-key") // backwards compatibility
.help("path to TLS key for client postgres connections"),
)
.arg(
Arg::new("tls-cert")
.short('c')
.long("tls-cert")
.alias("ssl-cert") // backwards compatibility
.help("path to TLS cert for client postgres connections"),
)
// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
.arg(
Arg::new("certs-dir")
.long("certs-dir")
.help("path to directory with TLS certificates for client postgres connections"),
)
.arg(
Arg::new("metric-collection-endpoint")
.long("metric-collection-endpoint")
.help("http endpoint to receive periodic metric updates"),
)
.arg(
Arg::new("metric-collection-interval")
.long("metric-collection-interval")
.help("how often metrics should be sent to a collection endpoint"),
)
.arg(
Arg::new("wake-compute-cache")
.long("wake-compute-cache")
.help("cache for `wake_compute` api method (use `size=0` to disable)")
.default_value(config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO),
)
.arg(
Arg::new("allow-self-signed-compute")
.long("allow-self-signed-compute")
.help("Allow self-signed certificates for compute nodes (for testing)")
.default_value("false"),
)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn verify_cli() {
cli().debug_assert();
}
}

View File

@@ -262,21 +262,24 @@ pub mod timed_lru {
token: Option<(C, C::LookupInfo<C::Key>)>,
/// The value itself.
value: C::Value,
pub value: C::Value,
}
impl<C: Cache> Cached<C> {
/// Place any entry into this wrapper; invalidation will be a no-op.
pub fn new_uncached(value: C::Value) -> Self {
Self { token: None, value }
/// Unfortunately, rust doesn't let us implement [`From`] or [`Into`].
pub fn new_uncached(value: impl Into<C::Value>) -> Self {
Self {
token: None,
value: value.into(),
}
}
/// Drop this entry from a cache if it's still there.
pub fn invalidate(self) -> C::Value {
pub fn invalidate(&self) {
if let Some((cache, info)) = &self.token {
cache.invalidate(info);
}
self.value
}
/// Tell if this entry is actually cached.

View File

@@ -110,7 +110,7 @@ impl<'a> Session<'a> {
impl Session<'_> {
/// Store the cancel token for the given session.
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
/// This enables query cancellation in [`crate::proxy::handshake`].
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
info!("enabling query cancellation for this session");
self.cancel_map

View File

@@ -1,9 +1,4 @@
use crate::{
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::errors::WakeComputeError,
error::{io_error, UserFacingError},
};
use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
@@ -18,7 +13,7 @@ const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
#[derive(Debug, Error)]
pub enum ConnectionError {
/// This error doesn't seem to reveal any secrets; for instance,
/// `tokio_postgres::error::Kind` doesn't contain ip addresses and such.
/// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such.
#[error("{COULD_NOT_CONNECT}: {0}")]
Postgres(#[from] tokio_postgres::Error),
@@ -29,12 +24,6 @@ pub enum ConnectionError {
TlsError(#[from] native_tls::Error),
}
impl From<WakeComputeError> for ConnectionError {
fn from(value: WakeComputeError) -> Self {
io_error(value).into()
}
}
impl UserFacingError for ConnectionError {
fn to_string_client(&self) -> String {
use ConnectionError::*;

View File

@@ -211,7 +211,7 @@ pub struct CacheOptions {
}
impl CacheOptions {
/// Default options for [`crate::console::provider::NodeInfoCache`].
/// Default options for [`crate::auth::caches::NodeInfoCache`].
pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m";
/// Parse cache options passed via cmdline.

View File

@@ -186,18 +186,18 @@ pub trait Api {
async fn get_auth_info(
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
creds: &ClientCredentials<'_>,
) -> Result<Option<AuthInfo>, errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
creds: &ClientCredentials<'_>,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
/// Various caches for [`console`](super).
/// Various caches for [`console`].
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub node_info: NodeInfoCache,

View File

@@ -106,7 +106,7 @@ impl super::Api for Api {
async fn get_auth_info(
&self,
_extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
creds: &ClientCredentials<'_>,
) -> Result<Option<AuthInfo>, GetAuthInfoError> {
self.do_get_auth_info(creds).await
}
@@ -115,7 +115,7 @@ impl super::Api for Api {
async fn wake_compute(
&self,
_extra: &ConsoleReqExtra<'_>,
_creds: &ClientCredentials,
_creds: &ClientCredentials<'_>,
) -> Result<CachedNodeInfo, WakeComputeError> {
self.do_wake_compute()
.map_ok(CachedNodeInfo::new_uncached)

Some files were not shown because too many files have changed in this diff Show More