Merge remote-tracking branch 'origin/main' into arpad/lsn_by_ts

This commit is contained in:
Arpad Müller
2023-11-11 02:24:51 +01:00
93 changed files with 2097 additions and 1251 deletions

View File

@@ -22,5 +22,11 @@ platforms = [
# "x86_64-pc-windows-msvc",
]
[final-excludes]
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
workspace-members = ["vm_monitor"]
# Write out exact versions rather than a semver range. (Defaults to false.)
# exact-versions = true

View File

@@ -17,8 +17,9 @@ assignees: ''
## Implementation ideas
## Tasks
- [ ]
```[tasklist]
### Tasks
```
## Other related tasks and Epics

View File

@@ -1,5 +1,7 @@
self-hosted-runner:
labels:
- arm64
- dev
- gen3
- large
- small

View File

@@ -172,10 +172,10 @@ jobs:
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check
run: cargo deny check --hide-inclusion-graph
build-neon:
needs: [ check-permissions ]
needs: [ check-permissions, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,6 +187,7 @@ jobs:
env:
BUILD_TYPE: ${{ matrix.build_type }}
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: Fix git ownership
@@ -723,6 +724,7 @@ jobs:
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}

View File

@@ -21,7 +21,10 @@ env:
jobs:
check-macos-build:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
timeout-minutes: 90
runs-on: macos-latest
@@ -112,8 +115,182 @@ jobs:
- name: Check that no warnings are produced
run: ./run_clippy.sh
check-linux-arm-build:
timeout-minutes: 90
runs-on: [ self-hosted, dev, arm64 ]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
CARGO_FEATURES: --features testing
CARGO_FLAGS: --locked --release
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Set env variables
run: |
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
- name: Build walproposer-lib
run: mold -run make walproposer-lib -j$(nproc)
- name: Run cargo build
run: |
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run cargo test
run: |
cargo test $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
timeout-minutes: 90
runs-on: [ self-hosted, dev, arm64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
echo "No clippy args found in .neon_clippy_args"
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check
gather-rust-build-stats:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned

632
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -36,6 +36,7 @@ license = "Apache-2.0"
## All dependency versions, used in the project
[workspace.dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
azure_core = "0.16"
azure_identity = "0.16"
@@ -47,6 +48,7 @@ async-trait = "0.1"
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.29"
aws-smithy-http = "0.56"
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
aws-credential-types = "0.56"
aws-types = "0.56"
axum = { version = "0.6.20", features = ["ws"] }
@@ -65,7 +67,7 @@ comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-utils = "0.8.5"
dashmap = "5.5.0"
dashmap = { version = "5.5.0", features = ["raw-api"] }
either = "1.8"
enum-map = "2.4.2"
enumset = "1.0.12"
@@ -81,7 +83,7 @@ hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.3.1"
http-types = "2"
http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"

View File

@@ -27,6 +27,7 @@ RUN set -e \
FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot
ARG GIT_VERSION=local
ARG BUILD_TAG
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/

View File

@@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib
#
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
+@echo "Configuring Postgres $* build"
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; }
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \

View File

@@ -710,8 +710,12 @@ impl ComputeNode {
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access.
#[instrument(skip_all)]
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
client.simple_query("SELECT pg_reload_conf()")?;
fn pg_reload_conf(&self) -> Result<()> {
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
Command::new(pgctl_bin)
.args(["reload", "-D", &self.pgdata])
.output()
.expect("cannot run pg_ctl process");
Ok(())
}
@@ -724,9 +728,9 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.pgdata);
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
self.pg_reload_conf()?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
self.pg_reload_conf(&mut client)?;
// Proceed with post-startup configuration. Note, that order of operations is important.
// Disable DDL forwarding because control plane already knows about these roles/databases.

View File

@@ -78,7 +78,7 @@ use regex::Regex;
use remote_storage::*;
use serde_json;
use std::io::Read;
use std::num::{NonZeroU32, NonZeroUsize};
use std::num::NonZeroUsize;
use std::path::Path;
use std::str;
use tar::Archive;
@@ -281,8 +281,6 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
max_keys_per_list_response: None,
};
let config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
storage: RemoteStorageKind::AwsS3(config),
};
GenericRemoteStorage::from_config(&config)

View File

@@ -1,7 +1,7 @@
//!
//! Various tools and helpers to handle cluster / compute node (Postgres)
//! configuration.
//!
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod checker;
pub mod config;
pub mod configurator;

View File

@@ -262,7 +262,7 @@ where
P: Into<Utf8PathBuf>,
{
let path: Utf8PathBuf = path.into();
// SAFETY
// SAFETY:
// pre_exec is marked unsafe because it runs between fork and exec.
// Why is that dangerous in various ways?
// Long answer: https://github.com/rust-lang/rust/issues/39575

View File

@@ -1,11 +1,10 @@
//
// Local control plane.
//
// Can start, configure and stop postgres instances running as a local processes.
//
// Intended to be used in integration tests and in CLI tools for
// local installations.
//
//! Local control plane.
//!
//! Can start, configure and stop postgres instances running as a local processes.
//!
//! Intended to be used in integration tests and in CLI tools for
//! local installations.
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod attachment_service;
mod background_process;

View File

@@ -74,10 +74,30 @@ highlight = "all"
workspace-default-features = "allow"
external-default-features = "allow"
allow = []
deny = []
skip = []
skip-tree = []
[[bans.deny]]
# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
name = "async-std"
[[bans.deny]]
name = "async-io"
[[bans.deny]]
name = "async-waker"
[[bans.deny]]
name = "async-global-executor"
[[bans.deny]]
name = "async-executor"
[[bans.deny]]
name = "smol"
# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html

View File

@@ -1,3 +1,5 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod requests;
pub mod responses;
pub mod spec;

View File

@@ -1,6 +1,6 @@
//!
//! Shared code for consumption metics collection
//!
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use chrono::{DateTime, Utc};
use rand::Rng;
use serde::{Deserialize, Serialize};

View File

@@ -2,6 +2,7 @@
//! make sure that we use the same dep version everywhere.
//! Otherwise, we might not see all metrics registered via
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use once_cell::sync::Lazy;
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;

View File

@@ -1,3 +1,5 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp;
/// Public API types

View File

@@ -2,6 +2,8 @@
//! To use, create PostgresBackend and run() it, passing the Handler
//! implementation determining how to process the queries. Currently its API
//! is rather narrow, but we can extend it once required.
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::Context;
use bytes::Bytes;
use futures::pin_mut;
@@ -15,7 +17,7 @@ use std::{fmt, io};
use std::{future::Future, str::FromStr};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tracing::{debug, error, info, trace};
use tracing::{debug, error, info, trace, warn};
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
use pq_proto::{
@@ -33,6 +35,11 @@ pub enum QueryError {
/// We were instructed to shutdown while processing the query
#[error("Shutting down")]
Shutdown,
/// Authentication failure
#[error("Unauthorized: {0}")]
Unauthorized(std::borrow::Cow<'static, str>),
#[error("Simulated Connection Error")]
SimulatedConnectionError,
/// Some other error
#[error(transparent)]
Other(#[from] anyhow::Error),
@@ -47,8 +54,9 @@ impl From<io::Error> for QueryError {
impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self {
Self::Disconnected(_) => b"08006", // connection failure
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
}
}
@@ -608,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
self.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
&short_error(&e),
Some(e.pg_error_code()),
))?;
return Err(e);
@@ -730,6 +738,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
if let Err(e) = handler.process_query(self, query_string).await {
match e {
QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
QueryError::SimulatedConnectionError => {
return Err(QueryError::SimulatedConnectionError)
}
e => {
log_query_error(query_string, &e);
let short_error = short_error(&e);
@@ -964,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String {
match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Shutdown => "shutdown".to_string(),
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
QueryError::Other(e) => format!("{e:#}"),
}
}
@@ -980,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
QueryError::Disconnected(other_connection_error) => {
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
}
QueryError::SimulatedConnectionError => {
error!("query handler for query '{query}' failed due to a simulated connection error")
}
QueryError::Shutdown => {
info!("query handler for '{query}' cancelled during tenant shutdown")
}
QueryError::Unauthorized(e) => {
warn!("query handler for '{query}' failed with authentication error: {e}");
}
QueryError::Other(e) => {
error!("query handler for '{query}' failed: {e:?}");
}

View File

@@ -1,3 +1,5 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::{bail, Context};
use itertools::Itertools;
use std::borrow::Cow;

View File

@@ -8,6 +8,7 @@
// modules included with the postgres_ffi macro depend on the types of the specific version's
// types, and trigger a too eager lint.
#![allow(clippy::duplicate_mod)]
#![deny(clippy::undocumented_unsafe_blocks)]
use bytes::Bytes;
use utils::bin_ser::SerializeError;
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
pub mod bindings {
// bindgen generates bindings for a lot of stuff we don't need
#![allow(dead_code)]
#![allow(clippy::undocumented_unsafe_blocks)]
use serde::{Deserialize, Serialize};
include!(concat!(

View File

@@ -1,6 +1,7 @@
//! Postgres protocol messages serialization-deserialization. See
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
//! on message formats.
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod framed;

View File

@@ -8,6 +8,7 @@ license.workspace = true
anyhow.workspace = true
async-trait.workspace = true
once_cell.workspace = true
aws-smithy-async.workspace = true
aws-smithy-http.workspace = true
aws-types.workspace = true
aws-config.workspace = true

View File

@@ -6,19 +6,15 @@
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
//! * [`azure_blob`] allows to use Azure Blob storage as an external storage
//!
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
mod azure_blob;
mod local_fs;
mod s3_bucket;
mod simulate_failures;
use std::{
collections::HashMap,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
pin::Pin,
sync::Arc,
};
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf};
@@ -34,12 +30,6 @@ pub use self::{
};
use s3_bucket::RequestKind;
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
/// ~200 RPS for IAM services
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -441,10 +431,6 @@ pub struct StorageMetadata(HashMap<String, String>);
/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
pub max_concurrent_syncs: NonZeroUsize,
/// Max allowed errors before the sync task is considered failed and evicted.
pub max_sync_errors: NonZeroU32,
/// The storage connection configuration.
pub storage: RemoteStorageKind,
}
@@ -540,18 +526,6 @@ impl RemoteStorageConfig {
let use_azure = container_name.is_some() && container_region.is_some();
let max_concurrent_syncs = NonZeroUsize::new(
parse_optional_integer("max_concurrent_syncs", toml)?
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
)
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
let max_sync_errors = NonZeroU32::new(
parse_optional_integer("max_sync_errors", toml)?
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
)
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
let default_concurrency_limit = if use_azure {
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
} else {
@@ -633,11 +607,7 @@ impl RemoteStorageConfig {
}
};
Ok(Some(RemoteStorageConfig {
max_concurrent_syncs,
max_sync_errors,
storage,
}))
Ok(Some(RemoteStorageConfig { storage }))
}
}

View File

@@ -4,23 +4,27 @@
//! allowing multiple api users to independently work with the same S3 bucket, if
//! their bucket prefixes are both specified and different.
use std::borrow::Cow;
use std::{borrow::Cow, sync::Arc};
use anyhow::Context;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
meta::credentials::CredentialsProviderChain,
provider_config::ProviderConfig,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
};
use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{
config::{Config, Region},
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
primitives::ByteStream,
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_http::body::SdkBody;
use hyper::Body;
use scopeguard::ScopeGuard;
@@ -83,10 +87,23 @@ impl S3Bucket {
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
let mut retry_config = RetryConfigBuilder::new();
retry_config
.set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive));
let mut config_builder = Config::builder()
.region(region)
.credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider);
.credentials_provider(credentials_provider)
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
.retry_config(retry_config.build());
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet;
use std::env;
use std::num::{NonZeroU32, NonZeroUsize};
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::path::PathBuf;
use std::sync::Arc;
@@ -469,8 +469,6 @@ fn create_azure_client(
let random = rand::thread_rng().gen::<u32>();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AzureContainer(AzureConfig {
container_name: remote_storage_azure_container,
container_region: remote_storage_azure_region,

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet;
use std::env;
use std::num::{NonZeroU32, NonZeroUsize};
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::path::PathBuf;
use std::sync::Arc;
@@ -396,8 +396,6 @@ fn create_s3_client(
let random = rand::thread_rng().gen::<u32>();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region,

View File

@@ -1,3 +1,5 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp;
/// Public API types

View File

@@ -1,4 +1,6 @@
//! Synthetic size calculation
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
mod calculation;
pub mod svg;

View File

@@ -32,6 +32,8 @@
//! .init();
//! }
//! ```
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use opentelemetry::sdk::Resource;
use opentelemetry::KeyValue;

View File

@@ -5,6 +5,7 @@ edition.workspace = true
license.workspace = true
[dependencies]
arc-swap.workspace = true
sentry.workspace = true
async-trait.workspace = true
anyhow.workspace = true

View File

@@ -1,7 +1,8 @@
// For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde;
use std::fs;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
use anyhow::Result;
use camino::Utf8Path;
@@ -10,7 +11,7 @@ use jsonwebtoken::{
};
use serde::{Deserialize, Serialize};
use crate::id::TenantId;
use crate::{http::error::ApiError, id::TenantId};
/// Algorithm to use. We require EdDSA.
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -44,31 +45,106 @@ impl Claims {
}
}
pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
impl SwappableJwtAuth {
pub fn new(jwt_auth: JwtAuth) -> Self {
SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
}
pub fn swap(&self, jwt_auth: JwtAuth) {
self.0.swap(Arc::new(jwt_auth));
}
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
self.0.load().decode(token)
}
}
impl std::fmt::Debug for SwappableJwtAuth {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Swappable({:?})", self.0.load())
}
}
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub struct AuthError(pub Cow<'static, str>);
impl Display for AuthError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl From<AuthError> for ApiError {
fn from(_value: AuthError) -> Self {
// Don't pass on the value of the AuthError as a precautionary measure.
// Being intentionally vague in public error communication hurts debugability
// but it is more secure.
ApiError::Forbidden("JWT authentication error".to_string())
}
}
pub struct JwtAuth {
decoding_key: DecodingKey,
decoding_keys: Vec<DecodingKey>,
validation: Validation,
}
impl JwtAuth {
pub fn new(decoding_key: DecodingKey) -> Self {
pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
let mut validation = Validation::default();
validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
// The default 'required_spec_claims' is 'exp'. But we don't want to require
// expiration.
validation.required_spec_claims = [].into();
Self {
decoding_key,
decoding_keys,
validation,
}
}
pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
let public_key = fs::read(key_path)?;
Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
let metadata = key_path.metadata()?;
let decoding_keys = if metadata.is_dir() {
let mut keys = Vec::new();
for entry in fs::read_dir(key_path)? {
let path = entry?.path();
if !path.is_file() {
// Ignore directories (don't recurse)
continue;
}
let public_key = fs::read(path)?;
keys.push(DecodingKey::from_ed_pem(&public_key)?);
}
keys
} else if metadata.is_file() {
let public_key = fs::read(key_path)?;
vec![DecodingKey::from_ed_pem(&public_key)?]
} else {
anyhow::bail!("path is neither a directory or a file")
};
if decoding_keys.is_empty() {
anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
}
Ok(Self::new(decoding_keys))
}
pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
Ok(decode(token, &self.decoding_key, &self.validation)?)
/// Attempt to decode the token with the internal decoding keys.
///
/// The function tries the stored decoding keys in succession,
/// and returns the first yielding a successful result.
/// If there is no working decoding key, it returns the last error.
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
let mut res = None;
for decoding_key in &self.decoding_keys {
res = Some(decode(token, decoding_key, &self.validation));
if let Some(Ok(res)) = res {
return Ok(res);
}
}
if let Some(res) = res {
res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
} else {
Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
}
}
}
@@ -108,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
"#;
#[test]
fn test_decode() -> Result<(), anyhow::Error> {
fn test_decode() {
let expected_claims = Claims {
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
scope: Scope::Tenant,
};
@@ -129,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
// Check it can be validated with the public key
let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
let claims_from_token = auth.decode(encoded_eddsa)?.claims;
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
assert_eq!(claims_from_token, expected_claims);
Ok(())
}
#[test]
fn test_encode() -> Result<(), anyhow::Error> {
fn test_encode() {
let claims = Claims {
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
scope: Scope::Tenant,
};
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
// decode it back
let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
let decoded = auth.decode(&encoded)?;
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
let decoded = auth.decode(&encoded).unwrap();
assert_eq!(decoded.claims, claims);
Ok(())
}
}

View File

@@ -1,4 +1,4 @@
use crate::auth::{Claims, JwtAuth};
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use anyhow::Context;
use hyper::header::{HeaderName, AUTHORIZATION};
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
}
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
if let Some(auth) = provide_auth(&req) {
@@ -400,9 +400,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
})?;
let token = parse_token(header_value)?;
let data = auth
.decode(token)
.map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
let data = auth.decode(token).map_err(|err| {
warn!("Authentication error: {err}");
// Rely on From<AuthError> for ApiError impl
err
})?;
req.set_context(data.claims);
}
None => {
@@ -450,12 +452,11 @@ where
pub fn check_permission_with(
req: &Request<Body>,
check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
) -> Result<(), ApiError> {
match req.context::<Claims>() {
Some(claims) => {
Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
}
Some(claims) => Ok(check_permission(&claims)
.map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
None => Ok(()), // claims is None because auth is disabled
}
}

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::error::Error as StdError;
use thiserror::Error;
use tracing::{error, info};
use tracing::{error, info, warn};
#[derive(Debug, Error)]
pub enum ApiError {
@@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
// Print a stack trace for Internal Server errors
match api_error {
ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
warn!("Error processing HTTP request: {api_error:#}")
}
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),

View File

@@ -120,6 +120,8 @@ impl Id {
chunk[0] = HEX[((b >> 4) & 0xf) as usize];
chunk[1] = HEX[(b & 0xf) as usize];
}
// SAFETY: vec constructed out of `HEX`, it can only be ascii
unsafe { String::from_utf8_unchecked(buf) }
}
}

View File

@@ -1,5 +1,6 @@
//! `utils` is intended to be a place to put code that is shared
//! between other crates in this repository.
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod backoff;

View File

@@ -125,6 +125,9 @@ where
// Wake everyone with an error.
let mut internal = self.internal.lock().unwrap();
// Block any future waiters from starting
internal.shutdown = true;
// This will steal the entire waiters map.
// When we drop it all waiters will be woken.
mem::take(&mut internal.waiters)

View File

@@ -1,6 +1,7 @@
/// Immediately terminate the calling process without calling
/// atexit callbacks, C runtime destructors etc. We mainly use
/// this to protect coverage data from concurrent writes.
pub fn exit_now(code: u8) {
pub fn exit_now(code: u8) -> ! {
// SAFETY: exiting is safe, the ffi is not safe
unsafe { nix::libc::_exit(code as _) };
}

View File

@@ -85,6 +85,13 @@ impl Gate {
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
}
/// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
/// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
/// the CancellationToken on such types is analogous to "Did shutdown start?"
pub fn close_complete(&self) -> bool {
self.sem.is_closed()
}
async fn do_close(&self) {
tracing::debug!(gate = self.name, "Closing Gate...");
match self.sem.acquire_many(Self::MAX_UNITS).await {

View File

@@ -19,13 +19,12 @@ inotify.workspace = true
serde.workspace = true
serde_json.workspace = true
sysinfo.workspace = true
tokio.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[target.'cfg(target_os = "linux")'.dependencies]
cgroups-rs = "0.3.3"

View File

@@ -1,3 +1,5 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
#![cfg(target_os = "linux")]
use anyhow::Context;

View File

@@ -188,6 +188,7 @@ extern "C" fn recovery_download(
}
}
#[allow(clippy::unnecessary_cast)]
extern "C" fn wal_read(
sk: *mut Safekeeper,
buf: *mut ::std::os::raw::c_char,
@@ -421,6 +422,7 @@ impl std::fmt::Display for Level {
}
/// Take ownership of `Vec<u8>` from StringInfoData.
#[allow(clippy::unnecessary_cast)]
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
if pg.data.is_null() {
return None;

View File

@@ -186,7 +186,7 @@ impl Wrapper {
.unwrap()
.into_bytes_with_nul();
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

View File

@@ -1,22 +1,21 @@
use anyhow::{bail, Result};
use utils::auth::{Claims, Scope};
use utils::auth::{AuthError, Claims, Scope};
use utils::id::TenantId;
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
match (&claims.scope, tenant_id) {
(Scope::Tenant, None) => {
bail!("Attempt to access management api with tenant scope. Permission denied")
}
(Scope::Tenant, None) => Err(AuthError(
"Attempt to access management api with tenant scope. Permission denied".into(),
)),
(Scope::Tenant, Some(tenant_id)) => {
if claims.tenant_id.unwrap() != tenant_id {
bail!("Tenant id mismatch. Permission denied")
return Err(AuthError("Tenant id mismatch. Permission denied".into()));
}
Ok(())
}
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::SafekeeperData, _) => {
bail!("SafekeeperData scope makes no sense for Pageserver")
}
(Scope::SafekeeperData, _) => Err(AuthError(
"SafekeeperData scope makes no sense for Pageserver".into(),
)),
}
}

View File

@@ -34,8 +34,11 @@ use postgres_backend::AuthType;
use utils::logging::TracingErrorLayerEnablement;
use utils::signals::ShutdownSignals;
use utils::{
auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
signals::Signal, tcp_listener,
auth::{JwtAuth, SwappableJwtAuth},
logging, project_build_tag, project_git_version,
sentry_init::init_sentry,
signals::Signal,
tcp_listener,
};
project_git_version!(GIT_VERSION);
@@ -321,13 +324,12 @@ fn start_pageserver(
let http_auth;
let pg_auth;
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
// unwrap is ok because check is performed when creating config, so path is set and file exists
// unwrap is ok because check is performed when creating config, so path is set and exists
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
info!(
"Loading public key for verifying JWT tokens from {:#?}",
key_path
);
let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
let jwt_auth = JwtAuth::from_key_path(key_path)?;
let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));
http_auth = match &conf.http_auth_type {
AuthType::Trust => None,
@@ -410,7 +412,7 @@ fn start_pageserver(
// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
TenantSharedResources {
broker_client: broker_client.clone(),
@@ -420,6 +422,7 @@ fn start_pageserver(
order,
shutdown_pageserver.clone(),
))?;
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx;
@@ -548,6 +551,7 @@ fn start_pageserver(
let router_state = Arc::new(
http::routes::State::new(
conf,
tenant_manager,
http_auth.clone(),
remote_storage.clone(),
broker_client.clone(),

View File

@@ -161,7 +161,7 @@ pub struct PageServerConf {
pub http_auth_type: AuthType,
/// authentication method for libpq connections from compute
pub pg_auth_type: AuthType,
/// Path to a file containing public key for verifying JWT tokens.
/// Path to a file or directory containing public key(s) for verifying JWT tokens.
/// Used for both mgmt and compute auth, if enabled.
pub auth_validation_public_key_path: Option<Utf8PathBuf>,
@@ -1314,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!(
parsed_remote_storage_config,
RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
)
.unwrap(),
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
.unwrap(),
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
},
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1380,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!(
parsed_remote_storage_config,
RemoteStorageConfig {
max_concurrent_syncs,
max_sync_errors,
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: bucket_name.clone(),
bucket_region: bucket_region.clone(),

View File

@@ -893,14 +893,6 @@ mod test {
std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

View File

@@ -55,21 +55,24 @@ impl Deleter {
/// Wrap the remote `delete_objects` with a failpoint
async fn remote_delete(&self) -> Result<(), anyhow::Error> {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint hit"))
});
// A backoff::retry is used here for two reasons:
// - To provide a backoff rather than busy-polling the API on errors
// - To absorb transient 429/503 conditions without hitting our error
// logging path for issues deleting objects.
backoff::retry(
|| async { self.remote_storage.delete_objects(&self.accumulator).await },
|| async {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
});
self.remote_storage.delete_objects(&self.accumulator).await
},
|_| false,
3,
10,

View File

@@ -52,6 +52,31 @@ paths:
schema:
type: object
/v1/reload_auth_validation_keys:
post:
description: Reloads the JWT public keys from their pre-configured location on disk.
responses:
"200":
description: The reload completed successfully.
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error (also hits if no keys were found)
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}:
parameters:
- name: tenant_id
@@ -327,7 +352,8 @@ paths:
in: query
required: true
schema:
type: integer
type: string
format: hex
description: A LSN to get the timestamp
responses:
"200":

View File

@@ -20,6 +20,7 @@ use remote_storage::GenericRemoteStorage;
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -35,8 +36,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
TenantSlotUpsertError, TenantStateError,
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -45,7 +46,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use utils::{
auth::JwtAuth,
auth::SwappableJwtAuth,
generation::Generation,
http::{
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -63,7 +64,8 @@ use super::models::ConfigureFailpointsRequest;
pub struct State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
@@ -74,7 +76,8 @@ pub struct State {
impl State {
pub fn new(
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -86,6 +89,7 @@ impl State {
.collect::<Vec<_>>();
Ok(Self {
conf,
tenant_manager,
auth,
allowlist_routes,
remote_storage,
@@ -299,11 +303,7 @@ async fn build_timeline_info(
// we're executing this function, we will outlive the timeline on-disk state.
info.current_logical_size_non_incremental = Some(
timeline
.get_current_logical_size_non_incremental(
info.last_record_lsn,
CancellationToken::new(),
ctx,
)
.get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
.await?,
);
}
@@ -389,6 +389,32 @@ async fn status_handler(
json_response(StatusCode::OK, StatusResponse { id: config.id })
}
async fn reload_auth_validation_keys_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let config = get_config(&request);
let state = get_state(&request);
let Some(shared_auth) = &state.auth else {
return json_response(StatusCode::BAD_REQUEST, ());
};
// unwrap is ok because check is performed when creating config, so path is set and exists
let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
match JwtAuth::from_key_path(key_path) {
Ok(new_auth) => {
shared_auth.swap(new_auth);
json_response(StatusCode::OK, ())
}
Err(e) => {
warn!("Error reloading public keys from {key_path:?}: {e:}");
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
}
}
}
async fn timeline_create_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
@@ -1140,20 +1166,14 @@ async fn put_tenant_location_config_handler(
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
mgr::upsert_location(
state.conf,
tenant_id,
location_conf,
state.broker_client.clone(),
state.remote_storage.clone(),
state.deletion_queue_client.clone(),
&ctx,
)
.await
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
// principle we might have hit something like concurrent API calls to the same tenant,
// which is not a 400 but a 409.
.map_err(ApiError::BadRequest)?;
state
.tenant_manager
.upsert_location(tenant_id, location_conf, &ctx)
.await
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
// principle we might have hit something like concurrent API calls to the same tenant,
// which is not a 400 but a 409.
.map_err(ApiError::BadRequest)?;
json_response(StatusCode::OK, ())
}
@@ -1458,7 +1478,7 @@ async fn timeline_collect_keyspace(
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
.map_err(|e| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
}
@@ -1647,6 +1667,8 @@ where
);
match handle.await {
// TODO: never actually return Err from here, always Ok(...) so that we can log
// spanned errors. Call api_error_handler instead and return appropriate Body.
Ok(result) => result,
Err(e) => {
// The handler task panicked. We have a global panic handler that logs the
@@ -1695,7 +1717,7 @@ where
pub fn make_router(
state: Arc<State>,
launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1724,6 +1746,9 @@ pub fn make_router(
.put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler)
})
.post("/v1/reload_auth_validation_keys", |r| {
api_handler(r, reload_auth_validation_keys_handler)
})
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))

View File

@@ -1,3 +1,5 @@
#![deny(clippy::undocumented_unsafe_blocks)]
mod auth;
pub mod basebackup;
pub mod config;

View File

@@ -1225,15 +1225,6 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_records_histogram",
@@ -1261,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
pub(crate) struct WalRedoProcessCounters {
pub(crate) started: IntCounter,
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
}
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
pub(crate) enum WalRedoKillCause {
WalRedoProcessDrop,
NoLeakChildDrop,
Startup,
}
impl Default for WalRedoProcessCounters {
fn default() -> Self {
let started = register_int_counter!(
"pageserver_wal_redo_process_started_total",
"Number of WAL redo processes started",
)
.unwrap();
let killed = register_int_counter_vec!(
"pageserver_wal_redo_process_stopped_total",
"Number of WAL redo processes stopped",
&["cause"],
)
.unwrap();
Self {
started,
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
let cause_str: &'static str = cause.into();
killed.with_label_values(&[cause_str])
})),
}
}
}
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default);
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
@@ -1928,7 +1959,6 @@ pub fn preinitialize_metrics() {
&READ_NUM_FS_LAYERS,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_WAIT_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM,
]

View File

@@ -14,6 +14,7 @@ use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use bytes::Bytes;
use futures::Stream;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -39,7 +40,7 @@ use tracing::field;
use tracing::*;
use utils::id::ConnectionId;
use utils::{
auth::{Claims, JwtAuth, Scope},
auth::{Claims, Scope, SwappableJwtAuth},
id::{TenantId, TimelineId},
lsn::Lsn,
simple_rcu::RcuReadGuard,
@@ -121,7 +122,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
pub async fn libpq_listener_main(
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
listener: TcpListener,
auth_type: AuthType,
listener_ctx: RequestContext,
@@ -189,7 +190,7 @@ pub async fn libpq_listener_main(
async fn page_service_conn_main(
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
connection_ctx: RequestContext,
@@ -217,9 +218,27 @@ async fn page_service_conn_main(
// no write timeout is used, because the kernel is assumed to error writes after some time.
let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
// timeout should be lower, but trying out multiple days for
// <https://github.com/neondatabase/neon/issues/4205>
socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
let socket_timeout_ms = (|| {
fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
// Exponential distribution for simulating
// poor network conditions, expect about avg_timeout_ms to be around 15
// in tests
if let Some(avg_timeout_ms) = avg_timeout_ms {
let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
let u = rand::random::<f32>();
((1.0 - u).ln() / (-avg)) as u64
} else {
default_timeout_ms
}
});
default_timeout_ms
})();
// A timeout here does not mean the client died, it can happen if it's just idle for
// a while: we will tear down this PageServerHandler and instantiate a new one if/when
// they reconnect.
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
let socket = std::pin::pin!(socket);
// XXX: pgbackend.run() should take the connection_ctx,
@@ -252,7 +271,7 @@ async fn page_service_conn_main(
struct PageServerHandler {
_conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
/// The context created for the lifetime of the connection
@@ -266,7 +285,7 @@ impl PageServerHandler {
pub fn new(
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
connection_ctx: RequestContext,
) -> Self {
PageServerHandler {
@@ -493,7 +512,11 @@ impl PageServerHandler {
};
if let Err(e) = &response {
if timeline.cancel.is_cancelled() {
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
// because wait_lsn etc will drop out
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
// is_canceled(): [`Timeline::shutdown`]` has entered
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
@@ -897,7 +920,7 @@ impl PageServerHandler {
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
if self.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok
return Ok(());
@@ -909,7 +932,7 @@ impl PageServerHandler {
.claims
.as_ref()
.expect("claims presence already checked");
check_permission(claims, tenant_id)
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
@@ -948,16 +971,17 @@ where
.auth
.as_ref()
.unwrap()
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
.map_err(|e| QueryError::Unauthorized(e.0))?;
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing"
)));
return Err(QueryError::Unauthorized(
"jwt token scope is Tenant, but tenant id is missing".into(),
));
}
info!(
"jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
debug!(
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
data.claims.scope, data.claims.tenant_id,
);
@@ -979,9 +1003,13 @@ where
pgb: &mut PostgresBackend<IO>,
query_string: &str,
) -> Result<(), QueryError> {
fail::fail_point!("simulated-bad-compute-connection", |_| {
info!("Hit failpoint for bad connection");
Err(QueryError::SimulatedConnectionError)
});
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -1330,6 +1358,9 @@ impl From<GetActiveTenantError> for QueryError {
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
),
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
QueryError::Shutdown
}
e => QueryError::Other(anyhow::anyhow!(e)),
}
}

View File

@@ -21,8 +21,8 @@ use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::exp_counter::ExpCounter;
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -34,18 +34,30 @@ pub enum LsnForTimestamp {
/// Found commits both before and after the given timestamp
Present(Lsn),
/// Found no commits after the given timestamp, this means
/// that the newest data in the branch is older than the given
/// timestamp.
///
/// All commits <= LSN happened before the given timestamp
Future(Lsn),
/// The queried timestamp is past our horizon we look back at (PITR)
///
/// All commits > LSN happened after the given timestamp,
/// but any commits older than the returned LSN
/// might have happened before or after the given timestamp.
/// We don't know because no data before the given lsn is available
/// but any commits < LSN might have happened before or after
/// the given timestamp. We don't know because no data before
/// the given lsn is available.
Past(Lsn),
/// We have found no commit with a timestamp,
/// so we can't return anything meaningful.
/// The associated LSN is a lower bound value.
///
/// The associated LSN is the lower bound value we can safely
/// create branches on, but no statement is made if it is
/// older or newer than the timestamp.
///
/// This variant can e.g. be returned right after a
/// cluster import.
NoData(Lsn),
}
@@ -57,6 +69,25 @@ pub enum CalculateLogicalSizeError {
Other(#[from] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum CollectKeySpaceError {
#[error(transparent)]
Decode(#[from] DeserializeError),
#[error(transparent)]
PageRead(PageReconstructError),
#[error("cancelled")]
Cancelled,
}
impl From<PageReconstructError> for CollectKeySpaceError {
fn from(err: PageReconstructError) -> Self {
match err {
PageReconstructError::Cancelled => Self::Cancelled,
err => Self::PageRead(err),
}
}
}
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
@@ -407,6 +438,11 @@ impl Timeline {
}
// If `found_smaller == true`, `low` is the LSN of the last commit record
// before or at `search_timestamp`
//
// FIXME: it would be better to get the LSN of the previous commit.
// Otherwise, if you restore to the returned LSN, the database will
// include physical changes from later commits that will be marked
// as aborted, and will need to be vacuumed away.
let commit_lsn = Lsn(low * 8);
match (found_smaller, found_larger) {
(false, false) => {
@@ -419,7 +455,7 @@ impl Timeline {
Ok(LsnForTimestamp::Past(min_lsn))
}
(true, false) => {
// Only found a commit with timestamp smaller than the request.
// Only found commits with timestamps smaller than the request.
// It's still a valid case for branch creation, return it.
// And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
// case, anyway.
@@ -599,7 +635,6 @@ impl Timeline {
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -610,12 +645,8 @@ impl Timeline {
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self
.list_rels(*spcnode, *dbnode, lsn, ctx)
.await
.context("list rels")?
{
if cancel.is_cancelled() {
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
if self.cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled);
}
let relsize_key = rel_size_to_key(rel);
@@ -632,11 +663,11 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from
/// that LSN forwards).
pub async fn collect_keyspace(
pub(crate) async fn collect_keyspace(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<KeySpace> {
) -> Result<KeySpace, CollectKeySpaceError> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -645,7 +676,7 @@ impl Timeline {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
let dbdir = DbDirectory::des(&buf)?;
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
dbs.sort_unstable();
@@ -678,7 +709,7 @@ impl Timeline {
let slrudir_key = slru_dir_to_key(kind);
result.add_key(slrudir_key);
let buf = self.get(slrudir_key, lsn, ctx).await?;
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
let dir = SlruSegmentDirectory::des(&buf)?;
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
segments.sort_unstable();
for segno in segments {
@@ -696,7 +727,7 @@ impl Timeline {
// Then pg_twophase
result.add_key(TWOPHASEDIR_KEY);
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
xids.sort_unstable();
for xid in xids {

View File

@@ -1841,7 +1841,13 @@ impl Tenant {
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let span = Span::current();
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
js.spawn(async move {
if freeze_and_flush {
timeline.flush_and_shutdown().instrument(span).await
} else {
timeline.shutdown().instrument(span).await
}
});
})
};
tracing::info!("Waiting for timelines...");
@@ -3528,10 +3534,6 @@ pub(crate) mod harness {
let remote_fs_dir = conf.workdir.join("localfs");
std::fs::create_dir_all(&remote_fs_dir).unwrap();
let config = RemoteStorageConfig {
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
@@ -4731,7 +4733,7 @@ mod tests {
// Keeps uninit mark in place
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown(false)
.shutdown()
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
.await;
std::mem::forget(tline);

View File

@@ -327,7 +327,7 @@ mod tests {
let mut sz: u16 = rng.gen();
// Make 50% of the arrays small
if rng.gen() {
sz |= 63;
sz &= 63;
}
random_array(sz.into())
})

View File

@@ -200,6 +200,22 @@ async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
Ok(())
}
/// The TenantManager is responsible for storing and mutating the collection of all tenants
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
/// lives inside the TenantManager.
///
/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
/// the same tenant twice concurrently, or trying to configure the same tenant into secondary
/// and attached modes concurrently.
pub struct TenantManager {
conf: &'static PageServerConf,
// TODO: currently this is a &'static pointing to TENANTs. When we finish refactoring
// out of that static variable, the TenantManager can own this.
// See https://github.com/neondatabase/neon/issues/5796
tenants: &'static std::sync::RwLock<TenantsMap>,
resources: TenantSharedResources,
}
fn emergency_generations(
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantId, Generation> {
@@ -366,7 +382,7 @@ pub async fn init_tenant_mgr(
resources: TenantSharedResources,
init_order: InitializationOrder,
cancel: CancellationToken,
) -> anyhow::Result<()> {
) -> anyhow::Result<TenantManager> {
let mut tenants = HashMap::new();
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
@@ -468,7 +484,12 @@ pub async fn init_tenant_mgr(
assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
METRICS.tenant_slots.set(tenants.len() as u64);
*tenants_map = TenantsMap::Open(tenants);
Ok(())
Ok(TenantManager {
conf,
tenants: &TENANTS,
resources,
})
}
/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
@@ -545,8 +566,10 @@ pub(crate) async fn shutdown_all_tenants() {
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
use utils::completion;
// Atomically, 1. extract the list of tenants to shut down and 2. prevent creation of new tenants.
let (in_progress_ops, tenants_to_shut_down) = {
let mut join_set = JoinSet::new();
// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
let (total_in_progress, total_attached) = {
let mut m = tenants.write().unwrap();
match &mut *m {
TenantsMap::Initializing => {
@@ -556,78 +579,67 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
}
TenantsMap::Open(tenants) => {
let mut shutdown_state = HashMap::new();
let mut in_progress_ops = Vec::new();
let mut tenants_to_shut_down = Vec::new();
let mut total_in_progress = 0;
let mut total_attached = 0;
for (k, v) in tenants.drain() {
for (tenant_id, v) in tenants.drain() {
match v {
TenantSlot::Attached(t) => {
tenants_to_shut_down.push(t.clone());
shutdown_state.insert(k, TenantSlot::Attached(t));
shutdown_state.insert(tenant_id, TenantSlot::Attached(t.clone()));
join_set.spawn(
async move {
let freeze_and_flush = true;
let res = {
let (_guard, shutdown_progress) = completion::channel();
t.shutdown(shutdown_progress, freeze_and_flush).await
};
if let Err(other_progress) = res {
// join the another shutdown in progress
other_progress.wait().await;
}
// we cannot afford per tenant logging here, because if s3 is degraded, we are
// going to log too many lines
debug!("tenant successfully stopped");
}
.instrument(info_span!("shutdown", %tenant_id)),
);
total_attached += 1;
}
TenantSlot::Secondary => {
shutdown_state.insert(k, TenantSlot::Secondary);
shutdown_state.insert(tenant_id, TenantSlot::Secondary);
}
TenantSlot::InProgress(notify) => {
// InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
// wait for their notifications to fire in this function.
in_progress_ops.push(notify);
join_set.spawn(async move {
notify.wait().await;
});
total_in_progress += 1;
}
}
}
*m = TenantsMap::ShuttingDown(shutdown_state);
(in_progress_ops, tenants_to_shut_down)
(total_in_progress, total_attached)
}
TenantsMap::ShuttingDown(_) => {
// TODO: it is possible that detach and shutdown happen at the same time. as a
// result, during shutdown we do not wait for detach.
error!("already shutting down, this function isn't supposed to be called more than once");
return;
}
}
};
let started_at = std::time::Instant::now();
info!(
"Waiting for {} InProgress tenants and {} Attached tenants to shut down",
in_progress_ops.len(),
tenants_to_shut_down.len()
total_in_progress, total_attached
);
for barrier in in_progress_ops {
barrier.wait().await;
}
info!(
"InProgress tenants shut down, waiting for {} Attached tenants to shut down",
tenants_to_shut_down.len()
);
let started_at = std::time::Instant::now();
let mut join_set = JoinSet::new();
for tenant in tenants_to_shut_down {
let tenant_id = tenant.get_tenant_id();
join_set.spawn(
async move {
let freeze_and_flush = true;
let res = {
let (_guard, shutdown_progress) = completion::channel();
tenant.shutdown(shutdown_progress, freeze_and_flush).await
};
if let Err(other_progress) = res {
// join the another shutdown in progress
other_progress.wait().await;
}
// we cannot afford per tenant logging here, because if s3 is degraded, we are
// going to log too many lines
debug!("tenant successfully stopped");
}
.instrument(info_span!("shutdown", %tenant_id)),
);
}
let total = join_set.len();
let mut panicked = 0;
let mut buffering = true;
@@ -640,7 +652,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
match joined {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
unreachable!("we are not cancelling any of the tasks");
}
Err(join_error) if join_error.is_panic() => {
// cannot really do anything, as this panic is likely a bug
@@ -742,139 +754,134 @@ pub(crate) async fn set_new_tenant_config(
Ok(())
}
#[instrument(skip_all, fields(%tenant_id))]
pub(crate) async fn upsert_location(
conf: &'static PageServerConf,
tenant_id: TenantId,
new_location_config: LocationConf,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), anyhow::Error> {
info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
impl TenantManager {
#[instrument(skip_all, fields(%tenant_id))]
pub(crate) async fn upsert_location(
&self,
tenant_id: TenantId,
new_location_config: LocationConf,
ctx: &RequestContext,
) -> Result<(), anyhow::Error> {
info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
// then we do not need to set the slot to InProgress, we can just call into the
// existng tenant.
{
let locked = TENANTS.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
match (&new_location_config.mode, peek_slot) {
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
if attach_conf.generation == tenant.generation {
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(AttachedTenantConf::try_from(
new_location_config,
)?);
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
// then we do not need to set the slot to InProgress, we can just call into the
// existng tenant.
{
let locked = self.tenants.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
match (&new_location_config.mode, peek_slot) {
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
if attach_conf.generation == tenant.generation {
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(AttachedTenantConf::try_from(
new_location_config,
)?);
// Persist the new config in the background, to avoid holding up any
// locks while we do so.
// TODO
// Persist the new config in the background, to avoid holding up any
// locks while we do so.
// TODO
return Ok(());
} else {
// Different generations, fall through to general case
return Ok(());
} else {
// Different generations, fall through to general case
}
}
_ => {
// Not an Attached->Attached transition, fall through to general case
}
}
_ => {
// Not an Attached->Attached transition, fall through to general case
}
}
}
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
// InProgress value to the slot while we make whatever changes are required. The state for
// the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
// the state is ill-defined while we're in transition. Transitions are async, but fast: we do
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
// InProgress value to the slot while we make whatever changes are required. The state for
// the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
// the state is ill-defined while we're in transition. Transitions are async, but fast: we do
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;
if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
deletion_queue_client.flush_advisory()
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
AttachmentMode::Stale => {
// If we're stale there's not point trying to flush deletions
}
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
}
AttachmentMode::Stale => {
// If we're stale there's not point trying to flush deletions
slot_guard.drop_old_value().expect("We just shut it down");
}
let tenant_path = self.conf.tenant_path(&tenant_id);
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => {
let tenant_path = self.conf.tenant_path(&tenant_id);
// Directory doesn't need to be fsync'd because if we crash it can
// safely be recreated next time this tenant location is configured.
unsafe_create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {tenant_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
TenantSlot::Secondary
}
LocationMode::Attached(_attach_config) => {
let timelines_path = self.conf.timelines_path(&tenant_id);
// Directory doesn't need to be fsync'd because we do not depend on
// it to exist after crashes: it may be recreated when tenant is
// re-attached, see https://github.com/neondatabase/neon/issues/5550
unsafe_create_dir_all(&timelines_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let tenant = tenant_spawn(
self.conf,
tenant_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(new_location_config)?,
None,
self.tenants,
SpawnMode::Normal,
ctx,
)?;
TenantSlot::Attached(tenant)
}
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
}
slot_guard.drop_old_value().expect("We just shut it down");
slot_guard.upsert(new_slot)?;
Ok(())
}
let tenant_path = conf.tenant_path(&tenant_id);
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => {
let tenant_path = conf.tenant_path(&tenant_id);
// Directory doesn't need to be fsync'd because if we crash it can
// safely be recreated next time this tenant location is configured.
unsafe_create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {tenant_path}"))?;
Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
TenantSlot::Secondary
}
LocationMode::Attached(_attach_config) => {
let timelines_path = conf.timelines_path(&tenant_id);
// Directory doesn't need to be fsync'd because we do not depend on
// it to exist after crashes: it may be recreated when tenant is
// re-attached, see https://github.com/neondatabase/neon/issues/5550
unsafe_create_dir_all(&timelines_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;
Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let tenant = tenant_spawn(
conf,
tenant_id,
&tenant_path,
TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client,
},
AttachedTenantConf::try_from(new_location_config)?,
None,
&TENANTS,
SpawnMode::Normal,
ctx,
)?;
TenantSlot::Attached(tenant)
}
};
slot_guard.upsert(new_slot)?;
Ok(())
}
#[derive(Debug, thiserror::Error)]
@@ -1430,9 +1437,6 @@ pub struct SlotGuard {
_completion: utils::completion::Completion,
}
unsafe impl Send for SlotGuard {}
unsafe impl Sync for SlotGuard {}
impl SlotGuard {
fn new(
tenant_id: TenantId,
@@ -1539,14 +1543,7 @@ impl SlotGuard {
/// is responsible for protecting
fn old_value_is_shutdown(&self) -> bool {
match self.old_value.as_ref() {
Some(TenantSlot::Attached(tenant)) => {
// TODO: PR #5711 will add a gate that enables properly checking that
// shutdown completed.
matches!(
tenant.current_state(),
TenantState::Stopping { .. } | TenantState::Broken { .. }
)
}
Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
Some(TenantSlot::Secondary) => {
// TODO: when adding secondary mode tenants, this will check for shutdown
// in the same way that we do for `Tenant` above

View File

@@ -6,7 +6,6 @@ use std::sync::Arc;
use anyhow::{bail, Context};
use tokio::sync::oneshot::error::RecvError;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
@@ -350,10 +349,6 @@ async fn fill_logical_sizes(
// our advantage with `?` error handling.
let mut joinset = tokio::task::JoinSet::new();
let cancel = tokio_util::sync::CancellationToken::new();
// be sure to cancel all spawned tasks if we are dropped
let _dg = cancel.clone().drop_guard();
// For each point that would benefit from having a logical size available,
// spawn a Task to fetch it, unless we have it cached already.
for seg in segments.iter() {
@@ -371,15 +366,8 @@ async fn fill_logical_sizes(
let parallel_size_calcs = Arc::clone(limit);
let ctx = ctx.attached_child();
joinset.spawn(
calculate_logical_size(
parallel_size_calcs,
timeline,
lsn,
cause,
ctx,
cancel.child_token(),
)
.in_current_span(),
calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
.in_current_span(),
);
}
e.insert(cached_size);
@@ -487,14 +475,13 @@ async fn calculate_logical_size(
lsn: utils::lsn::Lsn,
cause: LogicalSizeCalculationCause,
ctx: RequestContext,
cancel: CancellationToken,
) -> Result<TimelineAtLsnSizeResult, RecvError> {
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
.await
.expect("global semaphore should not had been closed");
let size_res = timeline
.spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
.spawn_ondemand_logical_size_calculation(lsn, cause, ctx)
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
.await?;
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))

View File

@@ -251,6 +251,7 @@ impl Layer {
layer
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
.instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
.await
}
@@ -1211,8 +1212,10 @@ impl DownloadedLayer {
// this will be a permanent failure
.context("load layer");
if res.is_err() {
if let Err(e) = res.as_ref() {
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
// TODO(#5815): we are not logging all errors, so temporarily log them here as well
tracing::error!("layer loading failed permanently: {e:#}");
}
res
};
@@ -1291,6 +1294,7 @@ impl ResidentLayer {
}
/// Loads all keys stored in the layer. Returns key, lsn and value size.
#[tracing::instrument(skip_all, fields(layer=%self))]
pub(crate) async fn load_keys<'a>(
&'a self,
ctx: &RequestContext,

View File

@@ -36,7 +36,6 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
@@ -50,6 +49,7 @@ use crate::tenant::{
metadata::{save_metadata, TimelineMetadata},
par_fsync,
};
use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
@@ -247,7 +247,7 @@ pub struct Timeline {
/// the flush finishes. You can use that to wait for the flush to finish.
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -374,6 +374,19 @@ pub enum PageReconstructError {
WalRedo(anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
enum FlushLayerError {
/// Timeline cancellation token was cancelled
#[error("timeline shutting down")]
Cancelled,
#[error(transparent)]
PageReconstructError(#[from] PageReconstructError),
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl std::fmt::Debug for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
@@ -891,15 +904,16 @@ impl Timeline {
self.launch_eviction_task(background_jobs_can_start);
}
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// While we are flushing, we continue to accept read I/O.
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
pub(crate) async fn flush_and_shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// prevent writes to the InMemoryLayer
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
// trying to flush
tracing::debug!("Waiting for WalReceiverManager...");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
@@ -908,40 +922,70 @@ impl Timeline {
)
.await;
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
self.last_record_lsn.shutdown();
// now all writers to InMemory layer are gone, do the final flush if requested
if freeze_and_flush {
match self.freeze_and_flush().await {
Ok(()) => {}
Err(e) => {
warn!("failed to freeze and flush: {e:#}");
return; // TODO: should probably drain remote timeline client anyways?
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
if let Err(e) = client.wait_completion().await {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to flush to remote storage: {e:#}");
}
}
}
// drain the upload queue
let res = if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.wait_completion().await
} else {
Ok(())
};
if let Err(e) = res {
warn!("failed to await for frozen and flushed uploads: {e:#}");
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to freeze and flush: {e:#}");
}
}
self.shutdown().await;
}
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
/// the graceful [`Timeline::flush_and_shutdown`] function.
pub(crate) async fn shutdown(&self) {
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
// while doing so.
self.last_record_lsn.shutdown();
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(self.tenant_id),
Some(self.timeline_id),
)
.await;
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
// case our caller wants to use that for a deletion
if let Some(remote_client) = self.remote_client.as_ref() {
match remote_client.stop() {
Ok(()) => {}
Err(StopError::QueueUninitialized) => {
// Shutting down during initialization is legal
}
}
}
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;
// Finally wait until any gate-holders are complete
@@ -985,7 +1029,12 @@ impl Timeline {
reason,
backtrace: backtrace_str,
};
self.set_state(broken_state)
self.set_state(broken_state);
// Although the Broken state is not equivalent to shutdown() (shutdown will be called
// later when this tenant is detach or the process shuts down), firing the cancellation token
// here avoids the need for other tasks to watch for the Broken state explicitly.
self.cancel.cancel();
}
pub fn current_state(&self) -> TimelineState {
@@ -1741,12 +1790,8 @@ impl Timeline {
// delay will be terminated by a timeout regardless.
let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
// no extra cancellation here, because nothing really waits for this to complete compared
// to spawn_ondemand_logical_size_calculation.
let cancel = CancellationToken::new();
let calculated_size = match self_clone
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
.await
{
Ok(s) => s,
@@ -1815,7 +1860,6 @@ impl Timeline {
lsn: Lsn,
cause: LogicalSizeCalculationCause,
ctx: RequestContext,
cancel: CancellationToken,
) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
let (sender, receiver) = oneshot::channel();
let self_clone = Arc::clone(self);
@@ -1836,7 +1880,7 @@ impl Timeline {
false,
async move {
let res = self_clone
.logical_size_calculation_task(lsn, cause, &ctx, cancel)
.logical_size_calculation_task(lsn, cause, &ctx)
.await;
let _ = sender.send(res).ok();
Ok(()) // Receiver is responsible for handling errors
@@ -1852,58 +1896,28 @@ impl Timeline {
lsn: Lsn,
cause: LogicalSizeCalculationCause,
ctx: &RequestContext,
cancel: CancellationToken,
) -> Result<u64, CalculateLogicalSizeError> {
span::debug_assert_current_span_has_tenant_and_timeline_id();
let mut timeline_state_updates = self.subscribe_for_state_updates();
let _guard = self.gate.enter();
let self_calculation = Arc::clone(self);
let mut calculation = pin!(async {
let cancel = cancel.child_token();
let ctx = ctx.attached_child();
self_calculation
.calculate_logical_size(lsn, cause, cancel, &ctx)
.calculate_logical_size(lsn, cause, &ctx)
.await
});
let timeline_state_cancellation = async {
loop {
match timeline_state_updates.changed().await {
Ok(()) => {
let new_state = timeline_state_updates.borrow().clone();
match new_state {
// we're running this job for active timelines only
TimelineState::Active => continue,
TimelineState::Broken { .. }
| TimelineState::Stopping
| TimelineState::Loading => {
break format!("aborted because timeline became inactive (new state: {new_state:?})")
}
}
}
Err(_sender_dropped_error) => {
// can't happen, the sender is not dropped as long as the Timeline exists
break "aborted because state watch was dropped".to_string();
}
}
}
};
let taskmgr_shutdown_cancellation = async {
task_mgr::shutdown_watcher().await;
"aborted because task_mgr shutdown requested".to_string()
};
tokio::select! {
res = &mut calculation => { res }
reason = timeline_state_cancellation => {
debug!(reason = reason, "cancelling calculation");
cancel.cancel();
_ = self.cancel.cancelled() => {
debug!("cancelling logical size calculation for timeline shutdown");
calculation.await
}
reason = taskmgr_shutdown_cancellation => {
debug!(reason = reason, "cancelling calculation");
cancel.cancel();
_ = task_mgr::shutdown_watcher() => {
debug!("cancelling logical size calculation for task shutdown");
calculation.await
}
}
@@ -1917,7 +1931,6 @@ impl Timeline {
&self,
up_to_lsn: Lsn,
cause: LogicalSizeCalculationCause,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
info!(
@@ -1960,7 +1973,7 @@ impl Timeline {
};
let timer = storage_time_metrics.start_timer();
let logical_size = self
.get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
.get_current_logical_size_non_incremental(up_to_lsn, ctx)
.await?;
debug!("calculated logical size: {logical_size}");
timer.stop_and_record();
@@ -2373,6 +2386,10 @@ impl Timeline {
info!("started flush loop");
loop {
tokio::select! {
_ = self.cancel.cancelled() => {
info!("shutting down layer flush task");
break;
},
_ = task_mgr::shutdown_watcher() => {
info!("shutting down layer flush task");
break;
@@ -2384,6 +2401,14 @@ impl Timeline {
let timer = self.metrics.flush_time_histo.start_timer();
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
if self.cancel.is_cancelled() {
info!("dropping out of flush loop for timeline shutdown");
// Note: we do not bother transmitting into [`layer_flush_done_tx`], because
// anyone waiting on that will respect self.cancel as well: they will stop
// waiting at the same time we as drop out of this loop.
return;
}
let layer_to_flush = {
let guard = self.layers.read().await;
guard.layer_map().frozen_layers.front().cloned()
@@ -2392,9 +2417,18 @@ impl Timeline {
let Some(layer_to_flush) = layer_to_flush else {
break Ok(());
};
if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
error!("could not flush frozen layer: {err:?}");
break Err(err);
match self.flush_frozen_layer(layer_to_flush, ctx).await {
Ok(()) => {}
Err(FlushLayerError::Cancelled) => {
info!("dropping out of flush loop for timeline shutdown");
return;
}
err @ Err(
FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err;
}
}
};
// Notify any listeners that we're done
@@ -2443,7 +2477,17 @@ impl Timeline {
}
}
trace!("waiting for flush to complete");
rx.changed().await?;
tokio::select! {
rx_e = rx.changed() => {
rx_e?;
},
// Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
// the notification from [`flush_loop`] that it completed.
_ = self.cancel.cancelled() => {
tracing::info!("Cancelled layer flush due on timeline shutdown");
return Ok(())
}
};
trace!("done")
}
}
@@ -2458,7 +2502,7 @@ impl Timeline {
self: &Arc<Self>,
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), FlushLayerError> {
// As a special case, when we have just imported an image into the repository,
// instead of writing out a L0 delta layer, we directly write out image layer
// files instead. This is possible as long as *all* the data imported into the
@@ -2483,6 +2527,11 @@ impl Timeline {
let (partitioning, _lsn) = self
.repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
.await?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
// For image layers, we add them immediately into the layer map.
(
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
@@ -2514,6 +2563,10 @@ impl Timeline {
)
};
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
@@ -2523,6 +2576,10 @@ impl Timeline {
let metadata = {
let mut guard = self.layers.write().await;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
if disk_consistent_lsn != old_disk_consistent_lsn {

View File

@@ -26,6 +26,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
use crate::{
context::{DownloadBehavior, RequestContext},
pgdatadir_mapping::CollectKeySpaceError,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
@@ -326,8 +327,7 @@ impl Timeline {
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
_ => {
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
.await;
self.imitate_timeline_cached_layer_accesses(ctx).await;
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
}
}
@@ -367,21 +367,12 @@ impl Timeline {
/// Recompute the values which would cause on-demand downloads during restart.
#[instrument(skip_all)]
async fn imitate_timeline_cached_layer_accesses(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) {
async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
let lsn = self.get_last_record_lsn();
// imitiate on-restart initial logical size
let size = self
.calculate_logical_size(
lsn,
LogicalSizeCalculationCause::EvictionTaskImitation,
cancel.clone(),
ctx,
)
.calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
.instrument(info_span!("calculate_logical_size"))
.await;
@@ -407,9 +398,16 @@ impl Timeline {
if size.is_err() {
// ignore, see above comment
} else {
warn!(
"failed to collect keyspace but succeeded in calculating logical size: {e:#}"
);
match e {
CollectKeySpaceError::Cancelled => {
// Shutting down, ignore
}
err => {
warn!(
"failed to collect keyspace but succeeded in calculating logical size: {err:#}"
);
}
}
}
}
}

View File

@@ -43,8 +43,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};
use crate::config::PageServerConf;
use crate::metrics::{
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
WAL_REDO_WAIT_TIME,
WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
};
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::repository::Key;
@@ -207,11 +207,8 @@ impl PostgresRedoManager {
) -> anyhow::Result<Bytes> {
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
const MAX_RETRY_ATTEMPTS: u32 = 1;
let start_time = Instant::now();
let mut n_attempts = 0u32;
loop {
let lock_time = Instant::now();
// launch the WAL redo process on first use
let proc: Arc<WalRedoProcess> = {
let proc_guard = self.redo_process.read().unwrap();
@@ -236,7 +233,7 @@ impl PostgresRedoManager {
}
};
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
let started_at = std::time::Instant::now();
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
@@ -244,8 +241,7 @@ impl PostgresRedoManager {
.apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
.context("apply_wal_records");
let end_time = Instant::now();
let duration = end_time.duration_since(lock_time);
let duration = started_at.elapsed();
let len = records.len();
let nbytes = records.iter().fold(0, |acumulator, record| {
@@ -596,21 +592,21 @@ trait CloseFileDescriptors: CommandExt {
impl<C: CommandExt> CloseFileDescriptors for C {
fn close_fds(&mut self) -> &mut Command {
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
// which means it should be safe to execute inside a signal handler.
// The precise meaning depends on platform. See `man signal-safety`
// for the linux definition.
//
// The set_fds_cloexec_threadsafe function is documented to be
// async-signal-safe.
//
// Aside from this function, the rest of the code is re-entrant and
// doesn't make any syscalls. We're just passing constants.
//
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
// which is not async-signal-safe. Be careful.
unsafe {
self.pre_exec(move || {
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
// which means it should be safe to execute inside a signal handler.
// The precise meaning depends on platform. See `man signal-safety`
// for the linux definition.
//
// The set_fds_cloexec_threadsafe function is documented to be
// async-signal-safe.
//
// Aside from this function, the rest of the code is re-entrant and
// doesn't make any syscalls. We're just passing constants.
//
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
// which is not async-signal-safe. Be careful.
close_fds::set_fds_cloexec_threadsafe(3, &[]);
Ok(())
})
@@ -667,10 +663,10 @@ impl WalRedoProcess {
.close_fds()
.spawn_no_leak_child(tenant_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait();
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
@@ -1001,7 +997,7 @@ impl Drop for WalRedoProcess {
self.child
.take()
.expect("we only do this once")
.kill_and_wait();
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
self.stderr_logger_cancel.cancel();
// no way to wait for stderr_logger_task from Drop because that is async only
}
@@ -1037,16 +1033,19 @@ impl NoLeakChild {
})
}
fn kill_and_wait(mut self) {
fn kill_and_wait(mut self, cause: WalRedoKillCause) {
let child = match self.child.take() {
Some(child) => child,
None => return,
};
Self::kill_and_wait_impl(child);
Self::kill_and_wait_impl(child, cause);
}
#[instrument(skip_all, fields(pid=child.id()))]
fn kill_and_wait_impl(mut child: Child) {
#[instrument(skip_all, fields(pid=child.id(), ?cause))]
fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
scopeguard::defer! {
WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
}
let res = child.kill();
if let Err(e) = res {
// This branch is very unlikely because:
@@ -1091,7 +1090,7 @@ impl Drop for NoLeakChild {
// This thread here is going to outlive of our dropper.
let span = tracing::info_span!("walredo", %tenant_id);
let _entered = span.enter();
Self::kill_and_wait_impl(child);
Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
})
.await
});

View File

@@ -19,7 +19,10 @@
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "storage/buf_internals.h"
#include "storage/lwlock.h"
#include "storage/ipc.h"
#include "c.h"
#include "postmaster/interrupt.h"
#include "libpq-fe.h"
#include "libpq/pqformat.h"
@@ -61,23 +64,63 @@ int flush_every_n_requests = 8;
int n_reconnect_attempts = 0;
int max_reconnect_attempts = 60;
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
typedef struct
{
LWLockId lock;
pg_atomic_uint64 update_counter;
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
} PagestoreShmemState;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void walproposer_shmem_request(void);
#endif
static shmem_startup_hook_type prev_shmem_startup_hook;
static PagestoreShmemState *pagestore_shared;
static uint64 pagestore_local_counter = 0;
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
static bool pageserver_flush(void);
static void pageserver_disconnect(void);
static pqsigfunc prev_signal_handler;
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
}
static void
pageserver_sighup_handler(SIGNAL_ARGS)
AssignPageserverConnstring(const char *newval, void *extra)
{
if (prev_signal_handler)
{
prev_signal_handler(postgres_signal_arg);
}
neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
pageserver_disconnect();
if(!pagestore_shared)
return;
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
LWLockRelease(pagestore_shared->lock);
}
static bool
CheckConnstringUpdated()
{
if(!pagestore_shared)
return false;
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
}
static void
ReloadConnstring()
{
if(!pagestore_shared)
return;
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
LWLockRelease(pagestore_shared->lock);
}
static bool
@@ -91,6 +134,11 @@ pageserver_connect(int elevel)
Assert(!connected);
if(CheckConnstringUpdated())
{
ReloadConnstring();
}
/*
* Connect using the connection string we got from the
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -110,7 +158,7 @@ pageserver_connect(int elevel)
n++;
}
keywords[n] = "dbname";
values[n] = page_server_connstring;
values[n] = local_pageserver_connstring;
n++;
keywords[n] = NULL;
values[n] = NULL;
@@ -254,6 +302,12 @@ pageserver_send(NeonRequest * request)
{
StringInfoData req_buff;
if(CheckConnstringUpdated())
{
pageserver_disconnect();
ReloadConnstring();
}
/* If the connection was lost for some reason, reconnect */
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
{
@@ -274,6 +328,7 @@ pageserver_send(NeonRequest * request)
{
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
{
HandleMainLoopInterrupts();
n_reconnect_attempts += 1;
pg_usleep(RECONNECT_INTERVAL_USEC);
}
@@ -391,7 +446,8 @@ pageserver_flush(void)
return true;
}
page_server_api api = {
page_server_api api =
{
.send = pageserver_send,
.flush = pageserver_flush,
.receive = pageserver_receive
@@ -405,12 +461,72 @@ check_neon_id(char **newval, void **extra, GucSource source)
return **newval == '\0' || HexDecodeString(id, *newval, 16);
}
static Size
PagestoreShmemSize(void)
{
return sizeof(PagestoreShmemState);
}
static bool
PagestoreShmemInit(void)
{
bool found;
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pagestore_shared = ShmemInitStruct("libpagestore shared state",
PagestoreShmemSize(),
&found);
if(!found)
{
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
AssignPageserverConnstring(page_server_connstring, NULL);
}
LWLockRelease(AddinShmemInitLock);
return found;
}
static void
pagestore_shmem_startup_hook(void)
{
if(prev_shmem_startup_hook)
prev_shmem_startup_hook();
PagestoreShmemInit();
}
static void
pagestore_shmem_request(void)
{
#if PG_VERSION_NUM >= 150000
if(prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(PagestoreShmemSize());
RequestNamedLWLockTranche("neon_libpagestore", 1);
}
static void
pagestore_prepare_shmem(void)
{
#if PG_VERSION_NUM >= 150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = pagestore_shmem_request;
#else
pagestore_shmem_request();
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = pagestore_shmem_startup_hook;
}
/*
* Module initialization function
*/
void
pg_init_libpagestore(void)
{
pagestore_prepare_shmem();
DefineCustomStringVariable("neon.pageserver_connstring",
"connection string to the page server",
NULL,
@@ -418,7 +534,7 @@ pg_init_libpagestore(void)
"",
PGC_SIGHUP,
0, /* no flags required */
NULL, NULL, NULL);
CheckPageserverConnstring, AssignPageserverConnstring, NULL);
DefineCustomStringVariable("neon.timeline_id",
"Neon timeline_id the server is running on",
@@ -499,7 +615,5 @@ pg_init_libpagestore(void)
redo_read_buffer_filter = neon_redo_read_buffer_filter;
}
prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
lfc_init();
}

View File

@@ -1,6 +1,8 @@
//! User credentials used in authentication.
use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
use crate::{
auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options,
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
use std::collections::HashSet;
@@ -38,6 +40,8 @@ pub struct ClientCredentials<'a> {
pub user: &'a str,
// TODO: this is a severe misnomer! We should think of a new name ASAP.
pub project: Option<String>,
pub cache_key: String,
}
impl ClientCredentials<'_> {
@@ -53,6 +57,7 @@ impl<'a> ClientCredentials<'a> {
ClientCredentials {
user: "",
project: None,
cache_key: "".to_string(),
}
}
@@ -120,7 +125,17 @@ impl<'a> ClientCredentials<'a> {
info!(user, project = project.as_deref(), "credentials");
Ok(Self { user, project })
let cache_key = format!(
"{}{}",
project.as_deref().unwrap_or(""),
neon_options(params).unwrap_or("".to_string())
);
Ok(Self {
user,
project,
cache_key,
})
}
}
@@ -176,6 +191,7 @@ mod tests {
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("foo"));
assert_eq!(creds.cache_key, "foo");
Ok(())
}
@@ -303,4 +319,23 @@ mod tests {
_ => panic!("bad error: {err:?}"),
}
}
#[test]
fn parse_neon_options() -> anyhow::Result<()> {
let options = StartupMessageParams::new([
("user", "john_doe"),
("options", "neon_lsn:0/2 neon_endpoint_type:read_write"),
]);
let sni = Some("project.localhost");
let common_names = Some(["localhost".into()].into());
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("project"));
assert_eq!(
creds.cache_key,
"projectneon_endpoint_type:read_write neon_lsn:0/2"
);
Ok(())
}
}

View File

@@ -80,6 +80,9 @@ struct ProxyCliArgs {
/// cache for `wake_compute` api method (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
wake_compute_cache: String,
/// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
wake_compute_lock: String,
/// Allow self-signed certificates for compute nodes (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
allow_self_signed_compute: bool,
@@ -220,10 +223,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
}));
let config::WakeComputeLockOptions {
shards,
permits,
epoch,
timeout,
} = args.wake_compute_lock.parse()?;
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(
console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
.unwrap(),
));
tokio::spawn(locks.garbage_collect_worker(epoch));
let url = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client());
let api = console::provider::neon::Api::new(endpoint, caches);
let api = console::provider::neon::Api::new(endpoint, caches, locks);
auth::BackendType::Console(Cow::Owned(api), ())
}
AuthBackend::Postgres => {

View File

@@ -3,6 +3,7 @@ use crate::{
cancellation::CancelClosure,
console::errors::WakeComputeError,
error::{io_error, UserFacingError},
proxy::is_neon_param,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
@@ -278,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
#[allow(unstable_name_collisions)]
let options: String = params
.options_raw()?
.filter(|opt| parse_endpoint_param(opt).is_none())
.filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();
@@ -313,5 +314,11 @@ mod tests {
let params = StartupMessageParams::new([("options", "project = foo")]);
assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
let params = StartupMessageParams::new([(
"options",
"project = foo neon_endpoint_type:read_write neon_lsn:0/2",
)]);
assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
}
}

View File

@@ -264,6 +264,79 @@ impl FromStr for CacheOptions {
}
}
/// Helper for cmdline cache options parsing.
pub struct WakeComputeLockOptions {
/// The number of shards the lock map should have
pub shards: usize,
/// The number of allowed concurrent requests for each endpoitn
pub permits: usize,
/// Garbage collection epoch
pub epoch: Duration,
/// Lock timeout
pub timeout: Duration,
}
impl WakeComputeLockOptions {
/// Default options for [`crate::console::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
// pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
/// Parse lock options passed via cmdline.
/// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`].
fn parse(options: &str) -> anyhow::Result<Self> {
let mut shards = None;
let mut permits = None;
let mut epoch = None;
let mut timeout = None;
for option in options.split(',') {
let (key, value) = option
.split_once('=')
.with_context(|| format!("bad key-value pair: {option}"))?;
match key {
"shards" => shards = Some(value.parse()?),
"permits" => permits = Some(value.parse()?),
"epoch" => epoch = Some(humantime::parse_duration(value)?),
"timeout" => timeout = Some(humantime::parse_duration(value)?),
unknown => bail!("unknown key: {unknown}"),
}
}
// these dont matter if lock is disabled
if let Some(0) = permits {
timeout = Some(Duration::default());
epoch = Some(Duration::default());
shards = Some(2);
}
let out = Self {
shards: shards.context("missing `shards`")?,
permits: permits.context("missing `permits`")?,
epoch: epoch.context("missing `epoch`")?,
timeout: timeout.context("missing `timeout`")?,
};
ensure!(out.shards > 1, "shard count must be > 1");
ensure!(
out.shards.is_power_of_two(),
"shard count must be a power of two"
);
Ok(out)
}
}
impl FromStr for WakeComputeLockOptions {
type Err = anyhow::Error;
fn from_str(options: &str) -> Result<Self, Self::Err> {
let error = || format!("failed to parse cache lock options '{options}'");
Self::parse(options).with_context(error)
}
}
#[cfg(test)]
mod tests {
use super::*;
@@ -288,4 +361,42 @@ mod tests {
Ok(())
}
#[test]
fn test_parse_lock_options() -> anyhow::Result<()> {
let WakeComputeLockOptions {
epoch,
permits,
shards,
timeout,
} = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
assert_eq!(epoch, Duration::from_secs(10 * 60));
assert_eq!(timeout, Duration::from_secs(1));
assert_eq!(shards, 32);
assert_eq!(permits, 4);
let WakeComputeLockOptions {
epoch,
permits,
shards,
timeout,
} = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
assert_eq!(epoch, Duration::from_secs(60));
assert_eq!(timeout, Duration::from_millis(100));
assert_eq!(shards, 16);
assert_eq!(permits, 8);
let WakeComputeLockOptions {
epoch,
permits,
shards,
timeout,
} = "permits=0".parse()?;
assert_eq!(epoch, Duration::ZERO);
assert_eq!(timeout, Duration::ZERO);
assert_eq!(shards, 2);
assert_eq!(permits, 0);
Ok(())
}
}

View File

@@ -13,5 +13,10 @@ pub mod caches {
pub use super::provider::{ApiCaches, NodeInfoCache};
}
/// Various cache-related types.
pub mod locks {
pub use super::provider::ApiLocks;
}
/// Console's management API.
pub mod mgmt;

View File

@@ -8,7 +8,13 @@ use crate::{
compute, scram,
};
use async_trait::async_trait;
use std::sync::Arc;
use dashmap::DashMap;
use std::{sync::Arc, time::Duration};
use tokio::{
sync::{OwnedSemaphorePermit, Semaphore},
time::Instant,
};
use tracing::info;
pub mod errors {
use crate::{
@@ -149,6 +155,9 @@ pub mod errors {
#[error(transparent)]
ApiError(ApiError),
#[error("Timeout waiting to acquire wake compute lock")]
TimeoutError,
}
// This allows more useful interactions than `#[from]`.
@@ -158,6 +167,17 @@ pub mod errors {
}
}
impl From<tokio::sync::AcquireError> for WakeComputeError {
fn from(_: tokio::sync::AcquireError) -> Self {
WakeComputeError::TimeoutError
}
}
impl From<tokio::time::error::Elapsed> for WakeComputeError {
fn from(_: tokio::time::error::Elapsed) -> Self {
WakeComputeError::TimeoutError
}
}
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
use WakeComputeError::*;
@@ -167,6 +187,8 @@ pub mod errors {
BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
ApiError(e) => e.to_string_client(),
TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
}
}
}
@@ -178,6 +200,7 @@ pub struct ConsoleReqExtra<'a> {
pub session_id: uuid::Uuid,
/// Name of client application, if set.
pub application_name: Option<&'a str>,
pub options: Option<&'a str>,
}
/// Auth secret which is managed by the cloud.
@@ -232,3 +255,145 @@ pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub node_info: NodeInfoCache,
}
/// Various caches for [`console`](super).
pub struct ApiLocks {
name: &'static str,
node_locks: DashMap<Arc<str>, Arc<Semaphore>>,
permits: usize,
timeout: Duration,
registered: prometheus::IntCounter,
unregistered: prometheus::IntCounter,
reclamation_lag: prometheus::Histogram,
lock_acquire_lag: prometheus::Histogram,
}
impl ApiLocks {
pub fn new(
name: &'static str,
permits: usize,
shards: usize,
timeout: Duration,
) -> prometheus::Result<Self> {
let registered = prometheus::IntCounter::with_opts(
prometheus::Opts::new(
"semaphores_registered",
"Number of semaphores registered in this api lock",
)
.namespace(name),
)?;
prometheus::register(Box::new(registered.clone()))?;
let unregistered = prometheus::IntCounter::with_opts(
prometheus::Opts::new(
"semaphores_unregistered",
"Number of semaphores unregistered in this api lock",
)
.namespace(name),
)?;
prometheus::register(Box::new(unregistered.clone()))?;
let reclamation_lag = prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"reclamation_lag_seconds",
"Time it takes to reclaim unused semaphores in the api lock",
)
.namespace(name)
// 1us -> 65ms
// benchmarks on my mac indicate it's usually in the range of 256us and 512us
.buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
)?;
prometheus::register(Box::new(reclamation_lag.clone()))?;
let lock_acquire_lag = prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"semaphore_acquire_seconds",
"Time it takes to reclaim unused semaphores in the api lock",
)
.namespace(name)
// 0.1ms -> 6s
.buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
)?;
prometheus::register(Box::new(lock_acquire_lag.clone()))?;
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
permits,
timeout,
lock_acquire_lag,
registered,
unregistered,
reclamation_lag,
})
}
pub async fn get_wake_compute_permit(
&self,
key: &Arc<str>,
) -> Result<WakeComputePermit, errors::WakeComputeError> {
if self.permits == 0 {
return Ok(WakeComputePermit { permit: None });
}
let now = Instant::now();
let semaphore = {
// get fast path
if let Some(semaphore) = self.node_locks.get(key) {
semaphore.clone()
} else {
self.node_locks
.entry(key.clone())
.or_insert_with(|| {
self.registered.inc();
Arc::new(Semaphore::new(self.permits))
})
.clone()
}
};
let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
self.lock_acquire_lag
.observe((Instant::now() - now).as_secs_f64());
Ok(WakeComputePermit {
permit: Some(permit??),
})
}
pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) {
if self.permits == 0 {
return;
}
let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32);
loop {
for (i, shard) in self.node_locks.shards().iter().enumerate() {
interval.tick().await;
// temporary lock a single shard and then clear any semaphores that aren't currently checked out
// race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
// therefore releasing it is safe from race conditions
info!(
name = self.name,
shard = i,
"performing epoch reclamation on api lock"
);
let mut lock = shard.write();
let timer = self.reclamation_lag.start_timer();
let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count();
drop(lock);
self.unregistered.inc_by(count as u64);
timer.observe_duration()
}
}
}
}
pub struct WakeComputePermit {
// None if the lock is disabled
permit: Option<OwnedSemaphorePermit>,
}
impl WakeComputePermit {
pub fn should_check_cache(&self) -> bool {
self.permit.is_some()
}
}

View File

@@ -3,12 +3,12 @@
use super::{
super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
errors::{ApiError, GetAuthInfoError, WakeComputeError},
ApiCaches, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::{auth::ClientCredentials, compute, http, scram};
use async_trait::async_trait;
use futures::TryFutureExt;
use std::net::SocketAddr;
use std::{net::SocketAddr, sync::Arc};
use tokio::time::Instant;
use tokio_postgres::config::SslMode;
use tracing::{error, info, info_span, warn, Instrument};
@@ -17,12 +17,17 @@ use tracing::{error, info, info_span, warn, Instrument};
pub struct Api {
endpoint: http::Endpoint,
caches: &'static ApiCaches,
locks: &'static ApiLocks,
jwt: String,
}
impl Api {
/// Construct an API object containing the auth parameters.
pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self {
pub fn new(
endpoint: http::Endpoint,
caches: &'static ApiCaches,
locks: &'static ApiLocks,
) -> Self {
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
@@ -30,6 +35,7 @@ impl Api {
Self {
endpoint,
caches,
locks,
jwt,
}
}
@@ -99,6 +105,7 @@ impl Api {
.query(&[
("application_name", extra.application_name),
("project", Some(project)),
("options", extra.options),
])
.build()?;
@@ -151,7 +158,7 @@ impl super::Api for Api {
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<CachedNodeInfo, WakeComputeError> {
let key = creds.project().expect("impossible");
let key: &str = &creds.cache_key;
// Every time we do a wakeup http request, the compute node will stay up
// for some time (highly depends on the console's scale-to-zero policy);
@@ -162,9 +169,22 @@ impl super::Api for Api {
return Ok(cached);
}
let key: Arc<str> = key.into();
let permit = self.locks.get_wake_compute_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
// double check
if permit.should_check_cache() {
if let Some(cached) = self.caches.node_info.get(&key) {
info!(key = &*key, "found cached compute node info");
return Ok(cached);
}
}
let node = self.do_wake_compute(extra, creds).await?;
let (_, cached) = self.caches.node_info.insert(key.into(), node);
info!(key = key, "created a cache entry for compute node info");
let (_, cached) = self.caches.node_info.insert(key.clone(), node);
info!(key = &*key, "created a cache entry for compute node info");
Ok(cached)
}

View File

@@ -1,3 +1,5 @@
#![deny(clippy::undocumented_unsafe_blocks)]
use std::convert::Infallible;
use anyhow::{bail, Context};

View File

@@ -15,10 +15,12 @@ use crate::{
use anyhow::{bail, Context};
use async_trait::async_trait;
use futures::TryFutureExt;
use itertools::Itertools;
use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
use once_cell::sync::Lazy;
use once_cell::sync::{Lazy, OnceCell};
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use prometheus::{register_histogram_vec, HistogramVec};
use regex::Regex;
use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
use tokio::{
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
@@ -568,6 +570,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
"api_console_other_server_error"
}
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
WakeComputeError::TimeoutError => "timeout_error",
};
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
}
@@ -881,9 +884,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
allow_self_signed_compute,
} = self;
let console_options = neon_options(params);
let extra = console::ConsoleReqExtra {
session_id, // aka this connection's id
application_name: params.get("application_name"),
options: console_options.as_deref(),
};
let mut latency_timer = LatencyTimer::new(mode.protocol_label());
@@ -945,3 +951,27 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
proxy_pass(stream, node.stream, &aux).await
}
}
pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
#[allow(unstable_name_collisions)]
let options: String = params
.options_raw()?
.filter(|opt| is_neon_param(opt))
.sorted() // we sort it to use as cache key
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();
// Don't even bother with empty options.
if options.is_empty() {
return None;
}
Some(options)
}
pub fn is_neon_param(bytes: &str) -> bool {
static RE: OnceCell<Regex> = OnceCell::new();
RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
RE.get().unwrap().is_match(bytes)
}

View File

@@ -440,6 +440,7 @@ fn helper_create_connect_info(
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some("TEST"),
options: None,
};
let creds = auth::BackendType::Test(mechanism);
(cache, extra, creds)

View File

@@ -22,7 +22,10 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth, console,
proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
proxy::{
neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
NUM_DB_CONNECTIONS_OPENED_COUNTER,
},
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
};
use crate::{compute, config};
@@ -41,6 +44,7 @@ pub struct ConnInfo {
pub dbname: String,
pub hostname: String,
pub password: String,
pub options: Option<String>,
}
impl ConnInfo {
@@ -401,26 +405,25 @@ async fn connect_to_compute(
let tls = config.tls_config.as_ref();
let common_names = tls.and_then(|tls| tls.common_names.clone());
let credential_params = StartupMessageParams::new([
let params = StartupMessageParams::new([
("user", &conn_info.username),
("database", &conn_info.dbname),
("application_name", APP_NAME),
("options", conn_info.options.as_deref().unwrap_or("")),
]);
let creds = config
.auth_backend
.as_ref()
.map(|_| {
auth::ClientCredentials::parse(
&credential_params,
Some(&conn_info.hostname),
common_names,
)
})
.map(|_| auth::ClientCredentials::parse(&params, Some(&conn_info.hostname), common_names))
.transpose()?;
let console_options = neon_options(&params);
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some(APP_NAME),
options: console_options.as_deref(),
};
let node_info = creds

View File

@@ -174,11 +174,23 @@ fn get_conn_info(
}
}
let pairs = connection_url.query_pairs();
let mut options = Option::None;
for (key, value) in pairs {
if key == "options" {
options = Some(value.to_string());
break;
}
}
Ok(ConnInfo {
username: username.to_owned(),
dbname: dbname.to_owned(),
hostname: hostname.to_owned(),
password: password.to_owned(),
options,
})
}

View File

@@ -1,3 +1,5 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod checks;
pub mod cloud_admin_api;
pub mod garbage;

View File

@@ -1,19 +1,20 @@
use anyhow::{bail, Result};
use utils::auth::{Claims, Scope};
use utils::auth::{AuthError, Claims, Scope};
use utils::id::TenantId;
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
match (&claims.scope, tenant_id) {
(Scope::Tenant, None) => {
bail!("Attempt to access management api with tenant scope. Permission denied")
}
(Scope::Tenant, None) => Err(AuthError(
"Attempt to access management api with tenant scope. Permission denied".into(),
)),
(Scope::Tenant, Some(tenant_id)) => {
if claims.tenant_id.unwrap() != tenant_id {
bail!("Tenant id mismatch. Permission denied")
return Err(AuthError("Tenant id mismatch. Permission denied".into()));
}
Ok(())
}
(Scope::PageServerApi, _) => bail!("PageServerApi scope makes no sense for Safekeeper"),
(Scope::PageServerApi, _) => Err(AuthError(
"PageServerApi scope makes no sense for Safekeeper".into(),
)),
(Scope::SafekeeperData, _) => Ok(()),
}
}

View File

@@ -38,7 +38,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::{JwtAuth, Scope};
use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
use utils::{
id::NodeId,
logging::{self, LogFormat},
@@ -251,10 +251,9 @@ async fn main() -> anyhow::Result<()> {
None
}
Some(path) => {
info!("loading http auth JWT key from {path}");
Some(Arc::new(
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
))
info!("loading http auth JWT key(s) from {path}");
let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?;
Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
}
};

View File

@@ -6,7 +6,7 @@ use std::str::FromStr;
use std::str::{self};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, info_span, Instrument};
use tracing::{debug, info, info_span, Instrument};
use crate::auth::check_permission;
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
@@ -165,26 +165,27 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
.auth
.as_ref()
.expect("auth_type is configured but .auth of handler is missing");
let data =
auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
let data = auth
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
.map_err(|e| QueryError::Unauthorized(e.0))?;
// The handler might be configured to allow only tenant scope tokens.
if matches!(allowed_auth_scope, Scope::Tenant)
&& !matches!(data.claims.scope, Scope::Tenant)
{
return Err(QueryError::Other(anyhow::anyhow!(
"passed JWT token is for full access, but only tenant scope is allowed"
)));
return Err(QueryError::Unauthorized(
"passed JWT token is for full access, but only tenant scope is allowed".into(),
));
}
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing"
)));
return Err(QueryError::Unauthorized(
"jwt token scope is Tenant, but tenant id is missing".into(),
));
}
info!(
"jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
debug!(
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
data.claims.scope, data.claims.tenant_id,
);
@@ -263,7 +264,7 @@ impl SafekeeperPostgresHandler {
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
if self.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok
return Ok(());
@@ -275,7 +276,7 @@ impl SafekeeperPostgresHandler {
.claims
.as_ref()
.expect("claims presence already checked");
check_permission(claims, tenant_id)
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
}
async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(

View File

@@ -30,7 +30,7 @@ use crate::timelines_global_map::TimelineDeleteForceResult;
use crate::GlobalTimelines;
use crate::SafeKeeperConf;
use utils::{
auth::JwtAuth,
auth::SwappableJwtAuth,
http::{
endpoint::{self, auth_middleware, check_permission_with},
error::ApiError,
@@ -428,8 +428,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
if ALLOWLIST_ROUTES.contains(request.uri()) {
None
} else {
// Option<Arc<JwtAuth>> is always provided as data below, hence unwrap().
request.data::<Option<Arc<JwtAuth>>>().unwrap().as_deref()
// Option<Arc<SwappableJwtAuth>> is always provided as data below, hence unwrap().
request
.data::<Option<Arc<SwappableJwtAuth>>>()
.unwrap()
.as_deref()
}
}))
}

View File

@@ -1,3 +1,4 @@
#![deny(clippy::undocumented_unsafe_blocks)]
use camino::Utf8PathBuf;
use once_cell::sync::Lazy;
use remote_storage::RemoteStorageConfig;
@@ -6,7 +7,10 @@ use tokio::runtime::Runtime;
use std::time::Duration;
use storage_broker::Uri;
use utils::id::{NodeId, TenantId, TenantTimelineId};
use utils::{
auth::SwappableJwtAuth,
id::{NodeId, TenantId, TenantTimelineId},
};
mod auth;
pub mod broker;
@@ -69,7 +73,7 @@ pub struct SafeKeeperConf {
pub wal_backup_enabled: bool,
pub pg_auth: Option<Arc<JwtAuth>>,
pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
pub http_auth: Option<Arc<JwtAuth>>,
pub http_auth: Option<Arc<SwappableJwtAuth>>,
pub current_thread_runtime: bool,
}

View File

@@ -111,7 +111,7 @@ impl WalReceivers {
.count()
}
/// Unregister walsender.
/// Unregister walreceiver.
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
let mut shared = self.mutex.lock();
shared.slots[id] = None;
@@ -138,8 +138,8 @@ pub enum WalReceiverStatus {
Streaming,
}
/// Scope guard to access slot in WalSenders registry and unregister from it in
/// Drop.
/// Scope guard to access slot in WalReceivers registry and unregister from
/// it in Drop.
pub struct WalReceiverGuard {
id: WalReceiverId,
walreceivers: Arc<WalReceivers>,

View File

@@ -361,7 +361,6 @@ class PgProtocol:
@dataclass
class AuthKeys:
pub: str
priv: str
def generate_token(self, *, scope: str, **token_data: str) -> str:
@@ -877,9 +876,31 @@ class NeonEnv:
@cached_property
def auth_keys(self) -> AuthKeys:
pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text()
priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
return AuthKeys(pub=pub, priv=priv)
return AuthKeys(priv=priv)
def regenerate_keys_at(self, privkey_path: Path, pubkey_path: Path):
# compare generate_auth_keys() in local_env.rs
subprocess.run(
["openssl", "genpkey", "-algorithm", "ed25519", "-out", privkey_path],
cwd=self.repo_dir,
check=True,
)
subprocess.run(
[
"openssl",
"pkey",
"-in",
privkey_path,
"-pubout",
"-out",
pubkey_path,
],
cwd=self.repo_dir,
check=True,
)
del self.auth_keys
def generate_endpoint_id(self) -> str:
"""

View File

@@ -189,6 +189,10 @@ class PageserverHttpClient(requests.Session):
assert res_json is None
return res_json
def reload_auth_validation_keys(self):
res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
self.verbose_error(res)
def tenant_list(self) -> List[Dict[Any, Any]]:
res = self.get(f"http://localhost:{self.port}/v1/tenant")
self.verbose_error(res)

View File

@@ -1,12 +1,35 @@
import os
from contextlib import closing
from pathlib import Path
import psycopg2
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
from fixtures.pageserver.http import PageserverApiException
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgProtocol,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.types import TenantId, TimelineId
def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
http_client.timeline_create(
pg_version=env.pg_version,
tenant_id=env.initial_tenant,
new_timeline_id=TimelineId.generate(),
ancestor_timeline_id=env.initial_timeline,
)
def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient):
with pytest.raises(
PageserverApiException,
match="Forbidden: JWT authentication error",
):
assert_client_authorized(env, http_client)
def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
neon_env_builder.auth_enabled = True
env = neon_env_builder.init_start()
@@ -27,30 +50,14 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
ps.safe_psql("set FOO", password=pageserver_token)
# tenant can create branches
tenant_http_client.timeline_create(
pg_version=env.pg_version,
tenant_id=env.initial_tenant,
new_timeline_id=TimelineId.generate(),
ancestor_timeline_id=env.initial_timeline,
)
assert_client_authorized(env, tenant_http_client)
# console can create branches for tenant
pageserver_http_client.timeline_create(
pg_version=env.pg_version,
tenant_id=env.initial_tenant,
new_timeline_id=TimelineId.generate(),
ancestor_timeline_id=env.initial_timeline,
)
assert_client_authorized(env, pageserver_http_client)
# fail to create branch using token with different tenant_id
with pytest.raises(
PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied"
):
invalid_tenant_http_client.timeline_create(
pg_version=env.pg_version,
tenant_id=env.initial_tenant,
new_timeline_id=TimelineId.generate(),
ancestor_timeline_id=env.initial_timeline,
)
with pytest.raises(PageserverApiException, match="Forbidden: JWT authentication error"):
assert_client_authorized(env, invalid_tenant_http_client)
# create tenant using management token
pageserver_http_client.tenant_create(TenantId.generate())
@@ -58,7 +65,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
# fail to create tenant using tenant token
with pytest.raises(
PageserverApiException,
match="Forbidden: Attempt to access management api with tenant scope. Permission denied",
match="Forbidden: JWT authentication error",
):
tenant_http_client.tenant_create(TenantId.generate())
@@ -82,6 +89,96 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
assert cur.fetchone() == (5000050000,)
def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
neon_env_builder.auth_enabled = True
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
pageserver_token_old = env.auth_keys.generate_pageserver_token()
pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
pageserver_http_client_old.reload_auth_validation_keys()
# This test is to ensure that the pageserver supports multiple keys.
# The neon_local tool generates one key pair at a hardcoded path by default.
# As a preparation for our test, move the public key of the key pair into a
# directory at the same location as the hardcoded path by:
# 1. moving the the file at `configured_pub_key_path` to a temporary location
# 2. creating a new directory at `configured_pub_key_path`
# 3. moving the file from the temporary location into the newly created directory
configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem"
os.rename(configured_pub_key_path, Path(env.repo_dir) / "auth_public_key.pem.file")
os.mkdir(configured_pub_key_path)
os.rename(
Path(env.repo_dir) / "auth_public_key.pem.file",
configured_pub_key_path / "auth_public_key_old.pem",
)
# Add a new key pair
# This invalidates env.auth_keys and makes them be regenerated
env.regenerate_keys_at(
Path("auth_private_key.pem"), Path("auth_public_key.pem/auth_public_key_new.pem")
)
# Reload the keys on the pageserver side
pageserver_http_client_old.reload_auth_validation_keys()
# We can continue doing things using the old token
assert_client_authorized(env, pageserver_http_client_old)
pageserver_token_new = env.auth_keys.generate_pageserver_token()
pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
# The new token also works
assert_client_authorized(env, pageserver_http_client_new)
# Remove the old token and reload
os.remove(Path(env.repo_dir) / "auth_public_key.pem" / "auth_public_key_old.pem")
pageserver_http_client_old.reload_auth_validation_keys()
# Reloading fails now with the old token, but the new token still works
assert_client_not_authorized(env, pageserver_http_client_old)
assert_client_authorized(env, pageserver_http_client_new)
def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
neon_env_builder.auth_enabled = True
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
pageserver_token_old = env.auth_keys.generate_pageserver_token()
pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
pageserver_http_client_old.reload_auth_validation_keys()
# Regenerate the keys
env.regenerate_keys_at(Path("auth_private_key.pem"), Path("auth_public_key.pem"))
# Reload the keys on the pageserver side
pageserver_http_client_old.reload_auth_validation_keys()
# Next attempt fails as we use the old auth token
with pytest.raises(
PageserverApiException,
match="Forbidden: JWT authentication error",
):
pageserver_http_client_old.reload_auth_validation_keys()
# same goes for attempts trying to create a timeline
assert_client_not_authorized(env, pageserver_http_client_old)
pageserver_token_new = env.auth_keys.generate_pageserver_token()
pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
# timeline creation works with the new token
assert_client_authorized(env, pageserver_http_client_new)
# reloading also works with the new token
pageserver_http_client_new.reload_auth_validation_keys()
@pytest.mark.parametrize("auth_enabled", [False, True])
def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
neon_env_builder.auth_enabled = auth_enabled

View File

@@ -0,0 +1,60 @@
import random
import time
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*simulated connection error.*")
pageserver_http = env.pageserver.http_client()
env.neon_cli.create_branch("test_compute_pageserver_connection_stress")
endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress")
# Enable failpoint after starting everything else up so that loading initial
# basebackup doesn't fail
pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
# of this test.
cur.execute("CREATE TABLE foo (t text)")
cur.execute(
"""
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
# Verify that the table is larger than shared_buffers
cur.execute(
"""
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
from pg_settings where name = 'shared_buffers'
"""
)
row = cur.fetchone()
assert row is not None
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
assert int(row[0]) < int(row[1])
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)
end_time = time.time() + 30
times_executed = 0
while time.time() < end_time:
if random.random() < 0.5:
cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')")
else:
cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10")
cur.fetchall()
times_executed += 1
log.info(f"Workload executed {times_executed} times")

View File

@@ -26,6 +26,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
".*will not become active. Current state: Broken.*",
".*failed to load metadata.*",
".*load failed.*load local timeline.*",
".*layer loading failed permanently: load layer: .*",
]
)

View File

@@ -1,9 +1,13 @@
import asyncio
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.remote_storage import RemoteStorageKind
def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
num_connections = 3
neon_env_builder.num_pageservers = 2
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
@@ -16,15 +20,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
alt_pageserver_id = env.pageservers[1].id
env.pageservers[1].tenant_attach(env.initial_tenant)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
pg_conns = [endpoint.connect() for i in range(num_connections)]
curs = [pg_conn.cursor() for pg_conn in pg_conns]
def execute(statement: str):
for cur in curs:
cur.execute(statement)
def fetchone():
results = [cur.fetchone() for cur in curs]
assert all(result == results[0] for result in results)
return results[0]
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
# of this test.
cur.execute("CREATE TABLE foo (t text)")
cur.execute(
curs[0].execute("CREATE TABLE foo (t text)")
curs[0].execute(
"""
INSERT INTO foo
SELECT 'long string to consume some space' || g
@@ -33,25 +46,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
)
# Verify that the table is larger than shared_buffers
cur.execute(
curs[0].execute(
"""
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
from pg_settings where name = 'shared_buffers'
"""
)
row = cur.fetchone()
row = curs[0].fetchone()
assert row is not None
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
assert int(row[0]) < int(row[1])
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)
execute("SELECT count(*) FROM foo")
assert fetchone() == (100000,)
endpoint.reconfigure(pageserver_id=alt_pageserver_id)
# Verify that the neon.pageserver_connstring GUC is set to the correct thing
cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
connstring = cur.fetchone()
execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
connstring = fetchone()
assert connstring is not None
expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
assert expected_connstring == expected_connstring
@@ -60,5 +73,45 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
0
].stop() # Stop the old pageserver just to make sure we're reading from the new one
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)
execute("SELECT count(*) FROM foo")
assert fetchone() == (100000,)
# Try failing back, and this time we will stop the current pageserver before reconfiguring
# the endpoint. Whereas the previous reconfiguration was like a healthy migration, this
# is more like what happens in an unexpected pageserver failure.
env.pageservers[0].start()
env.pageservers[1].stop()
endpoint.reconfigure(pageserver_id=env.pageservers[0].id)
execute("SELECT count(*) FROM foo")
assert fetchone() == (100000,)
env.pageservers[0].stop()
env.pageservers[1].start()
# Test a (former) bug where a child process spins without updating its connection string
# by executing a query separately. This query will hang until we issue the reconfigure.
async def reconfigure_async():
await asyncio.sleep(
1
) # Sleep for 1 second just to make sure we actually started our count(*) query
endpoint.reconfigure(pageserver_id=env.pageservers[1].id)
def execute_count():
execute("SELECT count(*) FROM FOO")
async def execute_and_reconfigure():
task_exec = asyncio.to_thread(execute_count)
task_reconfig = asyncio.create_task(reconfigure_async())
await asyncio.gather(
task_exec,
task_reconfig,
)
asyncio.run(execute_and_reconfigure())
assert fetchone() == (100000,)
# One final check that nothing hangs
execute("SELECT count(*) FROM foo")
assert fetchone() == (100000,)

View File

@@ -79,11 +79,27 @@ def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder):
def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
endpoint_main = env.endpoints.create_start("test_lsn_mapping")
log.info("postgres is running on 'test_lsn_mapping' branch")
tenant_id, _ = env.neon_cli.create_tenant(
conf={
# disable default GC and compaction
"gc_period": "1000 m",
"compaction_period": "0 s",
"gc_horizon": f"{1024 ** 2}",
"checkpoint_distance": f"{1024 ** 2}",
"compaction_target_size": f"{1024 ** 2}",
}
)
timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id)
endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id)
timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0]
log.info("postgres is running on 'main' branch")
cur = endpoint_main.connect().cursor()
# Obtain an lsn before all write operations on this branch
start_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_lsn()"))
# Create table, and insert rows, each in a separate transaction
# Disable `synchronous_commit` to make this initialization go faster.
# XXX: on my laptop this test takes 7s, and setting `synchronous_commit=off`
@@ -106,29 +122,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
cur.execute("INSERT INTO foo VALUES (-1)")
# Wait until WAL is received by pageserver
wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
last_flush_lsn = wait_for_last_flush_lsn(env, endpoint_main, tenant_id, timeline_id)
with env.pageserver.http_client() as client:
# Check edge cases
# Timestamp is in the future
probe_timestamp = tbl[-1][1] + timedelta(hours=1)
result = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
)
assert result["kind"] in ["present", "future"]
# make sure that we return a well advanced lsn here
assert Lsn(result["lsn"]) > start_lsn
# Timestamp is in the unreachable past
probe_timestamp = tbl[0][1] - timedelta(hours=10)
result = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
)
assert result["kind"] == "past"
# make sure that we return the minimum lsn here at the start of the range
assert Lsn(result["lsn"]) < start_lsn
# Probe a bunch of timestamps in the valid range
for i in range(1, len(tbl), 100):
probe_timestamp = tbl[i][1]
result = client.timeline_get_lsn_by_timestamp(
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
)
assert result["kind"] not in ["past", "nodata"]
lsn = result["lsn"]
@@ -136,12 +156,29 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
# Launch a new read-only node at that LSN, and check that only the rows
# that were supposed to be committed at that point in time are visible.
endpoint_here = env.endpoints.create_start(
branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
branch_name="test_lsn_mapping",
endpoint_id="ep-lsn_mapping_read",
lsn=lsn,
tenant_id=tenant_id,
)
assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
endpoint_here.stop_and_destroy()
# Do the "past" check again at a new branch to ensure that we don't return something before the branch cutoff
timeline_id_child = env.neon_cli.create_branch(
"test_lsn_mapping_child", tenant_id=tenant_id, ancestor_branch_name="test_lsn_mapping"
)
# Timestamp is in the unreachable past
probe_timestamp = tbl[0][1] - timedelta(hours=10)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2
)
assert result["kind"] == "past"
# make sure that we return the minimum lsn here at the start of the range
assert Lsn(result["lsn"]) >= last_flush_lsn
# Test pageserver get_timestamp_of_lsn API
def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):

View File

@@ -366,11 +366,17 @@ def test_deletion_queue_recovery(
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
if validate_before == ValidateBefore.VALIDATE:
# At this point, one or more DeletionLists have been written. We have set a failpoint
# to prevent them successfully executing, but we want to see them get validated.
#
# We await _some_ validations instead of _all_ validations, because our execution failpoint
# will prevent validation proceeding for any but the first DeletionList. Usually the workload
# just generates one, but if it generates two due to timing, then we must not expect that the
# second one will be validated.
def assert_some_validations():
assert get_deletion_queue_validated(ps_http) > 0
def assert_validation_complete():
assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
wait_until(20, 1, assert_validation_complete)
wait_until(20, 1, assert_some_validations)
# The validatated keys statistic advances before the header is written, so we
# also wait to see the header hit the disk: this seems paranoid but the race
@@ -380,6 +386,11 @@ def test_deletion_queue_recovery(
wait_until(20, 1, assert_header_written)
# If we will lose attachment, then our expectation on restart is that only the ones
# we already validated will execute. Act like only those were present in the queue.
if keep_attachment == KeepAttachment.LOSE:
before_restart_depth = get_deletion_queue_validated(ps_http)
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
env.pageserver.stop(immediate=True)
@@ -402,11 +413,13 @@ def test_deletion_queue_recovery(
ps_http.deletion_queue_flush(execute=True)
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE:
if keep_attachment == KeepAttachment.KEEP:
# - If we kept the attachment, then our pre-restart deletions should execute
# because on re-attach they were from the immediately preceding generation
# - If we validated before restart, then the deletions should execute because the
# deletion queue header records a validated deletion list sequence number.
assert get_deletion_queue_executed(ps_http) == before_restart_depth
elif validate_before == ValidateBefore.VALIDATE:
# - If we validated before restart, then we should execute however many keys were
# validated before restart.
assert get_deletion_queue_executed(ps_http) == before_restart_depth
else:
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])

View File

@@ -17,10 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
n_restarts = 10
scale = 10
# Pageserver currently logs requests on non-active tenants at error level
# https://github.com/neondatabase/neon/issues/5784
env.pageserver.allowed_errors.append(".* will not become active. Current state: Stopping.*")
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])

View File

@@ -25,6 +25,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
clap = { version = "4", features = ["derive", "string"] }
clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
crossbeam-utils = { version = "0.8" }
dashmap = { version = "5", default-features = false, features = ["raw-api"] }
either = { version = "1" }
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
futures = { version = "0.3" }
@@ -38,7 +39,7 @@ hex = { version = "0.4", features = ["serde"] }
hyper = { version = "0.14", features = ["full"] }
itertools = { version = "0.10" }
libc = { version = "0.2", features = ["extra_traits"] }
log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] }
log = { version = "0.4", default-features = false, features = ["std"] }
memchr = { version = "2" }
nom = { version = "7" }
num-bigint = { version = "0.4" }
@@ -55,7 +56,6 @@ scopeguard = { version = "1" }
serde = { version = "1", features = ["alloc", "derive"] }
serde_json = { version = "1", features = ["raw_value"] }
smallvec = { version = "1", default-features = false, features = ["write"] }
standback = { version = "0.2", default-features = false, features = ["std"] }
time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
tokio-rustls = { version = "0.24" }
@@ -76,14 +76,13 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
either = { version = "1" }
itertools = { version = "0.10" }
libc = { version = "0.2", features = ["extra_traits"] }
log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] }
log = { version = "0.4", default-features = false, features = ["std"] }
memchr = { version = "2" }
nom = { version = "7" }
prost = { version = "0.11" }
regex = { version = "1" }
regex-syntax = { version = "0.7" }
serde = { version = "1", features = ["alloc", "derive"] }
standback = { version = "0.2", default-features = false, features = ["std"] }
syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }
time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }