Compare commits

..

1 Commits

Author SHA1 Message Date
Conrad Ludgate
ca461ce2fd proxy: force no retry some errors 2023-09-29 17:33:13 +01:00
28 changed files with 247 additions and 562 deletions

View File

@@ -5,7 +5,7 @@
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
/pageserver/ @neondatabase/storage
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers

23
Cargo.lock generated
View File

@@ -1780,9 +1780,18 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.3.3"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "hex"
@@ -2044,7 +2053,7 @@ version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.1",
"libc",
"windows-sys 0.48.0",
]
@@ -2061,7 +2070,7 @@ version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.1",
"io-lifetimes",
"rustix 0.37.19",
"windows-sys 0.48.0",
@@ -2435,11 +2444,11 @@ dependencies = [
[[package]]
name = "num_cpus"
version = "1.16.0"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
dependencies = [
"hermit-abi",
"hermit-abi 0.2.6",
"libc",
]

View File

@@ -615,7 +615,11 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
#########################################################################################
#
# Layer "rust extensions"
# This layer is used to build `pgrx` deps
# This layer is used to build `pgx` deps
#
# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
# dependency on all the rust extension that depend on it, too.
#
#########################################################################################
FROM build-deps AS rust-extensions-build
@@ -631,12 +635,22 @@ USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
;; \
esac && \
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
cargo install --locked --version 0.10.2 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
cargo install --locked --version 0.7.3 cargo-pgx && \
/bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
@@ -650,11 +664,23 @@ USER root
FROM rust-extensions-build AS pg-jsonschema-pg-build
ARG PG_VERSION
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
#########################################################################################
@@ -667,11 +693,26 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.
FROM rust-extensions-build AS pg-graphql-pg-build
ARG PG_VERSION
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in
# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
# same 1.1 version we've used before.
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
cargo pgx install --release && \
# it's needed to enable extension because it uses untrusted C language
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -686,11 +727,21 @@ RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz
FROM rust-extensions-build AS pg-tiktoken-pg-build
ARG PG_VERSION
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
cargo pgrx install --release && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
@@ -703,15 +754,21 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
echo "********************************************************************************************************" && \
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################

View File

@@ -1,5 +1,5 @@
use std::sync::Arc;
use std::{thread, time::Duration};
use std::{thread, time};
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
@@ -7,7 +7,7 @@ use tracing::{debug, info};
use crate::compute::ComputeNode;
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
// Spin in a loop and figure out the last activity time in the Postgres.
// Then update it in the shared state. This function never errors out.
@@ -17,12 +17,13 @@ fn watch_compute_activity(compute: &ComputeNode) {
let connstr = compute.connstr.as_str();
// Define `client` outside of the loop to reuse existing connection if it's active.
let mut client = Client::connect(connstr, NoTls);
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
info!("watching Postgres activity at {}", connstr);
loop {
// Should be outside of the write lock to allow others to read while we sleep.
thread::sleep(MONITOR_CHECK_INTERVAL);
thread::sleep(timeout);
match &mut client {
Ok(cli) => {

View File

@@ -23,7 +23,7 @@ vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []
ignore = ["RUSTSEC-2023-0052"]
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:

View File

@@ -47,47 +47,10 @@ pub struct S3Bucket {
bucket_name: String,
prefix_in_bucket: Option<String>,
max_keys_per_list_response: Option<i32>,
concurrency_limiter: ConcurrencyLimiter,
}
struct ConcurrencyLimiter {
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
// The helps to ensure we don't exceed the thresholds.
write: Arc<Semaphore>,
read: Arc<Semaphore>,
}
impl ConcurrencyLimiter {
fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
match kind {
RequestKind::Get => &self.read,
RequestKind::Put => &self.write,
RequestKind::List => &self.read,
RequestKind::Delete => &self.write,
}
}
async fn acquire(
&self,
kind: RequestKind,
) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
self.for_kind(kind).acquire().await
}
async fn acquire_owned(
&self,
kind: RequestKind,
) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
Arc::clone(self.for_kind(kind)).acquire_owned().await
}
fn new(limit: usize) -> ConcurrencyLimiter {
Self {
read: Arc::new(Semaphore::new(limit)),
write: Arc::new(Semaphore::new(limit)),
}
}
concurrency_limiter: Arc<Semaphore>,
}
#[derive(Default)]
@@ -154,7 +117,7 @@ impl S3Bucket {
bucket_name: aws_config.bucket_name.clone(),
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
})
}
@@ -193,7 +156,7 @@ impl S3Bucket {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
.acquire(kind)
.acquire()
.await
.expect("semaphore is never closed");
@@ -209,7 +172,8 @@ impl S3Bucket {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
.acquire_owned(kind)
.clone()
.acquire_owned()
.await
.expect("semaphore is never closed");

View File

@@ -24,9 +24,6 @@ pub enum ApiError {
#[error("Precondition failed: {0}")]
PreconditionFailed(Box<str>),
#[error("Resource temporarily unavailable: {0}")]
ResourceUnavailable(String),
#[error("Shutting down")]
ShuttingDown,
@@ -62,10 +59,6 @@ impl ApiError {
"Shutting down".to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -132,7 +132,7 @@ impl From<PageReconstructError> for ApiError {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::AncestorStopping(_) => {
ApiError::ResourceUnavailable(format!("{pre}"))
ApiError::InternalServerError(anyhow::Error::new(pre))
}
PageReconstructError::WalRedo(pre) => {
ApiError::InternalServerError(anyhow::Error::new(pre))
@@ -145,7 +145,7 @@ impl From<TenantMapInsertError> for ApiError {
fn from(tmie: TenantMapInsertError) -> ApiError {
match tmie {
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
ApiError::ResourceUnavailable(format!("{tmie}"))
ApiError::InternalServerError(anyhow::Error::new(tmie))
}
TenantMapInsertError::TenantAlreadyExists(id, state) => {
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
@@ -159,12 +159,6 @@ impl From<TenantStateError> for ApiError {
fn from(tse: TenantStateError) -> ApiError {
match tse {
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
TenantStateError::NotActive(_) => {
ApiError::ResourceUnavailable("Tenant not yet active".into())
}
TenantStateError::IsStopping(_) => {
ApiError::ResourceUnavailable("Tenant is stopping".into())
}
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
}
}
@@ -174,17 +168,14 @@ impl From<GetTenantError> for ApiError {
fn from(tse: GetTenantError) -> ApiError {
match tse {
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
GetTenantError::Broken(reason) => {
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
}
GetTenantError::NotActive(_) => {
e @ GetTenantError::NotActive(_) => {
// Why is this not `ApiError::NotFound`?
// Because we must be careful to never return 404 for a tenant if it does
// in fact exist locally. If we did, the caller could draw the conclusion
// that it can attach the tenant to another PS and we'd be in split-brain.
//
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::ResourceUnavailable("Tenant not yet active".into())
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
}
@@ -631,9 +622,8 @@ async fn tenant_list_handler(
let response_data = mgr::list_tenants()
.instrument(info_span!("tenant_list"))
.await
.map_err(|_| {
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string())
})?
.map_err(anyhow::Error::new)
.map_err(ApiError::InternalServerError)?
.iter()
.map(|(id, state)| TenantInfo {
id: *id,

View File

@@ -264,46 +264,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",
"Time spent acquiring a pinned slot in the page cache",
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
)
.expect("failed to define a metric")
});
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"page_cache_errors_total",
"Number of timeouts while acquiring a pinned slot in the page cache",
&["error_kind"]
)
.expect("failed to define a metric")
});
#[derive(IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum PageCacheErrorKind {
AcquirePinnedSlotTimeout,
EvictIterLimit,
}
pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
PAGE_CACHE_ERRORS
.get_metric_with_label_values(&[error_kind.into()])
.unwrap()
.inc();
}
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wait_lsn_seconds",

View File

@@ -75,11 +75,7 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
Arc, Weak,
},
time::Duration,
sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
};
use anyhow::Context;
@@ -169,8 +165,6 @@ struct Slot {
struct SlotInner {
key: Option<CacheKey>,
// for `coalesce_readers_permit`
permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
buf: &'static mut [u8; PAGE_SZ],
}
@@ -213,22 +207,6 @@ impl Slot {
}
}
impl SlotInner {
/// If there is aready a reader, drop our permit and share its permit, just like we share read access.
fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
let mut guard = self.permit.lock().unwrap();
if let Some(existing_permit) = guard.upgrade() {
drop(guard);
drop(permit);
existing_permit
} else {
let permit = Arc::new(permit);
*guard = Arc::downgrade(&permit);
permit
}
}
}
pub struct PageCache {
/// This contains the mapping from the cache key to buffer slot that currently
/// contains the page, if any.
@@ -246,8 +224,6 @@ pub struct PageCache {
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
pinned_slots: Arc<tokio::sync::Semaphore>,
/// Index of the next candidate to evict, for the Clock replacement algorithm.
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
@@ -255,28 +231,23 @@ pub struct PageCache {
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
/// until the guard is dropped.
///
pub struct PageReadGuard<'i> {
_permit: Arc<PinnedSlotsPermit>,
slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
}
pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
impl std::ops::Deref for PageReadGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.slot_guard.buf
self.0.buf
}
}
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
fn as_ref(&self) -> &[u8; PAGE_SZ] {
self.slot_guard.buf
self.0.buf
}
}
@@ -293,8 +264,6 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
pub struct PageWriteGuard<'i> {
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
_permit: PinnedSlotsPermit,
// Are the page contents currently valid?
// Used to mark pages as invalid that are assigned but not yet filled with data.
valid: bool,
@@ -379,10 +348,6 @@ impl PageCache {
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, PageReadGuard)> {
let Ok(permit) = self.try_get_pinned_slot_permit().await else {
return None;
};
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_materialized_page
@@ -397,10 +362,7 @@ impl PageCache {
lsn,
};
if let Some(guard) = self
.try_lock_for_read(&mut cache_key, &mut Some(permit))
.await
{
if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
@@ -483,29 +445,6 @@ impl PageCache {
// "mappings" after this section. But the routines in this section should
// not require changes.
async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
match tokio::time::timeout(
// Choose small timeout, neon_smgr does its own retries.
// https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
Duration::from_secs(10),
Arc::clone(&self.pinned_slots).acquire_owned(),
)
.await
{
Ok(res) => Ok(PinnedSlotsPermit(
res.expect("this semaphore is never closed"),
)),
Err(_timeout) => {
timer.stop_and_discard();
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
);
anyhow::bail!("timeout: there were page guards alive for all page cache slots")
}
}
}
/// Look up a page in the cache.
///
/// If the search criteria is not exact, *cache_key is updated with the key
@@ -515,11 +454,7 @@ impl PageCache {
///
/// If no page is found, returns None and *cache_key is left unmodified.
///
async fn try_lock_for_read(
&self,
cache_key: &mut CacheKey,
permit: &mut Option<PinnedSlotsPermit>,
) -> Option<PageReadGuard> {
async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
let cache_key_orig = cache_key.clone();
if let Some(slot_idx) = self.search_mapping(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
@@ -529,10 +464,7 @@ impl PageCache {
let inner = slot.inner.read().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageReadGuard {
_permit: inner.coalesce_readers_permit(permit.take().unwrap()),
slot_guard: inner,
});
return Some(PageReadGuard(inner));
} else {
// search_mapping might have modified the search key; restore it.
*cache_key = cache_key_orig;
@@ -575,8 +507,6 @@ impl PageCache {
cache_key: &mut CacheKey,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
@@ -593,21 +523,17 @@ impl PageCache {
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
debug_assert!(permit.is_none());
if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
if is_first_iteration {
hit.inc();
}
return Ok(ReadBufResult::Found(read_guard));
}
debug_assert!(permit.is_some());
is_first_iteration = false;
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -629,16 +555,7 @@ impl PageCache {
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
return Ok(ReadBufResult::NotFound(PageWriteGuard {
_permit: permit.take().unwrap(),
inner,
valid: false,
}));
@@ -649,11 +566,7 @@ impl PageCache {
/// found, returns None.
///
/// When locking a page for writing, the search criteria is always "exact".
async fn try_lock_for_write(
&self,
cache_key: &CacheKey,
permit: &mut Option<PinnedSlotsPermit>,
) -> Option<PageWriteGuard> {
async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we don't released the mapping
@@ -662,18 +575,7 @@ impl PageCache {
let inner = slot.inner.write().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
return Some(PageWriteGuard {
_permit: permit.take().unwrap(),
inner,
valid: true,
});
return Some(PageWriteGuard { inner, valid: true });
}
}
None
@@ -684,20 +586,15 @@ impl PageCache {
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
loop {
// First check if the key already exists in the cache.
if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
debug_assert!(permit.is_none());
if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
return Ok(WriteBufResult::Found(write_guard));
}
debug_assert!(permit.is_some());
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -719,16 +616,7 @@ impl PageCache {
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
return Ok(WriteBufResult::NotFound(PageWriteGuard {
_permit: permit.take().unwrap(),
inner,
valid: false,
}));
@@ -881,10 +769,7 @@ impl PageCache {
/// Find a slot to evict.
///
/// On return, the slot is empty and write-locked.
async fn find_victim(
&self,
_permit_witness: &PinnedSlotsPermit,
) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
let iter_limit = self.slots.len() * 10;
let mut iters = 0;
loop {
@@ -897,40 +782,13 @@ impl PageCache {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(_err) => {
// If we have looped through the whole buffer pool 10 times
// and still haven't found a victim buffer, something's wrong.
// Maybe all the buffers were in locked. That could happen in
// theory, if you have more threads holding buffers locked than
// there are buffers in the pool. In practice, with a reasonably
// large buffer pool it really shouldn't happen.
if iters > iter_limit {
// NB: Even with the permits, there's no hard guarantee that we will find a slot with
// any particular number of iterations: other threads might race ahead and acquire and
// release pins just as we're scanning the array.
//
// Imagine that nslots is 2, and as starting point, usage_count==1 on all
// slots. There are two threads running concurrently, A and B. A has just
// acquired the permit from the semaphore.
//
// A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
// B: Acquire permit.
// B: Look at slot 2, decrement its usage_count to zero and continue the search
// B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
// B: Release pin and permit again
// B: Acquire permit.
// B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
// B: Release pin and permit again
//
// Now we're back in the starting situation that both slots have
// usage_count 1, but A has now been through one iteration of the
// find_victim() loop. This can repeat indefinitely and on each
// iteration, A's iteration count increases by one.
//
// So, even though the semaphore for the permits is fair, the victim search
// itself happens in parallel and is not fair.
// Hence even with a permit, a task can theoretically be starved.
// To avoid this, we'd need tokio to give priority to tasks that are holding
// permits for longer.
// Note that just yielding to tokio during iteration without such
// priority boosting is likely counter-productive. We'd just give more opportunities
// for B to bump usage count, further starving A.
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::EvictIterLimit,
);
anyhow::bail!("exceeded evict iter limit");
}
continue;
@@ -941,7 +799,6 @@ impl PageCache {
self.remove_mapping(old_key);
inner.key = None;
}
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
return Ok((slot_idx, inner));
}
}
@@ -969,11 +826,7 @@ impl PageCache {
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
inner: tokio::sync::RwLock::new(SlotInner {
key: None,
buf,
permit: std::sync::Mutex::new(Weak::new()),
}),
inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
usage_count: AtomicU8::new(0),
}
})
@@ -985,7 +838,6 @@ impl PageCache {
slots,
next_evict_slot: AtomicUsize::new(0),
size_metrics,
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
}
}
}

View File

@@ -1265,10 +1265,7 @@ async fn get_active_tenant_with_timeout(
Ok(tenant) => tenant,
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
Err(GetTenantError::NotActive(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
Err(GetTenantError::Broken(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
unreachable!("we're calling get_tenant with active=false")
}
};
let wait_time = Duration::from_secs(30);

View File

@@ -130,15 +130,10 @@ pub async fn init_tenant_mgr(
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
//
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
resources
.deletion_queue_client
.recover(result.clone())
.await?;
}
resources
.deletion_queue_client
.recover(result.clone())
.await?;
Some(result)
} else {
@@ -510,11 +505,6 @@ pub enum GetTenantError {
NotFound(TenantId),
#[error("Tenant {0} is not active")]
NotActive(TenantId),
/// Broken is logically a subset of NotActive, but a distinct error is useful as
/// NotActive is usually a retryable state for API purposes, whereas Broken
/// is a stuck error state
#[error("Tenant is broken: {0}")]
Broken(String),
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
@@ -529,20 +519,10 @@ pub async fn get_tenant(
let tenant = m
.get(&tenant_id)
.ok_or(GetTenantError::NotFound(tenant_id))?;
match tenant.current_state() {
TenantState::Broken {
reason,
backtrace: _,
} if active_only => Err(GetTenantError::Broken(reason)),
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}
if active_only && !tenant.is_active() {
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}

View File

@@ -496,36 +496,13 @@ impl Timeline {
};
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
let path = self
.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
timer.stop_and_record();
let timer = RECONSTRUCT_TIME.start_timer();
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
timer.stop_and_record();
if cfg!(feature = "testing") && res.is_err() {
// it can only be walredo issue
use std::fmt::Write;
let mut msg = String::new();
path.into_iter().for_each(|(res, cont_lsn, layer)| {
writeln!(
msg,
"- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
layer(),
)
.expect("string grows")
});
// this is to rule out or provide evidence that we could in some cases read a duplicate
// walrecord
tracing::info!("walredo failed, path:\n{msg}");
}
res
RECONSTRUCT_TIME
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
.await
}
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
@@ -2247,7 +2224,7 @@ impl Timeline {
request_lsn: Lsn,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
) -> Result<(), PageReconstructError> {
// Start from the current timeline.
let mut timeline_owned;
let mut timeline = self;
@@ -2278,12 +2255,12 @@ impl Timeline {
// The function should have updated 'state'
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
match result {
ValueReconstructResult::Complete => return Ok(traversal_path),
ValueReconstructResult::Complete => return Ok(()),
ValueReconstructResult::Continue => {
// If we reached an earlier cached page image, we're done.
if cont_lsn == cached_lsn + 1 {
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
return Ok(traversal_path);
return Ok(());
}
if prev_lsn <= cont_lsn {
// Didn't make any progress in last iteration. Error out to avoid

View File

@@ -177,19 +177,19 @@ impl OpenFiles {
pub enum CrashsafeOverwriteError {
#[error("final path has no parent dir")]
FinalPathHasNoParentDir,
#[error("remove tempfile")]
#[error("remove tempfile: {0}")]
RemovePreviousTempfile(#[source] std::io::Error),
#[error("create tempfile")]
#[error("create tempfile: {0}")]
CreateTempfile(#[source] std::io::Error),
#[error("write tempfile")]
#[error("write tempfile: {0}")]
WriteContents(#[source] std::io::Error),
#[error("sync tempfile")]
#[error("sync tempfile: {0}")]
SyncTempfile(#[source] std::io::Error),
#[error("rename tempfile to final path")]
#[error("rename tempfile to final path: {0}")]
RenameTempfileToFinalPath(#[source] std::io::Error),
#[error("open final path parent dir")]
#[error("open final path parent dir: {0}")]
OpenFinalPathParentDir(#[source] std::io::Error),
#[error("sync final path parent dir")]
#[error("sync final path parent dir: {0}")]
SyncFinalPathParentDir(#[source] std::io::Error),
}
impl CrashsafeOverwriteError {

View File

@@ -38,9 +38,6 @@ use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
#[cfg(feature = "testing")]
use std::sync::atomic::{AtomicUsize, Ordering};
use crate::metrics::{
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
WAL_REDO_WAIT_TIME,
@@ -116,9 +113,6 @@ struct ProcessOutput {
pub struct PostgresRedoManager {
tenant_id: TenantId,
conf: &'static PageServerConf,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
stdout: Mutex<Option<ProcessOutput>>,
stdin: Mutex<Option<ProcessInput>>,
@@ -230,8 +224,6 @@ impl PostgresRedoManager {
PostgresRedoManager {
tenant_id,
conf,
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
stdin: Mutex::new(None),
stdout: Mutex::new(None),
stderr: Mutex::new(None),
@@ -298,25 +290,25 @@ impl PostgresRedoManager {
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
debug!(
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
len,
nbytes,
duration.as_micros(),
lsn
);
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
len,
nbytes,
duration.as_micros(),
lsn
);
// If something went wrong, don't try to reuse the process. Kill it, and
// next request will launch a new one.
if result.is_err() {
error!(
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn
);
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn
);
// self.stdin only holds stdin & stderr as_raw_fd().
// Dropping it as part of take() doesn't close them.
// The owning objects (ChildStdout and ChildStderr) are stored in
@@ -750,7 +742,7 @@ impl PostgresRedoManager {
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))]
fn apply_wal_records(
&self,
input: MutexGuard<Option<ProcessInput>>,
mut input: MutexGuard<Option<ProcessInput>>,
tag: BufferTag,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
@@ -787,23 +779,6 @@ impl PostgresRedoManager {
build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
fn apply_wal_records0(
&self,
writebuf: &[u8],
mut input: MutexGuard<Option<ProcessInput>>,
wal_redo_timeout: Duration,
) -> Result<Bytes, std::io::Error> {
let proc = input.as_mut().unwrap();
let mut nwrite = 0usize;
let stdout_fd = proc.stdout_fd;
@@ -1009,38 +984,6 @@ impl PostgresRedoManager {
}
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
/// Wrapper type around `std::process::Child` which guarantees that the child

View File

@@ -741,13 +741,6 @@ NeonProcessUtility(
break;
case T_DropdbStmt:
HandleDropDb(castNode(DropdbStmt, parseTree));
/*
* We do this here to hack around the fact that Postgres performs the drop
* INSIDE of standard_ProcessUtility, which means that if we try to
* abort the drop normally it'll be too late. DROP DATABASE can't be inside
* of a transaction block anyway, so this should be fine to do.
*/
NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
break;
case T_CreateRoleStmt:
HandleCreateRole(castNode(CreateRoleStmt, parseTree));

12
poetry.lock generated
View File

@@ -2415,18 +2415,18 @@ files = [
[[package]]
name = "urllib3"
version = "1.26.17"
version = "1.26.11"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4"
files = [
{file = "urllib3-1.26.17-py2.py3-none-any.whl", hash = "sha256:94a757d178c9be92ef5539b8840d48dc9cf1b2709c9d6b588232a055c524458b"},
{file = "urllib3-1.26.17.tar.gz", hash = "sha256:24d6a242c28d29af46c3fae832c36db3bbebcc533dd1bb549172cd739c82df21"},
{file = "urllib3-1.26.11-py2.py3-none-any.whl", hash = "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc"},
{file = "urllib3-1.26.11.tar.gz", hash = "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a"},
]
[package.extras]
brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"]
secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"]
brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"]
secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"]
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[[package]]

View File

@@ -160,19 +160,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
Test(_) => Some("test".to_owned()),
}
}
/// Get username from the credentials.
pub fn get_user(&self) -> &str {
use BackendType::*;
match self {
Console(_, creds) => creds.user,
Postgres(_, creds) => creds.user,
Link(_) => "link",
Test(_) => "test",
}
}
/// Authenticate the client via the requested backend, possibly using credentials.
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(

View File

@@ -95,6 +95,20 @@ pub mod errors {
_ => false,
}
}
fn cannot_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.cannot_retry(),
// locked can be returned when the endpoint was in transition
// or when quotas are exceeded. don't retry when quotas are exceeded
Self::Console {
status: http::StatusCode::LOCKED,
ref text,
} => text.contains("quota"),
_ => false,
}
}
}
impl From<reqwest::Error> for ApiError {

View File

@@ -414,6 +414,9 @@ where
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if e.cannot_retry() {
return Err(e.into());
}
(invalidate_cache(node_info), e)
}
};
@@ -501,6 +504,8 @@ pub fn handle_try_wake(
pub trait ShouldRetry {
fn could_retry(&self) -> bool;
/// return true if retrying definitely won't make a difference - or is harmful
fn cannot_retry(&self) -> bool;
fn should_retry(&self, num_retries: u32) -> bool {
match self {
_ if num_retries >= NUM_RETRIES_CONNECT => false,
@@ -517,6 +522,9 @@ impl ShouldRetry for io::Error {
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
)
}
fn cannot_retry(&self) -> bool {
false
}
}
impl ShouldRetry for tokio_postgres::error::DbError {
@@ -530,6 +538,22 @@ impl ShouldRetry for tokio_postgres::error::DbError {
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
)
}
fn cannot_retry(&self) -> bool {
use tokio_postgres::error::SqlState;
match self.code() {
&SqlState::TOO_MANY_CONNECTIONS
| &SqlState::INVALID_CATALOG_NAME
| &SqlState::INVALID_PASSWORD => true,
// pgbouncer errors?
&SqlState::PROTOCOL_VIOLATION => matches!(
self.message(),
"no more connections allowed (max_client_conn)"
| "server login has been failing, try again later (server_login_retry)"
| "query_wait_timeout"
),
_ => false,
}
}
}
impl ShouldRetry for tokio_postgres::Error {
@@ -542,6 +566,13 @@ impl ShouldRetry for tokio_postgres::Error {
false
}
}
fn cannot_retry(&self) -> bool {
if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
tokio_postgres::error::DbError::cannot_retry(db_err)
} else {
false
}
}
}
impl ShouldRetry for compute::ConnectionError {
@@ -552,6 +583,13 @@ impl ShouldRetry for compute::ConnectionError {
_ => false,
}
}
fn cannot_retry(&self) -> bool {
match self {
compute::ConnectionError::Postgres(err) => err.cannot_retry(),
compute::ConnectionError::CouldNotConnect(err) => err.cannot_retry(),
_ => false,
}
}
}
pub fn retry_after(num_retries: u32) -> time::Duration {
@@ -697,14 +735,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
.await
{
Ok(auth_result) => auth_result,
Err(e) => {
let user = creds.get_user();
let db = params.get("database");
let app = params.get("application_name");
let params_span = tracing::info_span!("", ?user, ?db, ?app);
return stream.throw_error(e).instrument(params_span).await;
}
Err(e) => return stream.throw_error(e).await,
};
let AuthSuccess {

View File

@@ -365,6 +365,9 @@ impl ShouldRetry for TestConnectError {
fn could_retry(&self) -> bool {
self.retryable
}
fn cannot_retry(&self) -> bool {
false
}
}
#[async_trait]

View File

@@ -37,7 +37,6 @@ from psycopg2.extensions import connection as PgConnection
from psycopg2.extensions import cursor as PgCursor
from psycopg2.extensions import make_dsn, parse_dsn
from typing_extensions import Literal
from urllib3.util.retry import Retry
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
@@ -1652,14 +1651,11 @@ class NeonPageserver(PgProtocol):
if '"testing"' not in self.version:
pytest.skip("pageserver was built without 'testing' feature")
def http_client(
self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
) -> PageserverHttpClient:
def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient:
return PageserverHttpClient(
port=self.service_port.http,
auth_token=auth_token,
is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
retries=retries,
)
@property

View File

@@ -7,8 +7,6 @@ from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from fixtures.log_helper import log
from fixtures.metrics import Metrics, parse_metrics
@@ -115,40 +113,12 @@ class TenantConfig:
class PageserverHttpClient(requests.Session):
def __init__(
self,
port: int,
is_testing_enabled_or_skip: Fn,
auth_token: Optional[str] = None,
retries: Optional[Retry] = None,
):
def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
super().__init__()
self.port = port
self.auth_token = auth_token
self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
if retries is None:
# We apply a retry policy that is different to the default `requests` behavior,
# because the pageserver has various transiently unavailable states that benefit
# from a client retrying on 503
retries = Retry(
# Status retries are for retrying on 503 while e.g. waiting for tenants to activate
status=5,
# Connection retries are for waiting for the pageserver to come up and listen
connect=5,
# No read retries: if a request hangs that is not expected behavior
# (this may change in future if we do fault injection of a kind that causes
# requests TCP flows to stick)
read=False,
backoff_factor=0,
status_forcelist=[503],
allowed_methods=None,
remove_headers_on_redirect=[],
)
self.mount("http://", HTTPAdapter(max_retries=retries))
if auth_token is not None:
self.headers["Authorization"] = f"Bearer {auth_token}"

View File

@@ -222,7 +222,7 @@ def get_scale_for_db(size_mb: int) -> int:
ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
)
@@ -250,9 +250,6 @@ def allure_attach_from_dir(dir: Path):
elif source.endswith(".html"):
attachment_type = "text/html"
extension = "html"
elif source.endswith(".walredo"):
attachment_type = "application/octet-stream"
extension = "walredo"
else:
attachment_type = "text/plain"
extension = attachment.suffix.removeprefix(".")

View File

@@ -211,12 +211,4 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
ddl.wait()
ddl.failures(False)
cur.execute("CREATE DATABASE failure WITH OWNER=cork")
ddl.wait()
with pytest.raises(psycopg2.InternalError):
ddl.failures(True)
cur.execute("DROP DATABASE failure")
ddl.wait()
ddl.pg.connect(dbname="failure") # Ensure we can connect after a failed drop
conn.close()

View File

@@ -169,6 +169,3 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
# Check that all the updates are visible
num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
assert num_updates == i * 100000
with open(neon_env_builder.test_output_dir / "foobar.walredo", "w") as file:
file.write("lets see if this ends in the report")

View File

@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
from fixtures.pg_version import PgVersion, xfail_on_postgres
from fixtures.types import Lsn, TenantId, TimelineId
@@ -532,24 +532,7 @@ def test_single_branch_get_tenant_size_grows(
assert size_after == prev, "size after restarting pageserver should not have changed"
def assert_size_approx_equal(size_a, size_b):
"""
Tests that evaluate sizes are checking the pageserver space consumption
that sits many layers below the user input. The exact space needed
varies slightly depending on postgres behavior.
Rather than expecting postgres to be determinstic and occasionally
failing the test, we permit sizes for the same data to vary by a few pages.
"""
# Determined empirically from examples of equality failures: they differ
# by page multiples of 8272, and usually by 1-3 pages. Tolerate 4 to avoid
# failing on outliers from that observed range.
threshold = 4 * 8272
assert size_a == pytest.approx(size_b, abs=threshold)
@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
def test_get_tenant_size_with_multiple_branches(
neon_env_builder: NeonEnvBuilder, test_output_dir: Path
):
@@ -590,7 +573,7 @@ def test_get_tenant_size_with_multiple_branches(
)
size_after_first_branch = http_client.tenant_size(tenant_id)
assert_size_approx_equal(size_after_first_branch, size_at_branch)
assert size_after_first_branch == size_at_branch
first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)
@@ -616,7 +599,7 @@ def test_get_tenant_size_with_multiple_branches(
"second-branch", main_branch_name, tenant_id
)
size_after_second_branch = http_client.tenant_size(tenant_id)
assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main)
assert size_after_second_branch == size_after_continuing_on_main
second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)
@@ -652,7 +635,7 @@ def test_get_tenant_size_with_multiple_branches(
# tenant_size but so far this has been reliable, even though at least gc
# and tenant_size race for the same locks
size_after = http_client.tenant_size(tenant_id)
assert_size_approx_equal(size_after, size_after_thinning_branch)
assert size_after == size_after_thinning_branch
size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
size_debug = http_client.tenant_size_debug(tenant_id)

View File

@@ -34,7 +34,6 @@ from fixtures.remote_storage import (
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, run_pg_bench_small, wait_until
from urllib3.util.retry import Retry
def test_timeline_delete(neon_simple_env: NeonEnv):
@@ -615,7 +614,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
child_timeline_id = env.neon_cli.create_branch("child", "main")
ps_http = env.pageserver.http_client(retries=Retry(0, read=False))
ps_http = env.pageserver.http_client()
failpoint_name = "persist_deleted_index_part"
ps_http.configure_failpoints((failpoint_name, "pause"))
@@ -855,7 +854,7 @@ def test_timeline_delete_resumed_on_attach(
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
# Polling after attach may fail with this
".*Resource temporarily unavailable.*Tenant not yet active",
f".*InternalServerError\\(Tenant {tenant_id} is not active.*",
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
)
)