Compare commits

..

1 Commits

Author SHA1 Message Date
Alexey Kondratov
2394c9c16d DO_NOT_MERGE: custom pg_session_jwt version without audit
It causes too much noise.
2025-04-29 14:20:40 +02:00
34 changed files with 121 additions and 332 deletions

View File

@@ -824,7 +824,7 @@ jobs:
- pg: v17
debian: bookworm
env:
VM_BUILDER_VERSION: v0.46.0
VM_BUILDER_VERSION: v0.42.2
steps:
- name: Harden the runner (Audit all outbound calls)

68
Cargo.lock generated
View File

@@ -2491,18 +2491,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "getrandom"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi 0.14.2+wasi-0.2.4",
]
[[package]]
name = "gettid"
version = "0.1.3"
@@ -5271,12 +5259,6 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
[[package]]
name = "rand"
version = "0.7.3"
@@ -5301,16 +5283,6 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
@@ -5331,16 +5303,6 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.3",
]
[[package]]
name = "rand_core"
version = "0.5.1"
@@ -5359,15 +5321,6 @@ dependencies = [
"getrandom 0.2.11",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom 0.3.2",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
@@ -7244,11 +7197,11 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497"
dependencies = [
"futures",
"nix 0.26.4",
"once_cell",
"rand 0.9.1",
"scopeguard",
"thiserror 1.0.69",
"tokio",
@@ -7855,6 +7808,7 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497"
dependencies = [
"bytes",
"io-uring",
@@ -8096,15 +8050,6 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasi"
version = "0.14.2+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "wasite"
version = "0.1.0"
@@ -8462,15 +8407,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "workspace_hack"
version = "0.1.0"

View File

@@ -187,8 +187,7 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
#tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-epoll-uring = { path = "../tokio-epoll-uring/tokio-epoll-uring" }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}

View File

@@ -1084,18 +1084,7 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
#########################################################################################
#
# Layer "rust extensions pgrx14"
#
#########################################################################################
FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14
ARG PG_VERSION
RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
#########################################################################################
#
# Layers "pg-onnx-build" and "pgrag-build"
@@ -1111,11 +1100,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
echo "#nothing to test here" > neon-test.sh
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \
echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \
echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C .
FROM rust-extensions-build-pgrx14 AS pgrag-build
FROM rust-extensions-build-pgrx12 AS pgrag-build
COPY --from=pgrag-src /ext-src/ /ext-src/
# Install build-time dependencies
@@ -1135,19 +1124,19 @@ RUN . venv/bin/activate && \
WORKDIR /ext-src/pgrag-src
RUN cd exts/rag && \
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control
RUN cd exts/rag_bge_small_en_v15 && \
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
cargo pgrx install --release --features remote_onnx && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control
RUN cd exts/rag_jina_reranker_v1_tiny_en && \
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
cargo pgrx install --release --features remote_onnx && \
@@ -1316,8 +1305,8 @@ ARG PG_VERSION
# Do not update without approve from proxy team
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
WORKDIR /ext-src
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \
echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/heads/alexk/tmp-disable-audit.tar.gz -O pg_session_jwt.tar.gz && \
echo "b868227950973df5186af54dfa03617536d21e96dddcb7b25e7ce199b4956bdb pg_session_jwt.tar.gz" | sha256sum --check && \
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \

View File

@@ -23,8 +23,6 @@
import 'sql_exporter/getpage_prefetch_requests_total.libsonnet',
import 'sql_exporter/getpage_prefetches_buffered.libsonnet',
import 'sql_exporter/getpage_sync_requests_total.libsonnet',
import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet',
import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet',
import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet',
import 'sql_exporter/getpage_wait_seconds_count.libsonnet',
import 'sql_exporter/getpage_wait_seconds_sum.libsonnet',

View File

@@ -1,9 +0,0 @@
{
metric_name: 'compute_getpage_max_inflight_stuck_time_ms',
type: 'gauge',
help: 'Max wait time for stuck requests among all backends. Includes only active stuck requests, terminated or disconnected ones are not accounted for',
values: [
'compute_getpage_max_inflight_stuck_time_ms',
],
query_ref: 'neon_perf_counters',
}

View File

@@ -1,9 +0,0 @@
{
metric_name: 'compute_getpage_stuck_requests_total',
type: 'counter',
help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout',
values: [
'compute_getpage_stuck_requests_total',
],
query_ref: 'neon_perf_counters',
}

View File

@@ -9,8 +9,6 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d(
getpage_wait_seconds_sum numeric,
getpage_prefetch_requests_total numeric,
getpage_sync_requests_total numeric,
compute_getpage_stuck_requests_total numeric,
compute_getpage_max_inflight_stuck_time_ms numeric,
getpage_prefetch_misses_total numeric,
getpage_prefetch_discards_total numeric,
getpage_prefetches_buffered numeric,

View File

@@ -38,6 +38,11 @@ Currently, the following metrics are collected:
Amount of WAL produced , by a timeline, i.e. last_record_lsn
This is an absolute, per-timeline metric.
- `resident_size`
Size of all the layer files in the tenant's directory on disk on the pageserver.
This is an absolute, per-tenant metric.
- `remote_storage_size`
Size of the remote storage (S3) directory.

View File

@@ -841,10 +841,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
let expected_end = match &end {
ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true,
// The timeline doesn't exist and we have been requested to not auto-create it.
// Compute requests for timelines that haven't been created yet
// might reach us before the storcon request to create those timelines.
TimelineNoCreate => true,
CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
if is_expected_io_error(io_error) =>
{
@@ -1063,8 +1059,6 @@ pub enum CopyStreamHandlerEnd {
Terminate,
#[error("EOF on COPY stream")]
EOF,
#[error("timeline not found, and allow_timeline_creation is false")]
TimelineNoCreate,
/// The connection was lost
#[error("connection error: {0}")]
Disconnected(#[from] ConnectionError),

View File

@@ -303,8 +303,7 @@ pub struct PullTimelineRequest {
#[derive(Debug, Serialize, Deserialize)]
pub struct PullTimelineResponse {
/// Donor safekeeper host.
/// None if no pull happened because the timeline already exists.
pub safekeeper_host: Option<String>,
// Donor safekeeper host
pub safekeeper_host: String,
// TODO: add more fields?
}

View File

@@ -46,7 +46,6 @@ pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
let mut sigint = signal(SignalKind::interrupt()).unwrap();
let mut sigterm = signal(SignalKind::terminate()).unwrap();
let mut sigquit = signal(SignalKind::quit()).unwrap();
let mut sigusr1 = signal(SignalKind::user_defined1()).unwrap();
loop {
let signal = tokio::select! {
@@ -56,17 +55,13 @@ pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
}
_ = sigint.recv() => "SIGINT",
_ = sigterm.recv() => "SIGTERM",
_ = sigusr1.recv() => {
info!("Got signal SIGUSR1");
continue;
}
};
if !token.is_cancelled() {
info!(thread_id=?std::thread::current().id(), "Got signal {signal}. Terminating gracefully in fast shutdown mode.");
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
token.cancel();
} else {
info!(thread_id=?std::thread::current().id(), "Got signal {signal}. Already shutting down.");
info!("Got signal {signal}. Already shutting down.");
}
}
}

View File

@@ -30,6 +30,9 @@ pub(super) enum Name {
/// Tenant remote size
#[serde(rename = "remote_storage_size")]
RemoteSize,
/// Tenant resident size
#[serde(rename = "resident_size")]
ResidentSize,
/// Tenant synthetic size
#[serde(rename = "synthetic_storage_size")]
SyntheticSize,
@@ -184,6 +187,18 @@ impl MetricsKey {
.absolute_values()
}
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
///
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: Name::ResidentSize,
}
.absolute_values()
}
/// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
///
/// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
@@ -246,7 +261,10 @@ where
let mut tenants = std::pin::pin!(tenants);
while let Some((tenant_id, tenant)) = tenants.next().await {
let mut tenant_resident_size = 0;
let timelines = tenant.list_timelines();
let timelines_len = timelines.len();
for timeline in timelines {
let timeline_id = timeline.timeline_id;
@@ -269,9 +287,16 @@ where
continue;
}
}
tenant_resident_size += timeline.resident_physical_size();
}
let snap = TenantSnapshot::collect(&tenant);
if timelines_len == 0 {
// Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
tenant_resident_size = 1;
}
let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
}
@@ -280,14 +305,19 @@ where
/// In-between abstraction to allow testing metrics without actual Tenants.
struct TenantSnapshot {
resident_size: u64,
remote_size: u64,
synthetic_size: u64,
}
impl TenantSnapshot {
/// Collect tenant status to have metrics created out of it.
fn collect(t: &Arc<crate::tenant::TenantShard>) -> Self {
///
/// `resident_size` is calculated of the timelines we had access to for other metrics, so we
/// cannot just list timelines here.
fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
TenantSnapshot {
resident_size,
remote_size: t.remote_size(),
// Note that this metric is calculated in a separate bgworker
// Here we only use cached value, which may lag behind the real latest one
@@ -304,6 +334,8 @@ impl TenantSnapshot {
) {
let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
let synthetic_size = {
let factory = MetricsKey::synthetic_size(tenant_id);
let mut synthetic_size = self.synthetic_size;
@@ -323,7 +355,11 @@ impl TenantSnapshot {
}
};
metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten());
metrics.extend(
[Some(remote_size), Some(resident_size), synthetic_size]
.into_iter()
.flatten(),
);
}
}

View File

@@ -224,6 +224,7 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
let tenant_id = TenantId::generate();
let ts = TenantSnapshot {
resident_size: 1000,
remote_size: 1000,
// not yet calculated
synthetic_size: 0,
@@ -244,6 +245,7 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
metrics,
&[
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
MetricsKey::resident_size(tenant_id).at(now, 1000),
MetricsKey::synthetic_size(tenant_id).at(now, 1000),
]
);
@@ -254,6 +256,7 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
let tenant_id = TenantId::generate();
let ts = TenantSnapshot {
resident_size: 1000,
remote_size: 1000,
// not yet calculated
synthetic_size: 0,
@@ -271,6 +274,7 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
metrics,
&[
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
MetricsKey::resident_size(tenant_id).at(now, 1000),
// no synthetic size here
]
);
@@ -291,13 +295,14 @@ pub(crate) const fn metric_examples_old(
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [RawMetric; 5] {
) -> [RawMetric; 6] {
[
MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_until_old_format(before, now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
MetricsKey::resident_size(tenant_id).at_old_format(now, 0),
MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
]
}
@@ -307,12 +312,13 @@ pub(crate) const fn metric_examples(
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [NewRawMetric; 5] {
) -> [NewRawMetric; 6] {
[
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
MetricsKey::resident_size(tenant_id).at(now, 0),
MetricsKey::synthetic_size(tenant_id).at(now, 1),
]
}

View File

@@ -521,6 +521,10 @@ mod tests {
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
@@ -560,7 +564,7 @@ mod tests {
assert_eq!(upgraded_samples, new_samples);
}
fn metric_samples_old() -> [RawMetric; 5] {
fn metric_samples_old() -> [RawMetric; 6] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);
@@ -572,7 +576,7 @@ mod tests {
super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
}
fn metric_samples() -> [NewRawMetric; 5] {
fn metric_samples() -> [NewRawMetric; 6] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);

View File

@@ -497,24 +497,6 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_ondemand_download_bytes_total",
"Total bytes of layers on-demand downloaded",
&["task_kind"]
)
.expect("failed to define a metric")
});
pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_ondemand_download_count",
"Total count of layers on-demand downloaded",
&["task_kind"]
)
.expect("failed to define a metric")
});
pub(crate) mod wait_ondemand_download_time {
use super::*;
const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
@@ -2198,10 +2180,6 @@ impl BasebackupQueryTimeOngoingRecording<'_> {
// If you want to change categorize of a specific error, also change it in `log_query_error`.
let metric = match res {
Ok(_) => &self.parent.ok,
Err(QueryError::Shutdown) => {
// Do not observe ok/err for shutdown
return;
}
Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
if is_expected_io_error(io_error) =>
{

View File

@@ -1035,25 +1035,10 @@ impl PageServerHandler {
// avoid a somewhat costly Span::record() by constructing the entire span in one go.
macro_rules! mkspan {
(before shard routing) => {{
tracing::info_span!(
parent: &parent_span,
"handle_get_page_request",
rel = %req.rel,
blkno = %req.blkno,
req_lsn = %req.hdr.request_lsn,
not_modified_since_lsn = %req.hdr.not_modified_since
)
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
}};
($shard_id:expr) => {{
tracing::info_span!(
parent: &parent_span,
"handle_get_page_request",
rel = %req.rel,
blkno = %req.blkno,
req_lsn = %req.hdr.request_lsn,
not_modified_since_lsn = %req.hdr.not_modified_since,
shard_id = %$shard_id
)
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
}};
}
@@ -1117,7 +1102,6 @@ impl PageServerHandler {
shard_id = %shard.get_shard_identity().shard_slug(),
timeline_id = %timeline_id,
lsn = %req.hdr.request_lsn,
not_modified_since_lsn = %req.hdr.not_modified_since,
request_id = %req.hdr.reqid,
key = %key,
)

View File

@@ -4,7 +4,6 @@ use std::sync::{Arc, Weak};
use std::time::{Duration, SystemTime};
use crate::PERF_TRACE_TARGET;
use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::keyspace::KeySpace;
@@ -1256,14 +1255,6 @@ impl LayerInner {
self.access_stats.record_residence_event();
let task_kind: &'static str = ctx.task_kind().into();
ONDEMAND_DOWNLOAD_BYTES
.with_label_values(&[task_kind])
.inc_by(self.desc.file_size);
ONDEMAND_DOWNLOAD_COUNT
.with_label_values(&[task_kind])
.inc();
Ok(self.initialize_after_layer_is_on_disk(permit))
}
Err(e) => {

View File

@@ -76,13 +76,12 @@ pub async fn thread_local_system() -> Handle {
)
.await;
let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id);
info!(thread_id=?std::thread::current().id(), thread_name=?std::thread::current().name(), "launching system");
let res = System::launch_with_metrics(per_system_metrics, usize::try_from(inner.thread_local_state_id).unwrap())
let res = System::launch_with_metrics(per_system_metrics)
// this might move us to another executor thread => loop outside the get_or_try_init, not inside it
.await;
match res {
Ok(system) => {
info!(thread_id=?std::thread::current().id(), thread_name=?std::thread::current().name(), "successfully launched system");
info!("successfully launched system");
metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
Ok(system)
}

View File

@@ -687,14 +687,8 @@ prefetch_wait_for(uint64 ring_index)
END_PREFETCH_RECEIVE_WORK();
CHECK_FOR_INTERRUPTS();
}
if (result)
{
/* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */
PrefetchRequest *slot = GetPrfSlot(ring_index);
return slot->status == PRFS_RECEIVED;
}
return false;
;
return result;
}
/*

View File

@@ -877,7 +877,6 @@ retry:
int port;
int sndbuf;
int recvbuf;
uint64* max_wait;
get_local_port(PQsocket(pageserver_conn), &port);
get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf);
@@ -888,10 +887,7 @@ retry:
shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf,
pageserver_conn->inStart, pageserver_conn->inEnd);
shard->receive_last_log_time = now;
MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged;
shard->receive_logged = true;
max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms;
*max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start));
}
/*
@@ -914,7 +910,6 @@ retry:
get_local_port(PQsocket(pageserver_conn), &port);
neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)",
INSTR_TIME_GET_DOUBLE(since_start), port);
MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0;
pageserver_disconnect(shard_no);
return -1;
}
@@ -938,7 +933,6 @@ retry:
INSTR_TIME_SET_ZERO(shard->receive_start_time);
INSTR_TIME_SET_ZERO(shard->receive_last_log_time);
shard->receive_logged = false;
MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0;
return ret;
}

View File

@@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram,
static metric_t *
neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
{
#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12)
#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10)
metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
int i = 0;
@@ -166,8 +166,6 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
APPEND_METRIC(getpage_prefetch_requests_total);
APPEND_METRIC(getpage_sync_requests_total);
APPEND_METRIC(compute_getpage_stuck_requests_total);
APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms);
APPEND_METRIC(getpage_prefetch_misses_total);
APPEND_METRIC(getpage_prefetch_discards_total);
APPEND_METRIC(pageserver_requests_sent_total);
@@ -296,11 +294,6 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
totals.file_cache_hits_total += counters->file_cache_hits_total;
histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);
totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total;
totals.compute_getpage_max_inflight_stuck_time_ms = Max(
totals.compute_getpage_max_inflight_stuck_time_ms,
counters->compute_getpage_max_inflight_stuck_time_ms);
}
metrics = neon_perf_counters_to_metrics(&totals);

View File

@@ -57,18 +57,6 @@ typedef struct
uint64 getpage_prefetch_requests_total;
uint64 getpage_sync_requests_total;
/*
* Total number of Getpage requests left without an answer for more than
* pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout
*/
uint64 compute_getpage_stuck_requests_total;
/*
* Longest waiting time for active stuck requests. If a stuck request gets a
* response or disconnects, this metric is updated
*/
uint64 compute_getpage_max_inflight_stuck_time_ms;
/*
* Total number of readahead misses; consisting of either prefetches that
* don't satisfy the LSN bounds, or cases where no readahead was issued

View File

@@ -836,7 +836,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
{
uint32 n_greeted = 0;
for (uint32 i = 0; i < mset->len; i++)
for (uint32 i = 0; i < wp->mconf.members.len; i++)
{
Safekeeper *sk = msk[i];
@@ -1106,7 +1106,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
{
uint32 n_votes = 0;
for (uint32 i = 0; i < mset->len; i++)
for (uint32 i = 0; i < wp->mconf.members.len; i++)
{
Safekeeper *sk = msk[i];

View File

@@ -63,7 +63,7 @@
char *wal_acceptors_list = "";
int wal_acceptor_reconnect_timeout = 1000;
int wal_acceptor_connection_timeout = 10000;
int safekeeper_proto_version = 3;
int safekeeper_proto_version = 2;
/* Set to true in the walproposer bgw. */
static bool am_walproposer;
@@ -228,7 +228,7 @@ nwp_register_gucs(void)
"Version of compute <-> safekeeper protocol.",
"Used while migrating from 2 to 3.",
&safekeeper_proto_version,
3, 0, INT_MAX,
2, 0, INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);

View File

@@ -32,7 +32,7 @@ To play with it locally one may start proxy over a local postgres installation
(see end of this page on how to generate certs with openssl):
```
LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
```
If both postgres and proxy are running you may send a SQL query:
@@ -130,7 +130,7 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key
Then we need to build proxy with 'testing' feature and run, e.g.:
```sh
RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
```
Now from client you can start a new session:

View File

@@ -132,10 +132,11 @@ impl Drop for LoggingGuard {
}
}
// TODO: make JSON the default
#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)]
enum LogFormat {
Text,
#[default]
Text = 1,
Json,
}

View File

@@ -401,10 +401,7 @@ pub async fn handle_request(
request.timeline_id,
));
if existing_tli.is_ok() {
info!("Timeline {} already exists", request.timeline_id);
return Ok(PullTimelineResponse {
safekeeper_host: None,
});
bail!("Timeline {} already exists", request.timeline_id);
}
let mut http_client = reqwest::Client::builder();
@@ -428,25 +425,8 @@ pub async fn handle_request(
let mut statuses = Vec::new();
for (i, response) in responses.into_iter().enumerate() {
match response {
Ok(status) => {
statuses.push((status, i));
}
Err(e) => {
info!("error fetching status from {}: {e}", http_hosts[i]);
}
}
}
// Allow missing responses from up to one safekeeper (say due to downtime)
// e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
// offline and C comes online. Then we want a pull on C with A and B as hosts to work.
let min_required_successful = (http_hosts.len() - 1).max(1);
if statuses.len() < min_required_successful {
bail!(
"only got {} successful status responses. required: {min_required_successful}",
statuses.len()
)
let status = response.context(format!("fetching status from {}", http_hosts[i]))?;
statuses.push((status, i));
}
// Find the most advanced safekeeper
@@ -556,6 +536,6 @@ async fn pull_timeline(
.await?;
Ok(PullTimelineResponse {
safekeeper_host: Some(host),
safekeeper_host: host,
})
}

View File

@@ -32,7 +32,7 @@ use crate::metrics::{
WAL_RECEIVERS,
};
use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage};
use crate::timeline::{TimelineError, WalResidentTimeline};
use crate::timeline::WalResidentTimeline;
const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
@@ -357,14 +357,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
.await
.context("create timeline")?
} else {
let timeline_res = self.global_timelines.get(self.ttid);
match timeline_res {
Ok(tl) => tl,
Err(TimelineError::NotFound(_)) => {
return Err(CopyStreamHandlerEnd::TimelineNoCreate);
}
other => other.context("get_timeline")?,
}
self.global_timelines
.get(self.ttid)
.context("get timeline")?
};
tli.wal_residence_guard().await?
}

View File

@@ -19,8 +19,7 @@ use storage_controller::service::chaos_injector::ChaosInjector;
use storage_controller::service::{
Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT,
MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT, Service,
PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service,
};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
@@ -133,10 +132,6 @@ struct Cli {
#[arg(long)]
priority_reconciler_concurrency: Option<usize>,
/// Maximum number of safekeeper reconciliations that may run in parallel (per safekeeper)
#[arg(long)]
safekeeper_reconciler_concurrency: Option<usize>,
/// Tenant API rate limit, as requests per second per tenant.
#[arg(long, default_value = "10")]
tenant_rate_limit: NonZeroU32,
@@ -408,9 +403,6 @@ async fn async_main() -> anyhow::Result<()> {
priority_reconciler_concurrency: args
.priority_reconciler_concurrency
.unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT),
safekeeper_reconciler_concurrency: args
.safekeeper_reconciler_concurrency
.unwrap_or(SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT),
tenant_rate_limit: args.tenant_rate_limit,
split_threshold: args.split_threshold,
max_split_shards: args.max_split_shards,

View File

@@ -194,7 +194,6 @@ pub(crate) enum LeadershipStatus {
pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
@@ -383,9 +382,6 @@ pub struct Config {
/// How many high-priority Reconcilers may be spawned concurrently
pub priority_reconciler_concurrency: usize,
/// How many safekeeper reconciles may happen concurrently (per safekeeper)
pub safekeeper_reconciler_concurrency: usize,
/// How many API requests per second to allow per tenant, across all
/// tenant-scoped API endpoints. Further API requests queue until ready.
pub tenant_rate_limit: NonZeroU32,
@@ -3663,7 +3659,7 @@ impl Service {
locations: ShardMutationLocations,
http_client: reqwest::Client,
jwt: Option<String>,
mut create_req: TimelineCreateRequest,
create_req: TimelineCreateRequest,
) -> Result<TimelineInfo, ApiError> {
let latest = locations.latest.node;
@@ -3682,15 +3678,6 @@ impl Service {
.await
.map_err(|e| passthrough_api_error(&latest, e))?;
// If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use
// the initdb generated by the latest location, rather than generating their own. This avoids racing uploads
// of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries.
if tenant_shard_id.is_shard_zero() {
if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } = &mut create_req.mode {
*existing_initdb_timeline_id = Some(create_req.new_timeline_id);
}
}
// We propagate timeline creations to all attached locations such that a compute
// for the new timeline is able to start regardless of the current state of the
// tenant shard reconciliation.

View File

@@ -3,10 +3,7 @@ use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
use clashmap::{ClashMap, Entry};
use safekeeper_api::models::PullTimelineRequest;
use safekeeper_client::mgmt_api;
use tokio::sync::{
Semaphore,
mpsc::{self, UnboundedReceiver, UnboundedSender},
};
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{
@@ -209,27 +206,18 @@ impl ReconcilerHandle {
}
pub(crate) struct SafekeeperReconciler {
inner: SafekeeperReconcilerInner,
concurrency_limiter: Arc<Semaphore>,
service: Arc<Service>,
rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
cancel: CancellationToken,
}
/// Thin wrapper over `Service` to not clutter its inherent functions
#[derive(Clone)]
struct SafekeeperReconcilerInner {
service: Arc<Service>,
}
impl SafekeeperReconciler {
fn spawn(cancel: CancellationToken, service: Arc<Service>) -> ReconcilerHandle {
// We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking.
let (tx, rx) = mpsc::unbounded_channel();
let concurrency = service.config.safekeeper_reconciler_concurrency;
let mut reconciler = SafekeeperReconciler {
inner: SafekeeperReconcilerInner { service },
service,
rx,
concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
cancel: cancel.clone(),
};
let handle = ReconcilerHandle {
@@ -242,44 +230,31 @@ impl SafekeeperReconciler {
}
async fn run(&mut self) {
loop {
// TODO add parallelism with semaphore here
let req = tokio::select! {
req = self.rx.recv() => req,
_ = self.cancel.cancelled() => break,
};
let Some((req, req_cancel)) = req else { break };
let permit_res = tokio::select! {
req = self.concurrency_limiter.clone().acquire_owned() => req,
_ = self.cancel.cancelled() => break,
};
let Ok(_permit) = permit_res else { return };
let inner = self.inner.clone();
if req_cancel.is_cancelled() {
continue;
}
tokio::task::spawn(async move {
let kind = req.kind;
let tenant_id = req.tenant_id;
let timeline_id = req.timeline_id;
let node_id = req.safekeeper.skp.id;
inner
.reconcile_one(req, req_cancel)
.instrument(tracing::info_span!(
"reconcile_one",
?kind,
%tenant_id,
?timeline_id,
%node_id,
))
.await;
});
let kind = req.kind;
let tenant_id = req.tenant_id;
let timeline_id = req.timeline_id;
let node_id = req.safekeeper.skp.id;
self.reconcile_one(req, req_cancel)
.instrument(tracing::info_span!(
"reconcile_one",
?kind,
%tenant_id,
?timeline_id,
%node_id,
))
.await;
}
}
}
impl SafekeeperReconcilerInner {
async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
let req_host = req.safekeeper.skp.host.clone();
match req.kind {
@@ -306,11 +281,10 @@ impl SafekeeperReconcilerInner {
req,
async |client| client.pull_timeline(&pull_req).await,
|resp| {
if let Some(host) = resp.safekeeper_host {
tracing::info!("pulled timeline from {host} onto {req_host}");
} else {
tracing::info!("timeline already present on safekeeper on {req_host}");
}
tracing::info!(
"pulled timeline from {} onto {req_host}",
resp.safekeeper_host,
);
},
req_cancel,
)

View File

@@ -2,8 +2,6 @@ from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
@@ -12,7 +10,6 @@ if TYPE_CHECKING:
# while initial compute node is down and pageserver is lagging behind safekeepers.
# Ensure that basebackup after restart of all components is correct
# and new compute node contains all data.
@pytest.mark.repeat(1000)
def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()

View File

@@ -506,6 +506,7 @@ class SyntheticSizeVerifier:
PER_METRIC_VERIFIERS = {
"remote_storage_size": CannotVerifyAnything,
"resident_size": CannotVerifyAnything,
"written_size": WrittenDataVerifier,
"written_data_bytes_delta": WrittenDataDeltaVerifier,
"timeline_logical_size": CannotVerifyAnything,