mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-23 16:10:37 +00:00
Merge branch 'main' into communicator-rewrite
This commit is contained in:
@@ -33,6 +33,7 @@ workspace-members = [
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"desim",
|
||||
"json",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"postgres_backend",
|
||||
|
||||
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -7,6 +7,7 @@ self-hosted-runner:
|
||||
- small-metal
|
||||
- small-arm64
|
||||
- unit-perf
|
||||
- unit-perf-aws-arm
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- AWS_ECR_REGION
|
||||
|
||||
23
.github/workflows/build_and_test.yml
vendored
23
.github/workflows/build_and_test.yml
vendored
@@ -87,6 +87,24 @@ jobs:
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
secrets: inherit
|
||||
|
||||
lint-openapi-spec:
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [ meta, check-permissions ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
- run: make lint-openapi-spec
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ meta, check-permissions, build-build-tools-image ]
|
||||
# No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
@@ -306,14 +324,14 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, unit-perf ]
|
||||
runs-on: [ self-hosted, unit-perf-aws-arm ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -986,6 +1004,7 @@ jobs:
|
||||
- name: Verify docker-compose example and test extensions
|
||||
timeout-minutes: 60
|
||||
env:
|
||||
PARALLEL_COMPUTES: 3
|
||||
TAG: >-
|
||||
${{
|
||||
needs.meta.outputs.run-kind == 'compute-rc-pr'
|
||||
|
||||
4
.github/workflows/periodic_pagebench.yml
vendored
4
.github/workflows/periodic_pagebench.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Periodic pagebench performance test on unit-perf hetzner runner
|
||||
name: Periodic pagebench performance test on unit-perf-aws-arm runners
|
||||
|
||||
on:
|
||||
schedule:
|
||||
@@ -40,7 +40,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, unit-perf ]
|
||||
runs-on: [ self-hosted, unit-perf-aws-arm ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
|
||||
4
.github/workflows/proxy-benchmark.yml
vendored
4
.github/workflows/proxy-benchmark.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Periodic proxy performance test on unit-perf hetzner runner
|
||||
name: Periodic proxy performance test on unit-perf-aws-arm runners
|
||||
|
||||
on:
|
||||
push: # TODO: remove after testing
|
||||
@@ -32,7 +32,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [self-hosted, unit-perf]
|
||||
runs-on: [self-hosted, unit-perf-aws-arm]
|
||||
timeout-minutes: 60 # 1h timeout
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -16,6 +16,7 @@ neon.iml
|
||||
/integration_tests/.neon
|
||||
compaction-suite-results.*
|
||||
pgxn/neon/communicator/communicator_bindings.h
|
||||
docker-compose/docker-compose-parallel.yml
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
|
||||
41
Cargo.lock
generated
41
Cargo.lock
generated
@@ -1427,6 +1427,7 @@ dependencies = [
|
||||
"p256 0.13.2",
|
||||
"pageserver_page_api",
|
||||
"postgres",
|
||||
"postgres-types",
|
||||
"postgres_initdb",
|
||||
"postgres_versioninfo",
|
||||
"regex",
|
||||
@@ -3597,6 +3598,15 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "json"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"itoa",
|
||||
"ryu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "json-structural-diff"
|
||||
version = "0.2.0"
|
||||
@@ -4454,6 +4464,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"svg_fmt",
|
||||
"thiserror 1.0.69",
|
||||
@@ -4471,6 +4482,7 @@ dependencies = [
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"base64 0.22.1",
|
||||
"bincode",
|
||||
"bit_field",
|
||||
"byteorder",
|
||||
@@ -4632,30 +4644,18 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"compute_api",
|
||||
"dashmap 5.5.0",
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"pageserver_page_api",
|
||||
"priority-queue",
|
||||
"rand 0.8.5",
|
||||
"scopeguard",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tower 0.4.13",
|
||||
"tracing",
|
||||
"utils",
|
||||
"uuid",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5271,17 +5271,6 @@ dependencies = [
|
||||
"elliptic-curve 0.13.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "priority-queue"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5676d703dda103cbb035b653a9f11448c0a7216c7926bd35fcb5865475d0c970"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"equivalent",
|
||||
"indexmap 2.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.94"
|
||||
@@ -5915,6 +5904,8 @@ dependencies = [
|
||||
"azure_identity",
|
||||
"azure_storage",
|
||||
"azure_storage_blobs",
|
||||
"base64 0.22.1",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
@@ -8993,8 +8984,10 @@ dependencies = [
|
||||
"fail",
|
||||
"form_urlencoded",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"generic-array",
|
||||
"getrandom 0.2.11",
|
||||
|
||||
@@ -44,6 +44,7 @@ members = [
|
||||
"libs/walproposer",
|
||||
"libs/wal_decoder",
|
||||
"libs/postgres_initdb",
|
||||
"libs/proxy/json",
|
||||
"libs/proxy/postgres-protocol2",
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
|
||||
9
Makefile
9
Makefile
@@ -220,6 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
|
||||
setup-pre-commit-hook:
|
||||
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
|
||||
|
||||
.PHONY: lint-openapi-spec
|
||||
lint-openapi-spec:
|
||||
# operation-2xx-response: pageserver timeline delete returns 404 on success
|
||||
find . -iname "openapi_spec.y*ml" -exec\
|
||||
docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\
|
||||
--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
|
||||
--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
|
||||
lint {} \+
|
||||
|
||||
# Targets for building PostgreSQL are defined in postgres.mk.
|
||||
#
|
||||
# But if the caller has indicated that PostgreSQL is already
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
disallowed-methods = [
|
||||
"tokio::task::block_in_place",
|
||||
|
||||
# Allow this for now, to deny it later once we stop using Handle::block_on completely
|
||||
# "tokio::runtime::Handle::block_on",
|
||||
# use tokio_epoll_uring_ext instead
|
||||
"tokio_epoll_uring::thread_local_system",
|
||||
|
||||
# tokio-epoll-uring:
|
||||
# - allow-invalid because the method doesn't exist on macOS
|
||||
{ path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true }
|
||||
]
|
||||
|
||||
disallowed-macros = [
|
||||
|
||||
@@ -1915,10 +1915,10 @@ RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /e
|
||||
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
|
||||
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
|
||||
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq parallel \
|
||||
&& apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
|
||||
ENV PATH=/usr/local/pgsql/bin:$PATH
|
||||
ENV PGHOST=compute
|
||||
ENV PGHOST=compute1
|
||||
ENV PGPORT=55433
|
||||
ENV PGUSER=cloud_admin
|
||||
ENV PGDATABASE=postgres
|
||||
|
||||
@@ -66,7 +66,7 @@ url.workspace = true
|
||||
uuid.workspace = true
|
||||
walkdir.workspace = true
|
||||
x509-cert.workspace = true
|
||||
|
||||
postgres-types.workspace = true
|
||||
postgres_versioninfo.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
compute_api.workspace = true
|
||||
|
||||
@@ -46,11 +46,14 @@ stateDiagram-v2
|
||||
Configuration --> Failed : Failed to configure the compute
|
||||
Configuration --> Running : Compute has been configured
|
||||
Empty --> Init : Compute spec is immediately available
|
||||
Empty --> TerminationPending : Requested termination
|
||||
Empty --> TerminationPendingFast : Requested termination
|
||||
Empty --> TerminationPendingImmediate : Requested termination
|
||||
Init --> Failed : Failed to start Postgres
|
||||
Init --> Running : Started Postgres
|
||||
Running --> TerminationPending : Requested termination
|
||||
TerminationPending --> Terminated : Terminated compute
|
||||
Running --> TerminationPendingFast : Requested termination
|
||||
Running --> TerminationPendingImmediate : Requested termination
|
||||
TerminationPendingFast --> Terminated compute with 30s delay for cplane to inspect status
|
||||
TerminationPendingImmediate --> Terminated : Terminated compute immediately
|
||||
Failed --> [*] : Compute exited
|
||||
Terminated --> [*] : Compute exited
|
||||
```
|
||||
|
||||
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
|
||||
LfcPrewarmState, TlsConfig,
|
||||
LfcPrewarmState, PromoteState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
|
||||
@@ -30,8 +30,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::{spawn, time};
|
||||
use tokio::{spawn, sync::watch, task::JoinHandle, time};
|
||||
use tracing::{Instrument, debug, error, info, instrument, warn};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -176,6 +175,7 @@ pub struct ComputeState {
|
||||
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
|
||||
/// mode == ComputeMode::Primary. None otherwise
|
||||
pub terminate_flush_lsn: Option<Lsn>,
|
||||
pub promote_state: Option<watch::Receiver<PromoteState>>,
|
||||
|
||||
pub metrics: ComputeMetrics,
|
||||
}
|
||||
@@ -193,6 +193,7 @@ impl ComputeState {
|
||||
lfc_prewarm_state: LfcPrewarmState::default(),
|
||||
lfc_offload_state: LfcOffloadState::default(),
|
||||
terminate_flush_lsn: None,
|
||||
promote_state: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -983,14 +984,20 @@ impl ComputeNode {
|
||||
None
|
||||
};
|
||||
|
||||
let mut delay_exit = false;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.terminate_flush_lsn = lsn;
|
||||
if let ComputeStatus::TerminationPending { mode } = state.status {
|
||||
|
||||
let delay_exit = state.status == ComputeStatus::TerminationPendingFast;
|
||||
if state.status == ComputeStatus::TerminationPendingFast
|
||||
|| state.status == ComputeStatus::TerminationPendingImmediate
|
||||
{
|
||||
info!(
|
||||
"Changing compute status from {} to {}",
|
||||
state.status,
|
||||
ComputeStatus::Terminated
|
||||
);
|
||||
state.status = ComputeStatus::Terminated;
|
||||
self.state_changed.notify_all();
|
||||
// we were asked to terminate gracefully, don't exit to avoid restart
|
||||
delay_exit = mode == compute_api::responses::TerminateMode::Fast
|
||||
}
|
||||
drop(state);
|
||||
|
||||
@@ -1826,6 +1833,8 @@ impl ComputeNode {
|
||||
tls_config,
|
||||
)?;
|
||||
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
if !spec.skip_pg_catalog_updates {
|
||||
let max_concurrent_connections = spec.reconfigure_concurrency;
|
||||
// Temporarily reset max_cluster_size in config
|
||||
@@ -1845,10 +1854,9 @@ impl ComputeNode {
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
self.pg_reload_conf()?;
|
||||
}
|
||||
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let unknown_op = "unknown".to_string();
|
||||
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
|
||||
info!(
|
||||
@@ -1921,7 +1929,8 @@ impl ComputeNode {
|
||||
|
||||
// exit loop
|
||||
ComputeStatus::Failed
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Terminated => break 'cert_update,
|
||||
|
||||
// wait
|
||||
@@ -2455,19 +2464,11 @@ LIMIT 100",
|
||||
// If the value is -1, we never suspend so set the value to default collection.
|
||||
// If the value is 0, it means default, we will just continue to use the default.
|
||||
if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
|
||||
info!(
|
||||
"[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
|
||||
spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
|
||||
);
|
||||
self.params.installed_extensions_collection_interval.store(
|
||||
DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
|
||||
std::sync::atomic::Ordering::SeqCst,
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
"[NEON_EXT_INT_UPD] Spec Timeout: {}",
|
||||
spec.suspend_timeout_seconds
|
||||
);
|
||||
self.params.installed_extensions_collection_interval.store(
|
||||
spec.suspend_timeout_seconds as u64,
|
||||
std::sync::atomic::Ordering::SeqCst,
|
||||
|
||||
@@ -70,7 +70,7 @@ impl ComputeNode {
|
||||
}
|
||||
};
|
||||
let row = match client
|
||||
.query_one("select * from get_prewarm_info()", &[])
|
||||
.query_one("select * from neon.get_prewarm_info()", &[])
|
||||
.await
|
||||
{
|
||||
Ok(row) => row,
|
||||
@@ -105,7 +105,8 @@ impl ComputeNode {
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
|
||||
return;
|
||||
};
|
||||
error!(%err);
|
||||
crate::metrics::LFC_PREWARM_ERRORS.inc();
|
||||
error!(%err, "prewarming lfc");
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed {
|
||||
error: err.to_string(),
|
||||
};
|
||||
@@ -145,7 +146,7 @@ impl ComputeNode {
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?
|
||||
.query_one("select prewarm_local_cache($1)", &[&uncompressed])
|
||||
.query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
|
||||
.await
|
||||
.context("loading LFC state into postgres")
|
||||
.map(|_| ())
|
||||
@@ -180,7 +181,8 @@ impl ComputeNode {
|
||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
|
||||
return;
|
||||
};
|
||||
error!(%err);
|
||||
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
|
||||
error!(%err, "offloading lfc");
|
||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
|
||||
error: err.to_string(),
|
||||
};
|
||||
@@ -194,7 +196,7 @@ impl ComputeNode {
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?
|
||||
.query_one("select get_local_cache_state()", &[])
|
||||
.query_one("select neon.get_local_cache_state()", &[])
|
||||
.await
|
||||
.context("querying LFC state")?
|
||||
.try_get::<usize, &[u8]>(0)
|
||||
|
||||
132
compute_tools/src/compute_promote.rs
Normal file
132
compute_tools/src/compute_promote.rs
Normal file
@@ -0,0 +1,132 @@
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::{Context, Result, bail};
|
||||
use compute_api::{
|
||||
responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
|
||||
spec::ComputeMode,
|
||||
};
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio::time::sleep;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
impl ComputeNode {
|
||||
/// Returns only when promote fails or succeeds. If a network error occurs
|
||||
/// and http client disconnects, this does not stop promotion, and subsequent
|
||||
/// calls block until promote finishes.
|
||||
/// Called by control plane on secondary after primary endpoint is terminated
|
||||
pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
|
||||
let cloned = self.clone();
|
||||
let start_promotion = || {
|
||||
let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
|
||||
tokio::spawn(async move {
|
||||
tx.send(match cloned.promote_impl(safekeepers_lsn).await {
|
||||
Ok(_) => PromoteState::Completed,
|
||||
Err(err) => {
|
||||
tracing::error!(%err, "promoting");
|
||||
PromoteState::Failed {
|
||||
error: err.to_string(),
|
||||
}
|
||||
}
|
||||
})
|
||||
});
|
||||
rx
|
||||
};
|
||||
|
||||
let mut task;
|
||||
// self.state is unlocked after block ends so we lock it in promote_impl
|
||||
// and task.changed() is reached
|
||||
{
|
||||
task = self
|
||||
.state
|
||||
.lock()
|
||||
.unwrap()
|
||||
.promote_state
|
||||
.get_or_insert_with(start_promotion)
|
||||
.clone()
|
||||
}
|
||||
task.changed().await.expect("promote sender dropped");
|
||||
task.borrow().clone()
|
||||
}
|
||||
|
||||
// Why do we have to supply safekeepers?
|
||||
// For secondary we use primary_connection_conninfo so safekeepers field is empty
|
||||
async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
|
||||
{
|
||||
let state = self.state.lock().unwrap();
|
||||
let mode = &state.pspec.as_ref().unwrap().spec.mode;
|
||||
if *mode != ComputeMode::Replica {
|
||||
bail!("{} is not replica", mode.to_type_str());
|
||||
}
|
||||
|
||||
// we don't need to query Postgres so not self.lfc_prewarm_state()
|
||||
match &state.lfc_prewarm_state {
|
||||
LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
|
||||
bail!("prewarm not requested or pending")
|
||||
}
|
||||
LfcPrewarmState::Failed { error } => {
|
||||
tracing::warn!(%error, "replica prewarm failed")
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?;
|
||||
|
||||
let primary_lsn = safekeepers_lsn.wal_flush_lsn;
|
||||
let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
|
||||
const RETRIES: i32 = 20;
|
||||
for i in 0..=RETRIES {
|
||||
let row = client
|
||||
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
|
||||
.await
|
||||
.context("getting last replay lsn")?;
|
||||
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
|
||||
last_wal_replay_lsn = lsn.into();
|
||||
if last_wal_replay_lsn >= primary_lsn {
|
||||
break;
|
||||
}
|
||||
tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
if last_wal_replay_lsn < primary_lsn {
|
||||
bail!("didn't catch up with primary in {RETRIES} retries");
|
||||
}
|
||||
|
||||
// using $1 doesn't work with ALTER SYSTEM SET
|
||||
let safekeepers_sql = format!(
|
||||
"ALTER SYSTEM SET neon.safekeepers='{}'",
|
||||
safekeepers_lsn.safekeepers
|
||||
);
|
||||
client
|
||||
.query(&safekeepers_sql, &[])
|
||||
.await
|
||||
.context("setting safekeepers")?;
|
||||
client
|
||||
.query("SELECT pg_reload_conf()", &[])
|
||||
.await
|
||||
.context("reloading postgres config")?;
|
||||
let row = client
|
||||
.query_one("SELECT * FROM pg_promote()", &[])
|
||||
.await
|
||||
.context("pg_promote")?;
|
||||
if !row.get::<usize, bool>(0) {
|
||||
bail!("pg_promote() returned false");
|
||||
}
|
||||
|
||||
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?;
|
||||
let row = client
|
||||
.query_one("SHOW transaction_read_only", &[])
|
||||
.await
|
||||
.context("getting transaction_read_only")?;
|
||||
if row.get::<usize, &str>(0) == "on" {
|
||||
bail!("replica in read only mode after promotion");
|
||||
}
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -83,6 +83,87 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/DbsAndRoles"
|
||||
|
||||
/promote:
|
||||
post:
|
||||
tags:
|
||||
- Promotion
|
||||
summary: Promote secondary replica to primary
|
||||
description: ""
|
||||
operationId: promoteReplica
|
||||
requestBody:
|
||||
description: Promote requests data
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SafekeepersLsn"
|
||||
responses:
|
||||
200:
|
||||
description: Promote succeeded or wasn't started
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PromoteState"
|
||||
500:
|
||||
description: Promote failed
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PromoteState"
|
||||
|
||||
/lfc/prewarm:
|
||||
post:
|
||||
summary: Request LFC Prewarm
|
||||
parameters:
|
||||
- name: from_endpoint
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: ""
|
||||
operationId: lfcPrewarm
|
||||
responses:
|
||||
202:
|
||||
description: LFC prewarm started
|
||||
429:
|
||||
description: LFC prewarm ongoing
|
||||
get:
|
||||
tags:
|
||||
- Prewarm
|
||||
summary: Get LFC prewarm state
|
||||
description: ""
|
||||
operationId: getLfcPrewarmState
|
||||
responses:
|
||||
200:
|
||||
description: Prewarm state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LfcPrewarmState"
|
||||
|
||||
/lfc/offload:
|
||||
post:
|
||||
summary: Request LFC offload
|
||||
description: ""
|
||||
operationId: lfcOffload
|
||||
responses:
|
||||
202:
|
||||
description: LFC offload started
|
||||
429:
|
||||
description: LFC offload ongoing
|
||||
get:
|
||||
tags:
|
||||
- Prewarm
|
||||
summary: Get LFC offloading state
|
||||
description: ""
|
||||
operationId: getLfcOffloadState
|
||||
responses:
|
||||
200:
|
||||
description: Offload state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LfcOffloadState"
|
||||
|
||||
/database_schema:
|
||||
get:
|
||||
tags:
|
||||
@@ -290,9 +371,28 @@ paths:
|
||||
summary: Terminate Postgres and wait for it to exit
|
||||
description: ""
|
||||
operationId: terminate
|
||||
parameters:
|
||||
- name: mode
|
||||
in: query
|
||||
description: "Terminate mode: fast (wait 30s before returning) and immediate"
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
enum: ["fast", "immediate"]
|
||||
default: fast
|
||||
responses:
|
||||
200:
|
||||
description: Result
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TerminateResponse"
|
||||
201:
|
||||
description: Result if compute is already terminated
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TerminateResponse"
|
||||
412:
|
||||
description: "wrong state"
|
||||
content:
|
||||
@@ -335,15 +435,6 @@ components:
|
||||
total_startup_ms:
|
||||
type: integer
|
||||
|
||||
Info:
|
||||
type: object
|
||||
description: Information about VM/Pod.
|
||||
required:
|
||||
- num_cpus
|
||||
properties:
|
||||
num_cpus:
|
||||
type: integer
|
||||
|
||||
DbsAndRoles:
|
||||
type: object
|
||||
description: Databases and Roles
|
||||
@@ -458,11 +549,14 @@ components:
|
||||
type: string
|
||||
enum:
|
||||
- empty
|
||||
- init
|
||||
- failed
|
||||
- running
|
||||
- configuration_pending
|
||||
- init
|
||||
- running
|
||||
- configuration
|
||||
- failed
|
||||
- termination_pending_fast
|
||||
- termination_pending_immediate
|
||||
- terminated
|
||||
example: running
|
||||
|
||||
ExtensionInstallRequest:
|
||||
@@ -497,25 +591,69 @@ components:
|
||||
type: string
|
||||
example: "1.0.0"
|
||||
|
||||
InstalledExtensions:
|
||||
SafekeepersLsn:
|
||||
type: object
|
||||
required:
|
||||
- safekeepers
|
||||
- wal_flush_lsn
|
||||
properties:
|
||||
extensions:
|
||||
description: Contains list of installed extensions.
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
extname:
|
||||
type: string
|
||||
version:
|
||||
type: string
|
||||
items:
|
||||
type: string
|
||||
n_databases:
|
||||
type: integer
|
||||
owned_by_superuser:
|
||||
type: integer
|
||||
safekeepers:
|
||||
description: Primary replica safekeepers
|
||||
type: string
|
||||
wal_flush_lsn:
|
||||
description: Primary last WAL flush LSN
|
||||
type: string
|
||||
|
||||
LfcPrewarmState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
- total
|
||||
- prewarmed
|
||||
- skipped
|
||||
properties:
|
||||
status:
|
||||
description: Lfc prewarm status
|
||||
enum: [not_prewarmed, prewarming, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Lfc prewarm error, if any
|
||||
type: string
|
||||
total:
|
||||
description: Total pages processed
|
||||
type: integer
|
||||
prewarmed:
|
||||
description: Total pages prewarmed
|
||||
type: integer
|
||||
skipped:
|
||||
description: Pages processed but not prewarmed
|
||||
type: integer
|
||||
|
||||
LfcOffloadState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
properties:
|
||||
status:
|
||||
description: Lfc offload status
|
||||
enum: [not_offloaded, offloading, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Lfc offload error, if any
|
||||
type: string
|
||||
|
||||
PromoteState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
properties:
|
||||
status:
|
||||
description: Promote result
|
||||
enum: [not_promoted, completed, failed]
|
||||
type: string
|
||||
error:
|
||||
description: Promote error, if any
|
||||
type: string
|
||||
|
||||
SetRoleGrantsRequest:
|
||||
type: object
|
||||
@@ -544,6 +682,17 @@ components:
|
||||
description: Role name.
|
||||
example: "neon"
|
||||
|
||||
TerminateResponse:
|
||||
type: object
|
||||
required:
|
||||
- lsn
|
||||
properties:
|
||||
lsn:
|
||||
type: string
|
||||
nullable: true
|
||||
description: "last WAL flush LSN"
|
||||
example: "0/028F10D8"
|
||||
|
||||
SetRoleGrantsResponse:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
|
||||
pub(in crate::http) mod lfc;
|
||||
pub(in crate::http) mod metrics;
|
||||
pub(in crate::http) mod metrics_json;
|
||||
pub(in crate::http) mod promote;
|
||||
pub(in crate::http) mod status;
|
||||
pub(in crate::http) mod terminate;
|
||||
|
||||
|
||||
14
compute_tools/src/http/routes/promote.rs
Normal file
14
compute_tools/src/http/routes/promote.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
use crate::http::JsonResponse;
|
||||
use axum::Form;
|
||||
use http::StatusCode;
|
||||
|
||||
pub(in crate::http) async fn promote(
|
||||
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
|
||||
Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
|
||||
) -> axum::response::Response {
|
||||
let state = compute.promote(safekeepers_lsn).await;
|
||||
if let compute_api::responses::PromoteState::Failed { error } = state {
|
||||
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
|
||||
}
|
||||
JsonResponse::success(StatusCode::OK, state)
|
||||
}
|
||||
@@ -3,7 +3,7 @@ use crate::http::JsonResponse;
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use axum_extra::extract::OptionalQuery;
|
||||
use compute_api::responses::{ComputeStatus, TerminateResponse};
|
||||
use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
|
||||
use http::StatusCode;
|
||||
use serde::Deserialize;
|
||||
use std::sync::Arc;
|
||||
@@ -12,7 +12,7 @@ use tracing::info;
|
||||
|
||||
#[derive(Deserialize, Default)]
|
||||
pub struct TerminateQuery {
|
||||
mode: compute_api::responses::TerminateMode,
|
||||
mode: TerminateMode,
|
||||
}
|
||||
|
||||
/// Terminate the compute.
|
||||
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate(
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::Terminated {
|
||||
return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
|
||||
let response = TerminateResponse {
|
||||
lsn: state.terminate_flush_lsn,
|
||||
};
|
||||
return JsonResponse::success(StatusCode::CREATED, response);
|
||||
}
|
||||
|
||||
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
state.set_status(
|
||||
ComputeStatus::TerminationPending { mode },
|
||||
&compute.state_changed,
|
||||
);
|
||||
state.set_status(mode.into(), &compute.state_changed);
|
||||
}
|
||||
|
||||
forward_termination_signal(false);
|
||||
|
||||
@@ -23,7 +23,7 @@ use super::{
|
||||
middleware::authorize::Authorize,
|
||||
routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, lfc, metrics, metrics_json, status, terminate,
|
||||
grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
|
||||
},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
|
||||
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
|
||||
.route("/promote", post(promote::promote))
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
|
||||
@@ -12,6 +12,7 @@ pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod compute_prewarm;
|
||||
pub mod compute_promote;
|
||||
pub mod disk_quota;
|
||||
pub mod extension_server;
|
||||
pub mod installed_extensions;
|
||||
|
||||
@@ -105,6 +105,14 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_prewarm_errors_total",
|
||||
"Total number of LFC prewarm errors",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_offloads_total",
|
||||
@@ -113,6 +121,14 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_offload_errors_total",
|
||||
"Total number of LFC offload errors",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub fn collect() -> Vec<MetricFamily> {
|
||||
let mut metrics = COMPUTE_CTL_UP.collect();
|
||||
metrics.extend(INSTALLED_EXTENSIONS.collect());
|
||||
@@ -123,6 +139,8 @@ pub fn collect() -> Vec<MetricFamily> {
|
||||
metrics.extend(PG_CURR_DOWNTIME_MS.collect());
|
||||
metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
|
||||
metrics.extend(LFC_PREWARMS.collect());
|
||||
metrics.extend(LFC_PREWARM_ERRORS.collect());
|
||||
metrics.extend(LFC_OFFLOADS.collect());
|
||||
metrics.extend(LFC_OFFLOAD_ERRORS.collect());
|
||||
metrics
|
||||
}
|
||||
|
||||
@@ -1,3 +1,16 @@
|
||||
-- On December 8th, 2023, an engineering escalation (INC-110) was opened after
|
||||
-- it was found that BYPASSRLS was being applied to all roles.
|
||||
--
|
||||
-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657
|
||||
-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072
|
||||
--
|
||||
-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it
|
||||
-- isn't easy to know if a Postgres cluster is affected by the issue, we need to
|
||||
-- keep the migration around for a long time, if not indefinitely, so any
|
||||
-- cluster can be fixed.
|
||||
--
|
||||
-- Branching is the gift that keeps on giving...
|
||||
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name text;
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
|
||||
@@ -7,13 +7,17 @@ BEGIN
|
||||
INTO monitor
|
||||
FROM pg_auth_members
|
||||
WHERE roleid = 'pg_monitor'::regrole
|
||||
AND member = 'pg_monitor'::regrole;
|
||||
AND member = 'neon_superuser'::regrole;
|
||||
|
||||
IF NOT monitor.member THEN
|
||||
IF monitor IS NULL THEN
|
||||
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
|
||||
END IF;
|
||||
|
||||
IF monitor.admin IS NULL OR NOT monitor.member THEN
|
||||
RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
|
||||
END IF;
|
||||
|
||||
IF NOT monitor.admin THEN
|
||||
IF monitor.admin IS NULL OR NOT monitor.admin THEN
|
||||
RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
signal_backend record;
|
||||
BEGIN
|
||||
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
|
||||
admin_option AS admin
|
||||
INTO signal_backend
|
||||
FROM pg_auth_members
|
||||
WHERE roleid = 'pg_signal_backend'::regrole
|
||||
AND member = 'neon_superuser'::regrole;
|
||||
|
||||
IF signal_backend IS NULL THEN
|
||||
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
|
||||
END IF;
|
||||
|
||||
IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
|
||||
RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
|
||||
END IF;
|
||||
|
||||
IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
|
||||
RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -84,7 +84,8 @@ impl ComputeMonitor {
|
||||
if matches!(
|
||||
compute_status,
|
||||
ComputeStatus::Terminated
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Failed
|
||||
) {
|
||||
info!(
|
||||
|
||||
@@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
include_str!(
|
||||
"./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
|
||||
),
|
||||
include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
|
||||
];
|
||||
|
||||
MigrationRunner::new(client, &migrations)
|
||||
|
||||
@@ -914,7 +914,8 @@ impl Endpoint {
|
||||
ComputeStatus::Empty
|
||||
| ComputeStatus::ConfigurationPending
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::TerminationPending { .. }
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Terminated => {
|
||||
bail!("unexpected compute status: {:?}", state.status)
|
||||
}
|
||||
|
||||
@@ -452,6 +452,12 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||
// HADRON
|
||||
image_layer_force_creation_period: settings
|
||||
.remove("image_layer_force_creation_period")
|
||||
.map(humantime::parse_duration)
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_layer_force_creation_period' as duration")?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
|
||||
@@ -54,14 +54,16 @@ else
|
||||
printf '%s\n' "${result}" | jq .
|
||||
fi
|
||||
|
||||
echo "Check if a timeline present"
|
||||
PARAMS=(
|
||||
-X GET
|
||||
-H "Content-Type: application/json"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
|
||||
)
|
||||
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
|
||||
if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
|
||||
if [[ "${RUN_PARALLEL:-false}" != "true" ]]; then
|
||||
echo "Check if a timeline present"
|
||||
PARAMS=(
|
||||
-X GET
|
||||
-H "Content-Type: application/json"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
|
||||
)
|
||||
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
|
||||
fi
|
||||
if [[ -z "${timeline_id:-}" || "${timeline_id:-}" = null ]]; then
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
|
||||
@@ -142,7 +142,7 @@ services:
|
||||
- "storage_broker"
|
||||
- "--listen-addr=0.0.0.0:50051"
|
||||
|
||||
compute:
|
||||
compute1:
|
||||
restart: always
|
||||
build:
|
||||
context: ./compute_wrapper/
|
||||
@@ -152,6 +152,7 @@ services:
|
||||
- TAG=${COMPUTE_TAG:-${TAG:-latest}}
|
||||
- http_proxy=${http_proxy:-}
|
||||
- https_proxy=${https_proxy:-}
|
||||
image: built-compute
|
||||
environment:
|
||||
- PG_VERSION=${PG_VERSION:-16}
|
||||
- TENANT_ID=${TENANT_ID:-}
|
||||
@@ -166,6 +167,11 @@ services:
|
||||
- 3080:3080 # http endpoints
|
||||
entrypoint:
|
||||
- "/shell/compute.sh"
|
||||
# Ad an alias for compute1 for compatibility
|
||||
networks:
|
||||
default:
|
||||
aliases:
|
||||
- compute
|
||||
depends_on:
|
||||
- safekeeper1
|
||||
- safekeeper2
|
||||
@@ -174,15 +180,20 @@ services:
|
||||
|
||||
compute_is_ready:
|
||||
image: postgres:latest
|
||||
environment:
|
||||
- PARALLEL_COMPUTES=1
|
||||
entrypoint:
|
||||
- "/bin/bash"
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "until pg_isready -h compute -p 55433 -U cloud_admin ; do
|
||||
echo 'Waiting to start compute...' && sleep 1;
|
||||
done"
|
||||
- "for i in $(seq 1 $${PARALLEL_COMPUTES}); do
|
||||
until pg_isready -h compute$$i -p 55433 -U cloud_admin ; do
|
||||
sleep 1;
|
||||
done;
|
||||
done;
|
||||
echo All computes are started"
|
||||
depends_on:
|
||||
- compute
|
||||
- compute1
|
||||
|
||||
neon-test-extensions:
|
||||
profiles: ["test-extensions"]
|
||||
@@ -196,4 +207,4 @@ services:
|
||||
command:
|
||||
- sleep 3600
|
||||
depends_on:
|
||||
- compute
|
||||
- compute1
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# A basic test to ensure Docker images are built correctly.
|
||||
# Build a wrapper around the compute, start all services and runs a simple SQL query.
|
||||
@@ -13,9 +13,36 @@
|
||||
#
|
||||
set -eux -o pipefail
|
||||
|
||||
cd "$(dirname "${0}")"
|
||||
export COMPOSE_FILE='docker-compose.yml'
|
||||
export COMPOSE_PROFILES=test-extensions
|
||||
cd "$(dirname "${0}")"
|
||||
export PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
|
||||
READY_MESSAGE="All computes are started"
|
||||
COMPUTES=()
|
||||
for i in $(seq 1 "${PARALLEL_COMPUTES}"); do
|
||||
COMPUTES+=("compute${i}")
|
||||
done
|
||||
CURRENT_TMPDIR=$(mktemp -d)
|
||||
trap 'rm -rf ${CURRENT_TMPDIR} docker-compose-parallel.yml' EXIT
|
||||
if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
|
||||
export COMPOSE_FILE=docker-compose-parallel.yml
|
||||
cp docker-compose.yml docker-compose-parallel.yml
|
||||
# Replace the environment variable PARALLEL_COMPUTES with the actual value
|
||||
yq eval -i ".services.compute_is_ready.environment |= map(select(. | test(\"^PARALLEL_COMPUTES=\") | not)) + [\"PARALLEL_COMPUTES=${PARALLEL_COMPUTES}\"]" ${COMPOSE_FILE}
|
||||
for i in $(seq 2 "${PARALLEL_COMPUTES}"); do
|
||||
# Duplicate compute1 as compute${i} for parallel execution
|
||||
yq eval -i ".services.compute${i} = .services.compute1" ${COMPOSE_FILE}
|
||||
# We don't need these sections, so delete them
|
||||
yq eval -i "(del .services.compute${i}.build) | (del .services.compute${i}.ports) | (del .services.compute${i}.networks)" ${COMPOSE_FILE}
|
||||
# Let the compute 1 be the only dependence
|
||||
yq eval -i ".services.compute${i}.depends_on = [\"compute1\"]" ${COMPOSE_FILE}
|
||||
# Set RUN_PARALLEL=true for compute2. They will generate tenant_id and timeline_id to avoid using the same as other computes
|
||||
yq eval -i ".services.compute${i}.environment += [\"RUN_PARALLEL=true\"]" ${COMPOSE_FILE}
|
||||
# Remove TENANT_ID and TIMELINE_ID from the environment variables of the generated computes
|
||||
# They will create new TENANT_ID and TIMELINE_ID anyway.
|
||||
yq eval -i ".services.compute${i}.environment |= map(select(. | (test(\"^TENANT_ID=\") or test(\"^TIMELINE_ID=\")) | not))" ${COMPOSE_FILE}
|
||||
done
|
||||
fi
|
||||
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
|
||||
|
||||
function cleanup() {
|
||||
@@ -27,11 +54,11 @@ function cleanup() {
|
||||
|
||||
for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
pg_version=${pg_version/v/}
|
||||
echo "clean up containers if exists"
|
||||
echo "clean up containers if exist"
|
||||
cleanup
|
||||
PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
|
||||
PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull --build -d
|
||||
|
||||
PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose build compute1
|
||||
PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull -d
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
cnt=0
|
||||
while sleep 3; do
|
||||
@@ -41,45 +68,50 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
echo "timeout before the compute is ready."
|
||||
exit 1
|
||||
fi
|
||||
if docker compose logs "compute_is_ready" | grep -q "accepting connections"; then
|
||||
if docker compose logs compute_is_ready | grep -q "${READY_MESSAGE}"; then
|
||||
echo "OK. The compute is ready to connect."
|
||||
echo "execute simple queries."
|
||||
docker compose exec compute /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
|
||||
for compute in "${COMPUTES[@]}"; do
|
||||
docker compose exec "${compute}" /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
|
||||
done
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ ${pg_version} -ge 16 ]]; then
|
||||
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config
|
||||
docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# Prepare for the PostGIS test
|
||||
docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}"
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}"
|
||||
docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
|
||||
docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test
|
||||
docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress
|
||||
rm -rf "${TMPDIR}"
|
||||
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
|
||||
docker compose cp "${TMPDIR}/data" compute:/ext-src/pg_hint_plan-src/
|
||||
rm -rf "${TMPDIR}"
|
||||
# The following block does the same for the contrib/file_fdw test
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${TMPDIR}/data"
|
||||
docker compose cp "${TMPDIR}/data" compute:/postgres/contrib/file_fdw/data
|
||||
rm -rf "${TMPDIR}"
|
||||
mkdir "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src}
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${CURRENT_TMPDIR}/postgis-src/test"
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${CURRENT_TMPDIR}/postgis-src/00-regress-install"
|
||||
docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${CURRENT_TMPDIR}/pg_hint_plan-src/data"
|
||||
docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${CURRENT_TMPDIR}/file_fdw/data"
|
||||
|
||||
for compute in "${COMPUTES[@]}"; do
|
||||
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config on "${compute}"
|
||||
docker compose exec "${compute}" touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# Prepare for the PostGIS test
|
||||
docker compose exec "${compute}" mkdir -p /tmp/pgis_reg/pgis_reg_tmp /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
|
||||
docker compose cp "${CURRENT_TMPDIR}/postgis-src/test" "${compute}":/ext-src/postgis-src/raster/test
|
||||
docker compose cp "${CURRENT_TMPDIR}/postgis-src/00-regress-install" "${compute}":/ext-src/postgis-src/regress
|
||||
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
|
||||
docker compose cp "${CURRENT_TMPDIR}/pg_hint_plan-src/data" "${compute}":/ext-src/pg_hint_plan-src/
|
||||
# The following block does the same for the contrib/file_fdw test
|
||||
docker compose cp "${CURRENT_TMPDIR}/file_fdw/data" "${compute}":/postgres/contrib/file_fdw/data
|
||||
done
|
||||
# Apply patches
|
||||
docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
|
||||
# We are running tests now
|
||||
rm -f testout.txt testout_contrib.txt
|
||||
# We want to run the longest tests first to better utilize parallelization and reduce overall test time.
|
||||
# Tests listed in the RUN_FIRST variable will be run before others.
|
||||
# If parallelization is not used, this environment variable will be ignored.
|
||||
|
||||
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
|
||||
-e RUN_FIRST=hll-src,postgis-src,pgtap-src -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
|
||||
neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
|
||||
docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
|
||||
-e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
|
||||
neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
|
||||
if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then
|
||||
CONTRIB_FAILED=
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
set -x
|
||||
|
||||
if [[ -v BENCHMARK_CONNSTR ]]; then
|
||||
@@ -26,8 +26,9 @@ if [[ -v BENCHMARK_CONNSTR ]]; then
|
||||
fi
|
||||
fi
|
||||
REGULAR_USER=false
|
||||
while getopts r arg; do
|
||||
case $arg in
|
||||
PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
|
||||
while getopts pr arg; do
|
||||
case ${arg} in
|
||||
r)
|
||||
REGULAR_USER=true
|
||||
shift $((OPTIND-1))
|
||||
@@ -41,26 +42,49 @@ extdir=${1}
|
||||
|
||||
cd "${extdir}" || exit 2
|
||||
FAILED=
|
||||
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
|
||||
for d in ${LIST}; do
|
||||
[ -d "${d}" ] || continue
|
||||
if ! psql -w -c "select 1" >/dev/null; then
|
||||
FAILED="${d} ${FAILED}"
|
||||
break
|
||||
fi
|
||||
if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
|
||||
"${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
|
||||
continue
|
||||
fi
|
||||
export FAILED_FILE=/tmp/failed
|
||||
rm -f ${FAILED_FILE}
|
||||
mapfile -t LIST < <( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
|
||||
if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
|
||||
# Avoid errors if RUN_FIRST is not defined
|
||||
RUN_FIRST=${RUN_FIRST:-}
|
||||
# Move entries listed in the RUN_FIRST variable to the beginning
|
||||
ORDERED_LIST=$(printf "%s\n" "${LIST[@]}" | grep -x -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"); printf "%s\n" "${LIST[@]}" | grep -vx -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"))
|
||||
parallel -j"${PARALLEL_COMPUTES}" "[[ -d {} ]] || exit 0
|
||||
export PGHOST=compute{%}
|
||||
if ! psql -c 'select 1'>/dev/null; then
|
||||
exit 1
|
||||
fi
|
||||
echo Running on \${PGHOST}
|
||||
if [[ -f ${extdir}/{}/neon-test.sh ]]; then
|
||||
echo Running from script
|
||||
${extdir}/{}/neon-test.sh || echo {} >> ${FAILED_FILE};
|
||||
else
|
||||
echo Running using make;
|
||||
USE_PGXS=1 make -C {} installcheck || echo {} >> ${FAILED_FILE};
|
||||
fi" ::: ${ORDERED_LIST}
|
||||
[[ ! -f ${FAILED_FILE} ]] && exit 0
|
||||
else
|
||||
for d in "${LIST[@]}"; do
|
||||
[ -d "${d}" ] || continue
|
||||
if ! psql -w -c "select 1" >/dev/null; then
|
||||
FAILED="${d} ${FAILED}"
|
||||
break
|
||||
fi
|
||||
if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
|
||||
"${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ -f "${d}/neon-test.sh" ]; then
|
||||
"${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
|
||||
else
|
||||
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
|
||||
fi
|
||||
done
|
||||
[ -z "${FAILED}" ] && exit 0
|
||||
for d in ${FAILED}; do
|
||||
if [ -f "${d}/neon-test.sh" ]; then
|
||||
"${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
|
||||
else
|
||||
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
|
||||
fi
|
||||
done
|
||||
[[ -z ${FAILED} ]] && exit 0
|
||||
fi
|
||||
for d in ${FAILED} $([[ ! -f ${FAILED_FILE} ]] || cat ${FAILED_FILE}); do
|
||||
cat "$(find $d -name regression.diffs)"
|
||||
done
|
||||
for postgis_diff in /tmp/pgis_reg/*_diff; do
|
||||
@@ -68,4 +92,5 @@ for postgis_diff in /tmp/pgis_reg/*_diff; do
|
||||
cat "${postgis_diff}"
|
||||
done
|
||||
echo "${FAILED}"
|
||||
cat ${FAILED_FILE}
|
||||
exit 1
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
set -eux -o pipefail
|
||||
cd "$(dirname "${0}")"
|
||||
# Takes a variable name as argument. The result is stored in that variable.
|
||||
@@ -60,8 +60,8 @@ function check_timeline() {
|
||||
# Restarts the compute node with the required compute tag and timeline.
|
||||
# Accepts the tag for the compute node and the timeline as parameters.
|
||||
function restart_compute() {
|
||||
docker compose down compute compute_is_ready
|
||||
COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready
|
||||
docker compose down compute1 compute_is_ready
|
||||
COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute1 compute_is_ready
|
||||
wait_for_ready
|
||||
check_timeline ${2}
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ In our case consensus leader is compute (walproposer), and we don't want to wake
|
||||
up all computes for the change. Neither we want to fully reimplement the leader
|
||||
logic second time outside compute. Because of that the proposed algorithm relies
|
||||
for issuing configurations on the external fault tolerant (distributed) strongly
|
||||
consisent storage with simple API: CAS (compare-and-swap) on the single key.
|
||||
consistent storage with simple API: CAS (compare-and-swap) on the single key.
|
||||
Properly configured postgres suits this.
|
||||
|
||||
In the system consensus is implemented at the timeline level, so algorithm below
|
||||
@@ -34,7 +34,7 @@ A configuration is
|
||||
|
||||
```
|
||||
struct Configuration {
|
||||
generation: Generation, // a number uniquely identifying configuration
|
||||
generation: SafekeeperGeneration, // a number uniquely identifying configuration
|
||||
sk_set: Vec<NodeId>, // current safekeeper set
|
||||
new_sk_set: Optional<Vec<NodeId>>,
|
||||
}
|
||||
@@ -81,11 +81,11 @@ configuration generation in them is less than its current one. Namely, it
|
||||
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
|
||||
response it sends its current configuration generation to let walproposer know.
|
||||
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
||||
accepting `Configuration`. Safekeeper switches to the given conf it is higher than its
|
||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership`
|
||||
accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
|
||||
current one and ignores it otherwise. In any case it replies with
|
||||
```
|
||||
struct ConfigurationSwitchResponse {
|
||||
struct TimelineMembershipSwitchResponse {
|
||||
conf: Configuration,
|
||||
term: Term,
|
||||
last_log_term: Term,
|
||||
@@ -108,7 +108,7 @@ establishes this configuration as its own and moves to voting.
|
||||
It should stop talking to safekeepers not listed in the configuration at this
|
||||
point, though it is not unsafe to continue doing so.
|
||||
|
||||
To be elected it must receive votes from both majorites if `new_sk_set` is present.
|
||||
To be elected it must receive votes from both majorities if `new_sk_set` is present.
|
||||
Similarly, to commit WAL it must receive flush acknowledge from both majorities.
|
||||
|
||||
If walproposer hears from safekeeper configuration higher than his own (i.e.
|
||||
@@ -130,7 +130,7 @@ storage are reachable.
|
||||
1) Fetch current timeline configuration from the configuration storage.
|
||||
2) If it is already joint one and `new_set` is different from `desired_set`
|
||||
refuse to change. However, assign join conf to (in memory) var
|
||||
`join_conf` and proceed to step 4 to finish the ongoing change.
|
||||
`joint_conf` and proceed to step 4 to finish the ongoing change.
|
||||
3) Else, create joint `joint_conf: Configuration`: increment current conf number
|
||||
`n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
|
||||
storage by doing CAS on the current generation: change happens only if
|
||||
@@ -161,11 +161,11 @@ storage are reachable.
|
||||
because `pull_timeline` already includes it and plus additionally would be
|
||||
broadcast by compute. More importantly, we may proceed to the next step
|
||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
||||
`sync_position`. Similarly, on the happy path no waiting is not needed because
|
||||
`sync_position`. Similarly, on the happy path no waiting is needed because
|
||||
`pull_timeline` already includes it. However, we should double
|
||||
check to be safe. For example, timeline could have been created earlier e.g.
|
||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
||||
7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new
|
||||
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
|
||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
||||
storage under one more CAS.
|
||||
8) Call `PUT` `configuration` on safekeepers from the new set,
|
||||
@@ -178,12 +178,12 @@ spec of it.
|
||||
|
||||
Description above focuses on safety. To make the flow practical and live, here a few more
|
||||
considerations.
|
||||
1) It makes sense to ping new set to ensure it we are migrating to live node(s) before
|
||||
1) It makes sense to ping new set to ensure we are migrating to live node(s) before
|
||||
step 3.
|
||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
||||
it is safe to rollback to the old conf with one more CAS.
|
||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
||||
the simplest is the procedure restart. There are more complicated scenarious like mentioned
|
||||
the simplest is the procedure restart. There are more complicated scenarios like mentioned
|
||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
||||
has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
|
||||
@@ -192,7 +192,7 @@ considerations.
|
||||
4) In the end timeline should be locally deleted on the safekeeper(s) which are
|
||||
in the old set but not in the new one, unless they are unreachable. To be
|
||||
safe this also should be done under generation number (deletion proceeds only if
|
||||
current configuration is <= than one in request and safekeeper is not memeber of it).
|
||||
current configuration is <= than one in request and safekeeper is not member of it).
|
||||
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
|
||||
jump to step 7, using it as `new_conf`.
|
||||
|
||||
@@ -261,14 +261,14 @@ Timeline (branch) creation in cplane should call storage_controller POST
|
||||
Response should be augmented with `safekeepers_generation` and `safekeepers`
|
||||
fields like described in `/notify-safekeepers` above. Initially (currently)
|
||||
these fields may be absent; in this case cplane chooses safekeepers on its own
|
||||
like it currently does. The call should be retried until succeeds.
|
||||
like it currently does. The call should be retried until it succeeds.
|
||||
|
||||
Timeline deletion and tenant deletion in cplane should call appropriate
|
||||
storage_controller endpoints like it currently does for sharded tenants. The
|
||||
calls should be retried until they succeed.
|
||||
|
||||
When compute receives safekeepers list from control plane it needs to know the
|
||||
generation to checked whether it should be updated (note that compute may get
|
||||
When compute receives safekeeper list from control plane it needs to know the
|
||||
generation to check whether it should be updated (note that compute may get
|
||||
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
|
||||
GUC is just a comma separates list of `host:port`. Let's prefix it with
|
||||
`g#<generation>:` to this end, so it will look like
|
||||
@@ -305,8 +305,8 @@ enum MigrationRequest {
|
||||
```
|
||||
|
||||
`FinishPending` requests to run the procedure to ensure state is clean: current
|
||||
configuration is not joint and majority of safekeepers are aware of it, but do
|
||||
not attempt to migrate anywhere. If current configuration fetched on step 1 is
|
||||
configuration is not joint and the majority of safekeepers are aware of it, but do
|
||||
not attempt to migrate anywhere. If the current configuration fetched on step 1 is
|
||||
not joint it jumps to step 7. It should be run at startup for all timelines (but
|
||||
similarly, in the first version it is ok to trigger it manually).
|
||||
|
||||
@@ -315,7 +315,7 @@ similarly, in the first version it is ok to trigger it manually).
|
||||
`safekeepers` table mirroring current `nodes` should be added, except that for
|
||||
`scheduling_policy`: it is enough to have at least in the beginning only 3
|
||||
fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
|
||||
3) `decomissioned` (node is removed).
|
||||
3) `decommissioned` (node is removed).
|
||||
|
||||
`timelines` table:
|
||||
```
|
||||
@@ -326,9 +326,10 @@ table! {
|
||||
tenant_id -> Varchar,
|
||||
start_lsn -> pg_lsn,
|
||||
generation -> Int4,
|
||||
sk_set -> Array<Int4>, // list of safekeeper ids
|
||||
sk_set -> Array<Int8>, // list of safekeeper ids
|
||||
new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
|
||||
cplane_notified_generation -> Int4,
|
||||
sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about
|
||||
deleted_at -> Nullable<Timestamptz>,
|
||||
}
|
||||
}
|
||||
@@ -338,13 +339,23 @@ table! {
|
||||
might also want to add ancestor_timeline_id to preserve the hierarchy, but for
|
||||
this RFC it is not needed.
|
||||
|
||||
`cplane_notified_generation` and `sk_set_notified_generation` fields are used to
|
||||
track the last stage of the algorithm, when we need to notify safekeeper set and cplane
|
||||
with the final configuration after it's already committed to DB.
|
||||
|
||||
The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and
|
||||
`*_notified_generation` fields are up to date with `generation`.
|
||||
|
||||
It's possible to replace `*_notified_generation` with one boolean field `migration_completed`,
|
||||
but for better observability it's nice to have them separately.
|
||||
|
||||
#### API
|
||||
|
||||
Node management is similar to pageserver:
|
||||
1) POST `/control/v1/safekeepers` inserts safekeeper.
|
||||
2) GET `/control/v1/safekeepers` lists safekeepers.
|
||||
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
|
||||
4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
|
||||
1) POST `/control/v1/safekeeper` inserts safekeeper.
|
||||
2) GET `/control/v1/safekeeper` lists safekeepers.
|
||||
3) GET `/control/v1/safekeeper/:node_id` gets safekeeper.
|
||||
4) PUT `/control/v1/safekeper/:node_id/scheduling_policy` changes status to e.g.
|
||||
`offline` or `decomissioned`. Initially it is simpler not to schedule any
|
||||
migrations here.
|
||||
|
||||
@@ -368,8 +379,8 @@ Migration API: the first version is the simplest and the most imperative:
|
||||
all timelines from one safekeeper to another. It accepts json
|
||||
```
|
||||
{
|
||||
"src_sk": u32,
|
||||
"dst_sk": u32,
|
||||
"src_sk": NodeId,
|
||||
"dst_sk": NodeId,
|
||||
"limit": Optional<u32>,
|
||||
}
|
||||
```
|
||||
@@ -379,12 +390,15 @@ Returns list of scheduled requests.
|
||||
2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
|
||||
to move single timeline to given set of safekeepers:
|
||||
```
|
||||
{
|
||||
"desired_set": Vec<u32>,
|
||||
struct TimelineSafekeeperMigrateRequest {
|
||||
"new_sk_set": Vec<NodeId>,
|
||||
}
|
||||
```
|
||||
|
||||
Returns scheduled request.
|
||||
In the first version the handler migrates the timeline to `new_sk_set` synchronously.
|
||||
Should be retried until success.
|
||||
|
||||
In the future we might change it to asynchronous API and return scheduled request.
|
||||
|
||||
Similar call should be added for the tenant.
|
||||
|
||||
@@ -434,6 +448,9 @@ table! {
|
||||
}
|
||||
```
|
||||
|
||||
We load all pending ops from the table on startup into the memory.
|
||||
The table is needed only to preserve the state between restarts.
|
||||
|
||||
`op_type` can be `include` (seed from peers and ensure generation is up to
|
||||
date), `exclude` (remove locally) and `delete`. Field is actually not strictly
|
||||
needed as it can be computed from current configuration, but gives more explicit
|
||||
@@ -474,7 +491,7 @@ actions must be idempotent. Now, a tricky point here is timeline start LSN. For
|
||||
the initial (tenant creation) call cplane doesn't know it. However, setting
|
||||
start_lsn on safekeepers during creation is a good thing -- it provides a
|
||||
guarantee that walproposer can always find a common point in WAL histories of
|
||||
safekeeper and its own, and so absense of it would be a clear sign of
|
||||
safekeeper and its own, and so absence of it would be a clear sign of
|
||||
corruption. The following sequence works:
|
||||
1) Create timeline (or observe that it exists) on pageserver,
|
||||
figuring out last_record_lsn in response.
|
||||
@@ -497,11 +514,9 @@ corruption. The following sequence works:
|
||||
retries the call until 200 response.
|
||||
|
||||
There is a small question how request handler (timeline creation in this
|
||||
case) would interact with per sk reconciler. As always I prefer to do the
|
||||
simplest possible thing and here it seems to be just waking it up so it
|
||||
re-reads the db for work to do. Passing work in memory is faster, but
|
||||
that shouldn't matter, and path to scan db for work will exist anyway,
|
||||
simpler to reuse it.
|
||||
case) would interact with per sk reconciler. In the current implementation
|
||||
we first persist the request in the DB, and then send an in-memory request
|
||||
to each safekeeper reconciler to process it.
|
||||
|
||||
For pg version / wal segment size: while we may persist them in `timelines`
|
||||
table, it is not necessary as initial creation at step 3 can take them from
|
||||
@@ -509,30 +524,40 @@ pageserver or cplane creation call and later pull_timeline will carry them
|
||||
around.
|
||||
|
||||
Timeline migration.
|
||||
1) CAS to the db to create joint conf, and in the same transaction create
|
||||
`safekeeper_timeline_pending_ops` `include` entries to initialize new members
|
||||
as well as deliver this conf to current ones; poke per sk reconcilers to work
|
||||
on it. Also any conf change should also poke cplane notifier task(s).
|
||||
2) Once it becomes possible per alg description above, get out of joint conf
|
||||
with another CAS. Task should get wakeups from per sk reconcilers because
|
||||
conf switch is required for advancement; however retries should be sleep
|
||||
based as well as LSN advancement might be needed, though in happy path
|
||||
it isn't. To see whether further transition is possible on wakup migration
|
||||
executor polls safekeepers per the algorithm. CAS creating new conf with only
|
||||
new members should again insert entries to `safekeeper_timeline_pending_ops`
|
||||
to switch them there, as well as `exclude` rows to remove timeline from
|
||||
old members.
|
||||
1) CAS to the db to create joint conf. Since this moment the migration is considered to be
|
||||
"in progress". We can detect all "in-progress" migrations looking into the database.
|
||||
2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership
|
||||
configuration on all safekeepers, notify cplane, etc. All operations are idempotent,
|
||||
so we don't need to persist anything in the database at this stage. If any errors occur,
|
||||
it's safe to retry or abort the migration.
|
||||
3) Once it becomes possible per alg description above, get out of joint conf
|
||||
with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops`
|
||||
in the same DB transaction. Adding `exclude` entries atomically is nesessary because after
|
||||
CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we
|
||||
need to have them persisted somewhere in case the migration is interrupted right after the CAS.
|
||||
4) Finish the migration. The final membership configuration is committed to the DB at this stage.
|
||||
So, the migration can not be aborted anymore. But it can still be retried if the migration fails
|
||||
past stage 3. To finish the migration we need to send the new membership configuration to
|
||||
a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude`
|
||||
requests to in-memory queue for safekeeper reconciler. If the algrorithm is retried, it's
|
||||
possible that we have already committed `exclude` requests to DB, but didn't send them to
|
||||
the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops`
|
||||
because it's the only place where they are persistent. The fields `sk_set_notified_generation`
|
||||
and `cplane_notified_generation` are updated after each step. The migration is considered
|
||||
fully completed when they match the `generation` field.
|
||||
|
||||
In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline
|
||||
reconciler (if we implement it). But it's wise to at least try to finish them synchronously,
|
||||
so the timeline is always in a "good state" and doesn't require an old quorum to commit
|
||||
WAL after the migration reported "success".
|
||||
|
||||
Timeline deletion: just set `deleted_at` on the timeline row and insert
|
||||
`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
|
||||
per sk reconcilers.
|
||||
|
||||
When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
|
||||
When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops`
|
||||
for it must be cleared in the same transaction.
|
||||
|
||||
One more task pool should infinitely retry notifying control plane about changed
|
||||
safekeeper sets (trying making `cplane_notified_generation` equal `generation`).
|
||||
|
||||
#### Dealing with multiple instances of storage_controller
|
||||
|
||||
Operations described above executed concurrently might create some errors but do
|
||||
@@ -541,7 +566,7 @@ of storage_controller it is fine to have it temporarily, e.g. during redeploy.
|
||||
|
||||
To harden against some controller instance creating some work in
|
||||
`safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
|
||||
the job per sk reconcilers apart from explicit wakups should scan for work
|
||||
the job per sk reconcilers apart from explicit wakeups should scan for work
|
||||
periodically. It is possible to remove that though if all db updates are
|
||||
protected with leadership token/term -- then such scans are needed only after
|
||||
leadership is acquired.
|
||||
@@ -563,7 +588,7 @@ There should be following layers of tests:
|
||||
safekeeper communication and pull_timeline need to be mocked and main switch
|
||||
procedure wrapped to as a node (thread) in simulation tests, using these
|
||||
mocks. Test would inject migrations like it currently injects
|
||||
safekeeper/walproposer restars. Main assert is the same -- committed WAL must
|
||||
safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
|
||||
not be lost.
|
||||
|
||||
3) Since simulation testing injects at relatively high level points (not
|
||||
@@ -613,7 +638,7 @@ Let's have the following implementation bits for gradual rollout:
|
||||
`notify-safekeepers`.
|
||||
|
||||
Then the rollout for a region would be:
|
||||
- Current situation: safekeepers are choosen by control_plane.
|
||||
- Current situation: safekeepers are chosen by control_plane.
|
||||
- We manually migrate some timelines, test moving them around.
|
||||
- Then we enable `--set-safekeepers` so that all new timelines
|
||||
are on storage controller.
|
||||
|
||||
@@ -13,6 +13,8 @@ use utils::backoff::retry;
|
||||
pub fn app(state: Arc<Storage>) -> Router<()> {
|
||||
use axum::routing::{delete as _delete, get as _get};
|
||||
let delete_prefix = _delete(delete_prefix);
|
||||
// NB: On any changes do not forget to update the OpenAPI spec
|
||||
// in /endpoint_storage/src/openapi_spec.yml.
|
||||
Router::new()
|
||||
.route(
|
||||
"/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
|
||||
|
||||
146
endpoint_storage/src/openapi_spec.yml
Normal file
146
endpoint_storage/src/openapi_spec.yml
Normal file
@@ -0,0 +1,146 @@
|
||||
openapi: "3.0.2"
|
||||
info:
|
||||
title: Endpoint Storage API
|
||||
description: Endpoint Storage API
|
||||
version: "1.0"
|
||||
license:
|
||||
name: "Apache"
|
||||
url: https://github.com/neondatabase/neon/blob/main/LICENSE
|
||||
servers:
|
||||
- url: ""
|
||||
paths:
|
||||
/status:
|
||||
description: Healthcheck endpoint
|
||||
get:
|
||||
description: Healthcheck
|
||||
security: []
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/{tenant_id}/{timeline_id}/{endpoint_id}/{key}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: endpoint_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: key
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
get:
|
||||
description: Get file from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: "File stream from blob storage"
|
||||
content:
|
||||
application/octet-stream:
|
||||
schema:
|
||||
type: string
|
||||
format: binary
|
||||
"400":
|
||||
description: File was not found
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
put:
|
||||
description: Insert file into blob storage. If file exists, override it
|
||||
requestBody:
|
||||
content:
|
||||
application/octet-stream:
|
||||
schema:
|
||||
type: string
|
||||
format: binary
|
||||
responses:
|
||||
"200":
|
||||
description: File was inserted successfully
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
delete:
|
||||
description: Delete file from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: File was successfully deleted or not found
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}/{timeline_id}/{endpoint_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: endpoint_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete endpoint data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Endpoint data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete timeline data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Timeline data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
delete:
|
||||
description: Delete tenant data from blob storage
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant data was deleted
|
||||
"403":
|
||||
description: JWT does not authorize request to this route
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
type: http
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
|
||||
pub version: ExtVersion,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcPrewarmState {
|
||||
#[default]
|
||||
@@ -58,6 +58,17 @@ pub enum LfcPrewarmState {
|
||||
},
|
||||
}
|
||||
|
||||
impl Display for LfcPrewarmState {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
|
||||
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
|
||||
LfcPrewarmState::Completed => f.write_str("Completed"),
|
||||
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcOffloadState {
|
||||
@@ -70,6 +81,23 @@ pub enum LfcOffloadState {
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, PartialEq)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
/// Response of /promote
|
||||
pub enum PromoteState {
|
||||
NotPromoted,
|
||||
Completed,
|
||||
Failed { error: String },
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Default, Debug, Clone)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
/// Result of /safekeepers_lsn
|
||||
pub struct SafekeepersLsn {
|
||||
pub safekeepers: String,
|
||||
pub wal_flush_lsn: utils::lsn::Lsn,
|
||||
}
|
||||
|
||||
/// Response of the /status API
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
@@ -93,6 +121,15 @@ pub enum TerminateMode {
|
||||
Immediate,
|
||||
}
|
||||
|
||||
impl From<TerminateMode> for ComputeStatus {
|
||||
fn from(mode: TerminateMode) -> Self {
|
||||
match mode {
|
||||
TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
|
||||
TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
@@ -113,7 +150,9 @@ pub enum ComputeStatus {
|
||||
// control-plane to terminate it.
|
||||
Failed,
|
||||
// Termination requested
|
||||
TerminationPending { mode: TerminateMode },
|
||||
TerminationPendingFast,
|
||||
// Termination requested, without waiting 30s before returning from /terminate
|
||||
TerminationPendingImmediate,
|
||||
// Terminated Postgres
|
||||
Terminated,
|
||||
}
|
||||
@@ -132,7 +171,10 @@ impl Display for ComputeStatus {
|
||||
ComputeStatus::Running => f.write_str("running"),
|
||||
ComputeStatus::Configuration => f.write_str("configuration"),
|
||||
ComputeStatus::Failed => f.write_str("failed"),
|
||||
ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
|
||||
ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
|
||||
ComputeStatus::TerminationPendingImmediate => {
|
||||
f.write_str("termination-pending-immediate")
|
||||
}
|
||||
ComputeStatus::Terminated => f.write_str("terminated"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::{Instrument, debug, info, info_span, warn};
|
||||
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||
use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};
|
||||
|
||||
use crate::error::{ApiError, api_error_handler, route_error_handler};
|
||||
use crate::request::{get_query_param, parse_query_param};
|
||||
@@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
pub async fn prometheus_metrics_handler(
|
||||
req: Request<Body>,
|
||||
force_metric_collection_on_scrape: bool,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
// HADRON
|
||||
let requested_use_latest = parse_query_param(&req, "use_latest")?;
|
||||
|
||||
let use_latest = match requested_use_latest {
|
||||
None => force_metric_collection_on_scrape,
|
||||
Some(true) => true,
|
||||
Some(false) => {
|
||||
if force_metric_collection_on_scrape {
|
||||
// We don't cache in this case
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
@@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
|
||||
let _span = span.entered();
|
||||
|
||||
let metrics = metrics::gather();
|
||||
// HADRON
|
||||
let collected = if use_latest {
|
||||
// Skip caching the results if we always force metric collection on scrape.
|
||||
METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
|
||||
} else {
|
||||
METRICS_COLLECTOR.last_collected()
|
||||
};
|
||||
|
||||
let gathered_at = std::time::Instant::now();
|
||||
|
||||
let res = encoder
|
||||
.encode(&metrics, &mut writer)
|
||||
.encode(&collected.metrics, &mut writer)
|
||||
.and_then(|_| writer.flush().map_err(|e| e.into()));
|
||||
|
||||
// this instant is not when we finally got the full response sent, sending is done by hyper
|
||||
@@ -295,6 +321,10 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
let encoded_in = encoded_at - gathered_at - writer.wait_time();
|
||||
let total = encoded_at - started_at;
|
||||
|
||||
// HADRON
|
||||
let staleness_ms = (encoded_at - collected.collected_at).as_millis();
|
||||
METRICS_STALE_MILLIS.set(staleness_ms as i64);
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
@@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
|
||||
spawning_ms = spawned_in.as_millis(),
|
||||
collection_ms = collected_in.as_millis(),
|
||||
encoding_ms = encoded_in.as_millis(),
|
||||
stalenss_ms = staleness_ms,
|
||||
"responded /metrics"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -41,17 +41,35 @@ pub fn get_query_param<'a>(
|
||||
Some(q) => q,
|
||||
None => return Ok(None),
|
||||
};
|
||||
let mut values = url::form_urlencoded::parse(query.as_bytes())
|
||||
let values = url::form_urlencoded::parse(query.as_bytes())
|
||||
.filter_map(|(k, v)| if k == param_name { Some(v) } else { None })
|
||||
// we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards
|
||||
.fuse();
|
||||
|
||||
let value1 = values.next();
|
||||
if values.next().is_some() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"param {param_name} specified more than once"
|
||||
)));
|
||||
}
|
||||
// Work around an issue with Alloy's pyroscope scrape where the "seconds"
|
||||
// parameter is added several times. https://github.com/grafana/alloy/issues/3026
|
||||
// TODO: revert after Alloy is fixed.
|
||||
let value1 = values
|
||||
.map(Ok)
|
||||
.reduce(|acc, i| {
|
||||
match acc {
|
||||
Err(_) => acc,
|
||||
|
||||
// It's okay to have duplicates as along as they have the same value.
|
||||
Ok(ref a) if a == &i.unwrap() => acc,
|
||||
|
||||
_ => Err(ApiError::BadRequest(anyhow!(
|
||||
"param {param_name} specified more than once"
|
||||
))),
|
||||
}
|
||||
})
|
||||
.transpose()?;
|
||||
// if values.next().is_some() {
|
||||
// return Err(ApiError::BadRequest(anyhow!(
|
||||
// "param {param_name} specified more than once"
|
||||
// )));
|
||||
// }
|
||||
|
||||
Ok(value1)
|
||||
}
|
||||
|
||||
@@ -92,3 +110,39 @@ pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError>
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_get_query_param_duplicate() {
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert_eq!(value.unwrap(), "1");
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1&testparam=1")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert_eq!(value.unwrap(), "1");
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam").unwrap();
|
||||
assert!(value.is_none());
|
||||
|
||||
let req = Request::builder()
|
||||
.uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3")
|
||||
.body(hyper::Body::empty())
|
||||
.unwrap();
|
||||
let value = get_query_param(&req, "testparam");
|
||||
assert!(value.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ mod tests;
|
||||
|
||||
use const_format::formatcp;
|
||||
use posthog_client_lite::PostHogClientConfig;
|
||||
use utils::serde_percent::Percent;
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
@@ -223,8 +224,9 @@ pub struct ConfigToml {
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
pub test_remote_failures: u64,
|
||||
pub test_remote_failures_probability: u64,
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub background_task_maximum_delay: Duration,
|
||||
@@ -270,9 +272,13 @@ pub struct ConfigToml {
|
||||
pub timeline_import_config: TimelineImportConfig,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub basebackup_cache_config: Option<BasebackupCacheConfig>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_generation_large_timeline_threshold: Option<u64>,
|
||||
pub force_metric_collection_on_scrape: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct DiskUsageEvictionTaskConfig {
|
||||
pub max_usage_pct: utils::serde_percent::Percent,
|
||||
pub min_avail_bytes: u64,
|
||||
@@ -283,6 +289,21 @@ pub struct DiskUsageEvictionTaskConfig {
|
||||
/// Select sorting for evicted layers
|
||||
#[serde(default)]
|
||||
pub eviction_order: EvictionOrder,
|
||||
pub enabled: bool,
|
||||
}
|
||||
|
||||
impl Default for DiskUsageEvictionTaskConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: EvictionOrder::default(),
|
||||
enabled: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -543,6 +564,11 @@ pub struct TenantConfigToml {
|
||||
pub gc_period: Duration,
|
||||
// Delta layer churn threshold to create L1 image layers.
|
||||
pub image_creation_threshold: usize,
|
||||
// HADRON
|
||||
// When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and
|
||||
// (2) create image layers if there are any L1 deltas.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub image_layer_force_creation_period: Option<Duration>,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is time.
|
||||
@@ -738,9 +764,10 @@ impl Default for ConfigToml {
|
||||
|
||||
metric_collection_bucket: (None),
|
||||
|
||||
disk_usage_based_eviction: (None),
|
||||
disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
|
||||
|
||||
test_remote_failures: (0),
|
||||
test_remote_failures_probability: (100),
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: (false),
|
||||
|
||||
@@ -804,6 +831,8 @@ impl Default for ConfigToml {
|
||||
},
|
||||
basebackup_cache_config: None,
|
||||
posthog_config: None,
|
||||
image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
|
||||
force_metric_collection_on_scrape: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -897,6 +926,7 @@ impl Default for TenantConfigToml {
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||
image_layer_force_creation_period: None,
|
||||
pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
|
||||
.expect("cannot parse default PITR interval"),
|
||||
walreceiver_connect_timeout: humantime::parse_duration(
|
||||
|
||||
@@ -384,7 +384,7 @@ pub struct SafekeepersInfo {
|
||||
pub safekeepers: Vec<SafekeeperInfo>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct SafekeeperInfo {
|
||||
pub id: NodeId,
|
||||
pub hostname: String,
|
||||
@@ -597,6 +597,9 @@ pub struct TenantConfigPatch {
|
||||
pub gc_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub image_creation_threshold: FieldPatch<usize>,
|
||||
// HADRON
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub image_layer_force_creation_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub pitr_interval: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
@@ -700,6 +703,11 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
|
||||
// HADRON
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub image_layer_force_creation_period: Option<Duration>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub pitr_interval: Option<Duration>,
|
||||
@@ -798,6 +806,7 @@ impl TenantConfig {
|
||||
mut gc_horizon,
|
||||
mut gc_period,
|
||||
mut image_creation_threshold,
|
||||
mut image_layer_force_creation_period,
|
||||
mut pitr_interval,
|
||||
mut walreceiver_connect_timeout,
|
||||
mut lagging_wal_timeout,
|
||||
@@ -861,6 +870,11 @@ impl TenantConfig {
|
||||
patch
|
||||
.image_creation_threshold
|
||||
.apply(&mut image_creation_threshold);
|
||||
// HADRON
|
||||
patch
|
||||
.image_layer_force_creation_period
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut image_layer_force_creation_period);
|
||||
patch
|
||||
.pitr_interval
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
@@ -942,6 +956,7 @@ impl TenantConfig {
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
image_creation_threshold,
|
||||
image_layer_force_creation_period,
|
||||
pitr_interval,
|
||||
walreceiver_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
@@ -1016,6 +1031,9 @@ impl TenantConfig {
|
||||
image_creation_threshold: self
|
||||
.image_creation_threshold
|
||||
.unwrap_or(global_conf.image_creation_threshold),
|
||||
image_layer_force_creation_period: self
|
||||
.image_layer_force_creation_period
|
||||
.or(global_conf.image_layer_force_creation_period),
|
||||
pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
|
||||
walreceiver_connect_timeout: self
|
||||
.walreceiver_connect_timeout
|
||||
|
||||
12
libs/proxy/json/Cargo.toml
Normal file
12
libs/proxy/json/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "json"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
ryu = "1"
|
||||
itoa = "1"
|
||||
|
||||
[dev-dependencies]
|
||||
futures = "0.3"
|
||||
412
libs/proxy/json/src/lib.rs
Normal file
412
libs/proxy/json/src/lib.rs
Normal file
@@ -0,0 +1,412 @@
|
||||
//! A JSON serialization lib, designed for more flexibility than `serde_json` offers.
|
||||
//!
|
||||
//! Features:
|
||||
//!
|
||||
//! ## Dynamic construction
|
||||
//!
|
||||
//! Sometimes you have dynamic values you want to serialize, that are not already in a serde-aware model like a struct or a Vec etc.
|
||||
//! To achieve this with serde, you need to implement a lot of different traits on a lot of different new-types.
|
||||
//! Because of this, it's often easier to give-in and pull all the data into a serde-aware model (`serde_json::Value` or some intermediate struct),
|
||||
//! but that is often not very efficient.
|
||||
//!
|
||||
//! This crate allows full control over the JSON encoding without needing to implement any extra traits. Just call the
|
||||
//! relevant functions, and it will guarantee a correctly encoded JSON value.
|
||||
//!
|
||||
//! ## Async construction
|
||||
//!
|
||||
//! Similar to the above, sometimes the values arrive asynchronously. Often collecting those values in memory
|
||||
//! is more expensive than writing them as JSON, since the overheads of `Vec` and `String` is much higher, however
|
||||
//! there are exceptions.
|
||||
//!
|
||||
//! Serializing to JSON all in one go is also more CPU intensive and can cause lag spikes,
|
||||
//! whereas serializing values incrementally spreads out the CPU load and reduces lag.
|
||||
//!
|
||||
//! ## Examples
|
||||
//!
|
||||
//! To represent the following JSON as a compact string
|
||||
//!
|
||||
//! ```json
|
||||
//! {
|
||||
//! "results": {
|
||||
//! "rows": [
|
||||
//! {
|
||||
//! "id": 1,
|
||||
//! "value": null
|
||||
//! },
|
||||
//! {
|
||||
//! "id": 2,
|
||||
//! "value": "hello"
|
||||
//! }
|
||||
//! ]
|
||||
//! }
|
||||
//! }
|
||||
//! ```
|
||||
//!
|
||||
//! We can use the following code:
|
||||
//!
|
||||
//! ```
|
||||
//! // create the outer object
|
||||
//! let s = json::value_to_string!(|v| json::value_as_object!(|v| {
|
||||
//! // create an entry with key "results" and start an object value associated with it.
|
||||
//! let results = v.key("results");
|
||||
//! json::value_as_object!(|results| {
|
||||
//! // create an entry with key "rows" and start an list value associated with it.
|
||||
//! let rows = results.key("rows");
|
||||
//! json::value_as_list!(|rows| {
|
||||
//! // create a list entry and start an object value associated with it.
|
||||
//! let row = rows.entry();
|
||||
//! json::value_as_object!(|row| {
|
||||
//! // add entry "id": 1
|
||||
//! row.entry("id", 1);
|
||||
//! // add entry "value": null
|
||||
//! row.entry("value", json::Null);
|
||||
//! });
|
||||
//!
|
||||
//! // create a list entry and start an object value associated with it.
|
||||
//! let row = rows.entry();
|
||||
//! json::value_as_object!(|row| {
|
||||
//! // add entry "id": 2
|
||||
//! row.entry("id", 2);
|
||||
//! // add entry "value": "hello"
|
||||
//! row.entry("value", "hello");
|
||||
//! });
|
||||
//! });
|
||||
//! });
|
||||
//! }));
|
||||
//!
|
||||
//! assert_eq!(s, r#"{"results":{"rows":[{"id":1,"value":null},{"id":2,"value":"hello"}]}}"#);
|
||||
//! ```
|
||||
|
||||
mod macros;
|
||||
mod str;
|
||||
mod value;
|
||||
|
||||
pub use value::{Null, ValueEncoder};
|
||||
|
||||
#[must_use]
|
||||
/// Serialize a single json value.
|
||||
pub struct ValueSer<'buf> {
|
||||
buf: &'buf mut Vec<u8>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ValueSer<'buf> {
|
||||
/// Create a new json value serializer.
|
||||
pub fn new(buf: &'buf mut Vec<u8>) -> Self {
|
||||
Self { buf, start: 0 }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.buf
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn value(self, e: impl ValueEncoder) {
|
||||
e.encode(self);
|
||||
}
|
||||
|
||||
/// Write raw bytes to the buf. This must be already JSON encoded.
|
||||
#[inline]
|
||||
pub fn write_raw_json(self, data: &[u8]) {
|
||||
self.buf.extend_from_slice(data);
|
||||
self.finish();
|
||||
}
|
||||
|
||||
/// Start a new object serializer.
|
||||
#[inline]
|
||||
pub fn object(self) -> ObjectSer<'buf> {
|
||||
ObjectSer::new(self)
|
||||
}
|
||||
|
||||
/// Start a new list serializer.
|
||||
#[inline]
|
||||
pub fn list(self) -> ListSer<'buf> {
|
||||
ListSer::new(self)
|
||||
}
|
||||
|
||||
/// Finish the value ser.
|
||||
#[inline]
|
||||
fn finish(self) {
|
||||
// don't trigger the drop handler which triggers a rollback.
|
||||
// this won't cause memory leaks because `ValueSet` owns no allocations.
|
||||
std::mem::forget(self);
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ValueSer<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.buf.truncate(self.start);
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
/// Serialize a json object.
|
||||
pub struct ObjectSer<'buf> {
|
||||
value: ValueSer<'buf>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ObjectSer<'buf> {
|
||||
/// Start a new object serializer.
|
||||
#[inline]
|
||||
pub fn new(value: ValueSer<'buf>) -> Self {
|
||||
value.buf.push(b'{');
|
||||
let start = value.buf.len();
|
||||
Self { value, start }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.value.as_buffer()
|
||||
}
|
||||
|
||||
/// Start a new object entry with the given string key, returning a [`ValueSer`] for the associated value.
|
||||
#[inline]
|
||||
pub fn key(&mut self, key: impl KeyEncoder) -> ValueSer<'_> {
|
||||
key.write_key(self)
|
||||
}
|
||||
|
||||
/// Write an entry (key-value pair) to the object.
|
||||
#[inline]
|
||||
pub fn entry(&mut self, key: impl KeyEncoder, val: impl ValueEncoder) {
|
||||
self.key(key).value(val);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn entry_inner(&mut self, f: impl FnOnce(&mut Vec<u8>)) -> ValueSer<'_> {
|
||||
// track before the separator so we the value is rolled back it also removes the separator.
|
||||
let start = self.value.buf.len();
|
||||
|
||||
// push separator if necessary
|
||||
if self.value.buf.len() > self.start {
|
||||
self.value.buf.push(b',');
|
||||
}
|
||||
// push key
|
||||
f(self.value.buf);
|
||||
// push value separator
|
||||
self.value.buf.push(b':');
|
||||
|
||||
// return value writer.
|
||||
ValueSer {
|
||||
buf: self.value.buf,
|
||||
start,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the buffer back to before this object was started.
|
||||
#[inline]
|
||||
pub fn rollback(self) -> ValueSer<'buf> {
|
||||
// Do not fully reset the value, only reset it to before the `{`.
|
||||
// This ensures any `,` before this value are not clobbered.
|
||||
self.value.buf.truncate(self.start - 1);
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Finish the object ser.
|
||||
#[inline]
|
||||
pub fn finish(self) {
|
||||
self.value.buf.push(b'}');
|
||||
self.value.finish();
|
||||
}
|
||||
}
|
||||
|
||||
pub trait KeyEncoder {
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a>;
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
/// Serialize a json object.
|
||||
pub struct ListSer<'buf> {
|
||||
value: ValueSer<'buf>,
|
||||
start: usize,
|
||||
}
|
||||
|
||||
impl<'buf> ListSer<'buf> {
|
||||
/// Start a new list serializer.
|
||||
#[inline]
|
||||
pub fn new(value: ValueSer<'buf>) -> Self {
|
||||
value.buf.push(b'[');
|
||||
let start = value.buf.len();
|
||||
Self { value, start }
|
||||
}
|
||||
|
||||
/// Borrow the underlying buffer
|
||||
pub fn as_buffer(&self) -> &[u8] {
|
||||
self.value.as_buffer()
|
||||
}
|
||||
|
||||
/// Write an value to the list.
|
||||
#[inline]
|
||||
pub fn push(&mut self, val: impl ValueEncoder) {
|
||||
self.entry().value(val);
|
||||
}
|
||||
|
||||
/// Start a new value entry in this list.
|
||||
#[inline]
|
||||
pub fn entry(&mut self) -> ValueSer<'_> {
|
||||
// track before the separator so we the value is rolled back it also removes the separator.
|
||||
let start = self.value.buf.len();
|
||||
|
||||
// push separator if necessary
|
||||
if self.value.buf.len() > self.start {
|
||||
self.value.buf.push(b',');
|
||||
}
|
||||
|
||||
// return value writer.
|
||||
ValueSer {
|
||||
buf: self.value.buf,
|
||||
start,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset the buffer back to before this object was started.
|
||||
#[inline]
|
||||
pub fn rollback(self) -> ValueSer<'buf> {
|
||||
// Do not fully reset the value, only reset it to before the `[`.
|
||||
// This ensures any `,` before this value are not clobbered.
|
||||
self.value.buf.truncate(self.start - 1);
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Finish the object ser.
|
||||
#[inline]
|
||||
pub fn finish(self) {
|
||||
self.value.buf.push(b']');
|
||||
self.value.finish();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{Null, ValueSer};
|
||||
|
||||
#[test]
|
||||
fn object() {
|
||||
let mut buf = vec![];
|
||||
let mut object = ValueSer::new(&mut buf).object();
|
||||
object.entry("foo", "bar");
|
||||
object.entry("baz", Null);
|
||||
object.finish();
|
||||
|
||||
assert_eq!(buf, br#"{"foo":"bar","baz":null}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn list() {
|
||||
let mut buf = vec![];
|
||||
let mut list = ValueSer::new(&mut buf).list();
|
||||
list.entry().value("bar");
|
||||
list.entry().value(Null);
|
||||
list.finish();
|
||||
|
||||
assert_eq!(buf, br#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn object_macro() {
|
||||
let res = crate::value_to_string!(|obj| {
|
||||
crate::value_as_object!(|obj| {
|
||||
obj.entry("foo", "bar");
|
||||
obj.entry("baz", Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"{"foo":"bar","baz":null}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn list_macro() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
list.entry().value("bar");
|
||||
list.entry().value(Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_on_drop() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
list.entry().value("bar");
|
||||
|
||||
'cancel: {
|
||||
let nested_list = list.entry();
|
||||
crate::value_as_list!(|nested_list| {
|
||||
nested_list.entry().value(1);
|
||||
|
||||
assert_eq!(nested_list.as_buffer(), br#"["bar",[1"#);
|
||||
if true {
|
||||
break 'cancel;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
assert_eq!(list.as_buffer(), br#"["bar""#);
|
||||
|
||||
list.entry().value(Null);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"["bar",null]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_object() {
|
||||
let res = crate::value_to_string!(|obj| {
|
||||
crate::value_as_object!(|obj| {
|
||||
let entry = obj.key("1");
|
||||
entry.value(1_i32);
|
||||
|
||||
let entry = obj.key("2");
|
||||
let entry = {
|
||||
let mut nested_obj = entry.object();
|
||||
nested_obj.entry("foo", "bar");
|
||||
nested_obj.rollback()
|
||||
};
|
||||
|
||||
entry.value(2_i32);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"{"1":1,"2":2}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rollback_list() {
|
||||
let res = crate::value_to_string!(|list| {
|
||||
crate::value_as_list!(|list| {
|
||||
let entry = list.entry();
|
||||
entry.value(1_i32);
|
||||
|
||||
let entry = list.entry();
|
||||
let entry = {
|
||||
let mut nested_list = entry.list();
|
||||
nested_list.push("foo");
|
||||
nested_list.rollback()
|
||||
};
|
||||
|
||||
entry.value(2_i32);
|
||||
})
|
||||
});
|
||||
|
||||
assert_eq!(res, r#"[1,2]"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn string_escaping() {
|
||||
let mut buf = vec![];
|
||||
let mut object = ValueSer::new(&mut buf).object();
|
||||
|
||||
let key = "hello";
|
||||
let value = "\n world";
|
||||
|
||||
object.entry(format_args!("{key:?}"), value);
|
||||
object.finish();
|
||||
|
||||
assert_eq!(buf, br#"{"\"hello\"":"\n world"}"#);
|
||||
}
|
||||
}
|
||||
86
libs/proxy/json/src/macros.rs
Normal file
86
libs/proxy/json/src/macros.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
//! # Examples
|
||||
//!
|
||||
//! ```
|
||||
//! use futures::{StreamExt, TryStream, TryStreamExt};
|
||||
//!
|
||||
//! async fn stream_to_json_list<S, T, E>(mut s: S) -> Result<String, E>
|
||||
//! where
|
||||
//! S: TryStream<Ok = T, Error = E> + Unpin,
|
||||
//! T: json::ValueEncoder
|
||||
//! {
|
||||
//! Ok(json::value_to_string!(|val| json::value_as_list!(|val| {
|
||||
//! // note how we can use `.await` and `?` in here.
|
||||
//! while let Some(value) = s.try_next().await? {
|
||||
//! val.push(value);
|
||||
//! }
|
||||
//! })))
|
||||
//! }
|
||||
//!
|
||||
//! let stream = futures::stream::iter([1, 2, 3]).map(Ok::<i32, ()>);
|
||||
//! let json_string = futures::executor::block_on(stream_to_json_list(stream)).unwrap();
|
||||
//! assert_eq!(json_string, "[1,2,3]");
|
||||
//! ```
|
||||
|
||||
/// A helper to create a new JSON vec.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_to_vec {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut buf = vec![];
|
||||
let $val = $crate::ValueSer::new(&mut buf);
|
||||
let _: () = $body;
|
||||
buf
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper to create a new JSON string.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_to_string {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
::std::string::String::from_utf8($crate::value_to_vec!(|$val| $body))
|
||||
.expect("json should be valid utf8")
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper that ensures the [`ObjectSer::finish`](crate::ObjectSer::finish) method is called on completion.
|
||||
///
|
||||
/// Consumes `$val` and assigns it as an [`ObjectSer`](crate::ObjectSer) serializer.
|
||||
/// The serializer is only 'finished' if the body completes.
|
||||
/// The serializer is rolled back if `break`/`return` escapes the body.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_as_object {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut obj = $crate::ObjectSer::new($val);
|
||||
|
||||
let $val = &mut obj;
|
||||
let res = $body;
|
||||
|
||||
obj.finish();
|
||||
res
|
||||
}};
|
||||
}
|
||||
|
||||
/// A helper that ensures the [`ListSer::finish`](crate::ListSer::finish) method is called on completion.
|
||||
///
|
||||
/// Consumes `$val` and assigns it as an [`ListSer`](crate::ListSer) serializer.
|
||||
/// The serializer is only 'finished' if the body completes.
|
||||
/// The serializer is rolled back if `break`/`return` escapes the body.
|
||||
///
|
||||
/// Implemented as a macro to preserve all control flow.
|
||||
#[macro_export]
|
||||
macro_rules! value_as_list {
|
||||
(|$val:ident| $body:expr) => {{
|
||||
let mut list = $crate::ListSer::new($val);
|
||||
|
||||
let $val = &mut list;
|
||||
let res = $body;
|
||||
|
||||
list.finish();
|
||||
res
|
||||
}};
|
||||
}
|
||||
166
libs/proxy/json/src/str.rs
Normal file
166
libs/proxy/json/src/str.rs
Normal file
@@ -0,0 +1,166 @@
|
||||
//! Helpers for serializing escaped strings.
|
||||
//!
|
||||
//! ## License
|
||||
//!
|
||||
//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L1514-L1552>
|
||||
//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L2081-L2157>
|
||||
//! Licensed by David Tolnay under MIT or Apache-2.0.
|
||||
//!
|
||||
//! With modifications by Conrad Ludgate on behalf of Databricks.
|
||||
|
||||
use std::fmt::{self, Write};
|
||||
|
||||
/// Represents a character escape code in a type-safe manner.
|
||||
pub enum CharEscape {
|
||||
/// An escaped quote `"`
|
||||
Quote,
|
||||
/// An escaped reverse solidus `\`
|
||||
ReverseSolidus,
|
||||
// /// An escaped solidus `/`
|
||||
// Solidus,
|
||||
/// An escaped backspace character (usually escaped as `\b`)
|
||||
Backspace,
|
||||
/// An escaped form feed character (usually escaped as `\f`)
|
||||
FormFeed,
|
||||
/// An escaped line feed character (usually escaped as `\n`)
|
||||
LineFeed,
|
||||
/// An escaped carriage return character (usually escaped as `\r`)
|
||||
CarriageReturn,
|
||||
/// An escaped tab character (usually escaped as `\t`)
|
||||
Tab,
|
||||
/// An escaped ASCII plane control character (usually escaped as
|
||||
/// `\u00XX` where `XX` are two hex characters)
|
||||
AsciiControl(u8),
|
||||
}
|
||||
|
||||
impl CharEscape {
|
||||
#[inline]
|
||||
fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
|
||||
match escape {
|
||||
self::BB => CharEscape::Backspace,
|
||||
self::TT => CharEscape::Tab,
|
||||
self::NN => CharEscape::LineFeed,
|
||||
self::FF => CharEscape::FormFeed,
|
||||
self::RR => CharEscape::CarriageReturn,
|
||||
self::QU => CharEscape::Quote,
|
||||
self::BS => CharEscape::ReverseSolidus,
|
||||
self::UU => CharEscape::AsciiControl(byte),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn format_escaped_str(writer: &mut Vec<u8>, value: &str) {
|
||||
writer.reserve(2 + value.len());
|
||||
|
||||
writer.push(b'"');
|
||||
|
||||
let rest = format_escaped_str_contents(writer, value);
|
||||
writer.extend_from_slice(rest);
|
||||
|
||||
writer.push(b'"');
|
||||
}
|
||||
|
||||
pub(crate) fn format_escaped_fmt(writer: &mut Vec<u8>, args: fmt::Arguments) {
|
||||
writer.push(b'"');
|
||||
|
||||
Collect { buf: writer }
|
||||
.write_fmt(args)
|
||||
.expect("formatting should not error");
|
||||
|
||||
writer.push(b'"');
|
||||
}
|
||||
|
||||
struct Collect<'buf> {
|
||||
buf: &'buf mut Vec<u8>,
|
||||
}
|
||||
|
||||
impl fmt::Write for Collect<'_> {
|
||||
fn write_str(&mut self, s: &str) -> fmt::Result {
|
||||
let last = format_escaped_str_contents(self.buf, s);
|
||||
self.buf.extend(last);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// writes any escape sequences, and returns the suffix still needed to be written.
|
||||
fn format_escaped_str_contents<'a>(writer: &mut Vec<u8>, value: &'a str) -> &'a [u8] {
|
||||
let bytes = value.as_bytes();
|
||||
|
||||
let mut start = 0;
|
||||
|
||||
for (i, &byte) in bytes.iter().enumerate() {
|
||||
let escape = ESCAPE[byte as usize];
|
||||
if escape == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
writer.extend_from_slice(&bytes[start..i]);
|
||||
|
||||
let char_escape = CharEscape::from_escape_table(escape, byte);
|
||||
write_char_escape(writer, char_escape);
|
||||
|
||||
start = i + 1;
|
||||
}
|
||||
|
||||
&bytes[start..]
|
||||
}
|
||||
|
||||
const BB: u8 = b'b'; // \x08
|
||||
const TT: u8 = b't'; // \x09
|
||||
const NN: u8 = b'n'; // \x0A
|
||||
const FF: u8 = b'f'; // \x0C
|
||||
const RR: u8 = b'r'; // \x0D
|
||||
const QU: u8 = b'"'; // \x22
|
||||
const BS: u8 = b'\\'; // \x5C
|
||||
const UU: u8 = b'u'; // \x00...\x1F except the ones above
|
||||
const __: u8 = 0;
|
||||
|
||||
// Lookup table of escape sequences. A value of b'x' at index i means that byte
|
||||
// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
|
||||
static ESCAPE: [u8; 256] = [
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
|
||||
UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
|
||||
__, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
|
||||
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
|
||||
];
|
||||
|
||||
fn write_char_escape(writer: &mut Vec<u8>, char_escape: CharEscape) {
|
||||
let s = match char_escape {
|
||||
CharEscape::Quote => b"\\\"",
|
||||
CharEscape::ReverseSolidus => b"\\\\",
|
||||
// CharEscape::Solidus => b"\\/",
|
||||
CharEscape::Backspace => b"\\b",
|
||||
CharEscape::FormFeed => b"\\f",
|
||||
CharEscape::LineFeed => b"\\n",
|
||||
CharEscape::CarriageReturn => b"\\r",
|
||||
CharEscape::Tab => b"\\t",
|
||||
CharEscape::AsciiControl(byte) => {
|
||||
static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
|
||||
let bytes = &[
|
||||
b'\\',
|
||||
b'u',
|
||||
b'0',
|
||||
b'0',
|
||||
HEX_DIGITS[(byte >> 4) as usize],
|
||||
HEX_DIGITS[(byte & 0xF) as usize],
|
||||
];
|
||||
return writer.extend_from_slice(bytes);
|
||||
}
|
||||
};
|
||||
|
||||
writer.extend_from_slice(s);
|
||||
}
|
||||
168
libs/proxy/json/src/value.rs
Normal file
168
libs/proxy/json/src/value.rs
Normal file
@@ -0,0 +1,168 @@
|
||||
use core::fmt;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
|
||||
use crate::str::{format_escaped_fmt, format_escaped_str};
|
||||
use crate::{KeyEncoder, ObjectSer, ValueSer, value_as_list, value_as_object};
|
||||
|
||||
/// Write a value to the underlying json representation.
|
||||
pub trait ValueEncoder {
|
||||
fn encode(self, v: ValueSer<'_>);
|
||||
}
|
||||
|
||||
pub(crate) fn write_int(x: impl itoa::Integer, b: &mut Vec<u8>) {
|
||||
b.extend_from_slice(itoa::Buffer::new().format(x).as_bytes());
|
||||
}
|
||||
|
||||
pub(crate) fn write_float(x: impl ryu::Float, b: &mut Vec<u8>) {
|
||||
b.extend_from_slice(ryu::Buffer::new().format(x).as_bytes());
|
||||
}
|
||||
|
||||
impl<T: Copy + ValueEncoder> ValueEncoder for &T {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
T::encode(*self, v);
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueEncoder for &str {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
format_escaped_str(v.buf, self);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueEncoder for fmt::Arguments<'_> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
if let Some(s) = self.as_str() {
|
||||
format_escaped_str(v.buf, s);
|
||||
} else {
|
||||
format_escaped_fmt(v.buf, self);
|
||||
}
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! int {
|
||||
[$($t:ty),*] => {
|
||||
$(
|
||||
impl ValueEncoder for $t {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
write_int(self, v.buf);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
)*
|
||||
};
|
||||
}
|
||||
|
||||
int![u8, u16, u32, u64, usize, u128];
|
||||
int![i8, i16, i32, i64, isize, i128];
|
||||
|
||||
macro_rules! float {
|
||||
[$($t:ty),*] => {
|
||||
$(
|
||||
impl ValueEncoder for $t {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
write_float(self, v.buf);
|
||||
v.finish();
|
||||
}
|
||||
}
|
||||
)*
|
||||
};
|
||||
}
|
||||
|
||||
float![f32, f64];
|
||||
|
||||
impl ValueEncoder for bool {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
v.write_raw_json(if self { b"true" } else { b"false" });
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ValueEncoder> ValueEncoder for Option<T> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
match self {
|
||||
Some(value) => value.encode(v),
|
||||
None => Null.encode(v),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyEncoder for &str {
|
||||
#[inline]
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
|
||||
let obj = &mut *obj;
|
||||
obj.entry_inner(|b| format_escaped_str(b, self))
|
||||
}
|
||||
}
|
||||
|
||||
impl KeyEncoder for fmt::Arguments<'_> {
|
||||
#[inline]
|
||||
fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
|
||||
if let Some(key) = self.as_str() {
|
||||
obj.entry_inner(|b| format_escaped_str(b, key))
|
||||
} else {
|
||||
obj.entry_inner(|b| format_escaped_fmt(b, self))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the JSON null value.
|
||||
pub struct Null;
|
||||
|
||||
impl ValueEncoder for Null {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
v.write_raw_json(b"null");
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ValueEncoder> ValueEncoder for Vec<T> {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
value_as_list!(|v| {
|
||||
for t in self {
|
||||
v.entry().value(t);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Copy + ValueEncoder> ValueEncoder for &[T] {
|
||||
#[inline]
|
||||
fn encode(self, v: ValueSer<'_>) {
|
||||
value_as_list!(|v| {
|
||||
for t in self {
|
||||
v.entry().value(t);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: KeyEncoder, V: ValueEncoder, S> ValueEncoder for HashMap<K, V, S> {
|
||||
#[inline]
|
||||
fn encode(self, o: ValueSer<'_>) {
|
||||
value_as_object!(|o| {
|
||||
for (k, v) in self {
|
||||
o.entry(k, v);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: KeyEncoder, V: ValueEncoder> ValueEncoder for BTreeMap<K, V> {
|
||||
#[inline]
|
||||
fn encode(self, o: ValueSer<'_>) {
|
||||
value_as_object!(|o| {
|
||||
for (k, v) in self {
|
||||
o.entry(k, v);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -13,6 +13,7 @@ aws-smithy-async.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
base64.workspace = true
|
||||
bytes.workspace = true
|
||||
camino = { workspace = true, features = ["serde1"] }
|
||||
humantime-serde.workspace = true
|
||||
@@ -41,6 +42,9 @@ http-body-util.workspace = true
|
||||
itertools.workspace = true
|
||||
sync_wrapper = { workspace = true, features = ["futures"] }
|
||||
|
||||
byteorder = "1.4"
|
||||
rand = "0.8.5"
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
test-context.workspace = true
|
||||
|
||||
@@ -14,17 +14,25 @@ use anyhow::{Context, Result, anyhow};
|
||||
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
|
||||
use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::blob::operations::GetBlobBuilder;
|
||||
use azure_storage_blobs::blob::BlobBlockType;
|
||||
use azure_storage_blobs::blob::BlockList;
|
||||
use azure_storage_blobs::blob::{Blob, CopyStatus};
|
||||
use azure_storage_blobs::container::operations::ListBlobsBuilder;
|
||||
use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient};
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||
use base64::{Engine as _, engine::general_purpose::URL_SAFE};
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::FutureExt;
|
||||
use futures::future::Either;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use http_types::{StatusCode, Url};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::io::AsyncSeekExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use utils::backoff;
|
||||
@@ -51,6 +59,9 @@ pub struct AzureBlobStorage {
|
||||
|
||||
// Alternative timeout used for metadata objects which are expected to be small
|
||||
pub small_timeout: Duration,
|
||||
/* BEGIN_HADRON */
|
||||
pub put_block_size_mb: Option<usize>,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
impl AzureBlobStorage {
|
||||
@@ -107,6 +118,9 @@ impl AzureBlobStorage {
|
||||
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
|
||||
timeout,
|
||||
small_timeout,
|
||||
/* BEGIN_HADRON */
|
||||
put_block_size_mb: azure_config.put_block_size_mb,
|
||||
/* END_HADRON */
|
||||
})
|
||||
}
|
||||
|
||||
@@ -583,31 +597,137 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let op = async {
|
||||
let mut metadata_map = metadata.unwrap_or([].into());
|
||||
let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block");
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let op = async move {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024;
|
||||
if timeline_file_path.is_none() || put_block_size == 0 {
|
||||
// Use put_block_blob directly.
|
||||
let from: Pin<
|
||||
Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
|
||||
> = Box::pin(from);
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
if !metadata_map.0.is_empty() {
|
||||
builder = builder.metadata(to_azure_metadata(metadata_map));
|
||||
}
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(self.timeout, fut);
|
||||
let result = fut.await;
|
||||
match result {
|
||||
Ok(Ok(_response)) => return Ok(()),
|
||||
Ok(Err(azure)) => return Err(azure.into()),
|
||||
Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
};
|
||||
}
|
||||
// Upload chunks concurrently using Put Block.
|
||||
// Each PutBlock uploads put_block_size bytes of the file.
|
||||
let mut upload_futures: Vec<tokio::task::JoinHandle<Result<(), azure_core::Error>>> =
|
||||
vec![];
|
||||
let mut block_list = BlockList::default();
|
||||
let mut start_bytes = 0u64;
|
||||
let mut remaining_bytes = data_size_bytes;
|
||||
let mut block_list_count = 0;
|
||||
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
while remaining_bytes > 0 {
|
||||
let block_size = std::cmp::min(remaining_bytes, put_block_size);
|
||||
let end_bytes = start_bytes + block_size as u64;
|
||||
let block_id = block_list_count;
|
||||
let timeout = self.timeout;
|
||||
let blob_client = blob_client.clone();
|
||||
let timeline_file = timeline_file_path.clone().unwrap().clone();
|
||||
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
let mut encoded_block_id = [0u8; 8];
|
||||
BigEndian::write_u64(&mut encoded_block_id, block_id);
|
||||
URL_SAFE.encode(encoded_block_id);
|
||||
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
// Put one block.
|
||||
let part_fut = async move {
|
||||
let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?;
|
||||
file.seek(io::SeekFrom::Start(start_bytes)).await?;
|
||||
let limited_reader = file.take(block_size as u64);
|
||||
let file_chunk_stream =
|
||||
tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024);
|
||||
let file_chunk_stream_pin: Pin<
|
||||
Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
|
||||
> = Box::pin(file_chunk_stream);
|
||||
let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size);
|
||||
let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper));
|
||||
// Azure put block takes URL-encoded block ids and all blocks must have the same byte length.
|
||||
// https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters
|
||||
let builder = blob_client.put_block(encoded_block_id.to_vec(), body);
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(timeout, fut);
|
||||
let result = fut.await;
|
||||
tracing::debug!(
|
||||
"azure put block id-{} size {} start {} end {} file {} response {:#?}",
|
||||
block_id,
|
||||
block_size,
|
||||
start_bytes,
|
||||
end_bytes,
|
||||
timeline_file,
|
||||
result
|
||||
);
|
||||
match result {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure),
|
||||
Err(_timeout) => Err(azure_core::Error::new(
|
||||
azure_core::error::ErrorKind::Io,
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::TimedOut,
|
||||
"Operation timed out",
|
||||
),
|
||||
)),
|
||||
}
|
||||
};
|
||||
upload_futures.push(tokio::spawn(part_fut));
|
||||
|
||||
if let Some(metadata) = metadata {
|
||||
builder = builder.metadata(to_azure_metadata(metadata));
|
||||
block_list_count += 1;
|
||||
remaining_bytes -= block_size;
|
||||
start_bytes += block_size as u64;
|
||||
|
||||
block_list
|
||||
.blocks
|
||||
.push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into()));
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
"azure put blocks {} total MB: {} chunk size MB: {}",
|
||||
block_list_count,
|
||||
data_size_bytes / 1024 / 1024,
|
||||
put_block_size / 1024 / 1024
|
||||
);
|
||||
// Wait for all blocks to be uploaded.
|
||||
let upload_results = futures::future::try_join_all(upload_futures).await;
|
||||
if upload_results.is_err() {
|
||||
return Err(anyhow::anyhow!(format!(
|
||||
"Failed to upload all blocks {:#?}",
|
||||
upload_results.unwrap_err()
|
||||
)));
|
||||
}
|
||||
|
||||
// Commit the blocks.
|
||||
let mut builder = blob_client.put_block_list(block_list);
|
||||
if !metadata_map.0.is_empty() {
|
||||
builder = builder.metadata(to_azure_metadata(metadata_map));
|
||||
}
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(self.timeout, fut);
|
||||
let result = fut.await;
|
||||
tracing::debug!("azure put block list response {:#?}", result);
|
||||
|
||||
match fut.await {
|
||||
match result {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
|
||||
}
|
||||
};
|
||||
/* END_HADRON */
|
||||
|
||||
let res = tokio::select! {
|
||||
res = op => res,
|
||||
@@ -622,7 +742,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, outcome, started_at);
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
|
||||
@@ -195,8 +195,19 @@ pub struct AzureConfig {
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
#[serde(default = "default_azure_conn_pool_size")]
|
||||
pub conn_pool_size: usize,
|
||||
/* BEGIN_HADRON */
|
||||
#[serde(default = "default_azure_put_block_size_mb")]
|
||||
pub put_block_size_mb: Option<usize>,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
fn default_azure_put_block_size_mb() -> Option<usize> {
|
||||
// Disable parallel upload by default.
|
||||
Some(0)
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
|
||||
NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
|
||||
}
|
||||
@@ -213,6 +224,9 @@ impl Debug for AzureConfig {
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
/* BEGIN_HADRON */
|
||||
.field("put_block_size_mb", &self.put_block_size_mb)
|
||||
/* END_HADRON */
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -352,6 +366,7 @@ timeout = '5s'";
|
||||
upload_storage_class = 'INTELLIGENT_TIERING'
|
||||
timeout = '7s'
|
||||
conn_pool_size = 8
|
||||
put_block_size_mb = 1024
|
||||
";
|
||||
|
||||
let config = parse(toml).unwrap();
|
||||
@@ -367,6 +382,9 @@ timeout = '5s'";
|
||||
concurrency_limit: default_remote_storage_azure_concurrency_limit(),
|
||||
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
|
||||
conn_pool_size: 8,
|
||||
/* BEGIN_HADRON */
|
||||
put_block_size_mb: Some(1024),
|
||||
/* END_HADRON */
|
||||
}),
|
||||
timeout: Duration::from_secs(7),
|
||||
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
|
||||
|
||||
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
|
||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
|
||||
/* BEGIN_HADRON */
|
||||
pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
|
||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(
|
||||
s,
|
||||
fail_first,
|
||||
fail_probability,
|
||||
)))
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
|
||||
pub async fn upload_storage_object(
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! This module provides a wrapper around a real RemoteStorage implementation that
|
||||
//! causes the first N attempts at each upload or download operatio to fail. For
|
||||
//! testing purposes.
|
||||
use rand::Rng;
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::num::NonZeroU32;
|
||||
@@ -25,6 +27,12 @@ pub struct UnreliableWrapper {
|
||||
|
||||
// Tracks how many failed attempts of each operation has been made.
|
||||
attempts: Mutex<HashMap<RemoteOp, u64>>,
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
// This the probability of failure for each operation, ranged from [0, 100].
|
||||
// The probability is default to 100, which means that all operations will fail.
|
||||
attempt_failure_probability: u64,
|
||||
/* END_HADRON */
|
||||
}
|
||||
|
||||
/// Used to identify retries of different unique operation.
|
||||
@@ -40,7 +48,11 @@ enum RemoteOp {
|
||||
}
|
||||
|
||||
impl UnreliableWrapper {
|
||||
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
|
||||
pub fn new(
|
||||
inner: crate::GenericRemoteStorage,
|
||||
attempts_to_fail: u64,
|
||||
attempt_failure_probability: u64,
|
||||
) -> Self {
|
||||
assert!(attempts_to_fail > 0);
|
||||
let inner = match inner {
|
||||
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
|
||||
@@ -51,9 +63,11 @@ impl UnreliableWrapper {
|
||||
panic!("Can't wrap unreliable wrapper unreliably")
|
||||
}
|
||||
};
|
||||
let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
|
||||
UnreliableWrapper {
|
||||
inner,
|
||||
attempts_to_fail,
|
||||
attempt_failure_probability: actual_attempt_failure_probability,
|
||||
attempts: Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
@@ -66,6 +80,7 @@ impl UnreliableWrapper {
|
||||
///
|
||||
fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
|
||||
let mut attempts = self.attempts.lock().unwrap();
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
match attempts.entry(op) {
|
||||
Entry::Occupied(mut e) => {
|
||||
@@ -75,15 +90,19 @@ impl UnreliableWrapper {
|
||||
*p
|
||||
};
|
||||
|
||||
if attempts_before_this >= self.attempts_to_fail {
|
||||
// let it succeed
|
||||
e.remove();
|
||||
Ok(attempts_before_this)
|
||||
} else {
|
||||
/* BEGIN_HADRON */
|
||||
// If there are more attempts to fail, fail the request by probability.
|
||||
if (attempts_before_this < self.attempts_to_fail)
|
||||
&& (rng.gen_range(0..=100) < self.attempt_failure_probability)
|
||||
{
|
||||
let error =
|
||||
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||
Err(error)
|
||||
} else {
|
||||
e.remove();
|
||||
Ok(attempts_before_this)
|
||||
}
|
||||
/* END_HADRON */
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||
|
||||
@@ -165,10 +165,42 @@ pub(crate) async fn upload_remote_data(
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let mut metadata = None;
|
||||
if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) {
|
||||
let file_path = "/tmp/dbx_upload_tmp_file.txt";
|
||||
{
|
||||
// Open the file in append mode
|
||||
let mut file = std::fs::OpenOptions::new()
|
||||
.append(true)
|
||||
.create(true) // Create the file if it doesn't exist
|
||||
.open(file_path)?;
|
||||
// Append some bytes to the file
|
||||
std::io::Write::write_all(
|
||||
&mut file,
|
||||
&format!("remote blob data {i}").into_bytes(),
|
||||
)?;
|
||||
file.sync_all()?;
|
||||
}
|
||||
metadata = Some(remote_storage::StorageMetadata::from([(
|
||||
"databricks_azure_put_block",
|
||||
file_path,
|
||||
)]));
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
task_client
|
||||
.upload(data, data_len, &blob_path, None, &cancel)
|
||||
.upload(data, data_len, &blob_path, metadata, &cancel)
|
||||
.await?;
|
||||
|
||||
// TODO: Check upload is using the put_block upload.
|
||||
// We cannot consume data here since data is moved inside the upload.
|
||||
// let total_bytes = data.fold(0, |acc, chunk| async move {
|
||||
// acc + chunk.map(|bytes| bytes.len()).unwrap_or(0)
|
||||
// }).await;
|
||||
// assert_eq!(total_bytes, data_len);
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
@@ -219,6 +219,9 @@ async fn create_azure_client(
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response,
|
||||
conn_pool_size: 8,
|
||||
/* BEGIN_HADRON */
|
||||
put_block_size_mb: Some(1),
|
||||
/* END_HADRON */
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
|
||||
|
||||
@@ -221,7 +221,7 @@ pub struct TimelineMembershipSwitchRequest {
|
||||
pub struct TimelineMembershipSwitchResponse {
|
||||
pub previous_conf: Configuration,
|
||||
pub current_conf: Configuration,
|
||||
pub term: Term,
|
||||
pub last_log_term: Term,
|
||||
pub flush_lsn: Lsn,
|
||||
}
|
||||
|
||||
|
||||
@@ -44,3 +44,62 @@ where
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
pub enum DeploymentMode {
|
||||
Dev,
|
||||
Staging,
|
||||
Prod,
|
||||
}
|
||||
|
||||
pub fn get_deployment_mode() -> Option<DeploymentMode> {
|
||||
match std::env::var("DEPLOYMENT_MODE") {
|
||||
Ok(env) => match env.as_str() {
|
||||
"development" => Some(DeploymentMode::Dev),
|
||||
"staging" => Some(DeploymentMode::Staging),
|
||||
"production" => Some(DeploymentMode::Prod),
|
||||
_ => {
|
||||
tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
tracing::error!("DEPLOYMENT_MODE not set");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_dev_or_staging() -> bool {
|
||||
matches!(
|
||||
get_deployment_mode(),
|
||||
Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
|
||||
)
|
||||
}
|
||||
|
||||
pub enum TestingMode {
|
||||
Chaos,
|
||||
Stress,
|
||||
}
|
||||
|
||||
pub fn get_test_mode() -> Option<TestingMode> {
|
||||
match std::env::var("HADRON_TEST_MODE") {
|
||||
Ok(env) => match env.as_str() {
|
||||
"chaos" => Some(TestingMode::Chaos),
|
||||
"stress" => Some(TestingMode::Stress),
|
||||
_ => {
|
||||
tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
tracing::error!("HADRON_TEST_MODE not set");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_chaos_testing() -> bool {
|
||||
matches!(get_test_mode(), Some(TestingMode::Chaos))
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
@@ -99,6 +99,8 @@ pub mod elapsed_accum;
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod linux_socket_ioctl;
|
||||
|
||||
pub mod metrics_collector;
|
||||
|
||||
// Re-export used in macro. Avoids adding git-version as dep in target crates.
|
||||
#[doc(hidden)]
|
||||
pub use git_version;
|
||||
|
||||
75
libs/utils/src/metrics_collector.rs
Normal file
75
libs/utils/src/metrics_collector.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
use std::{
|
||||
sync::{Arc, RwLock},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use metrics::{IntGauge, proto::MetricFamily, register_int_gauge};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub static METRICS_STALE_MILLIS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"metrics_metrics_stale_milliseconds",
|
||||
"The current metrics stale time in milliseconds"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CollectedMetrics {
|
||||
pub metrics: Vec<MetricFamily>,
|
||||
pub collected_at: Instant,
|
||||
}
|
||||
|
||||
impl CollectedMetrics {
|
||||
fn new(metrics: Vec<MetricFamily>) -> Self {
|
||||
Self {
|
||||
metrics,
|
||||
collected_at: Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MetricsCollector {
|
||||
last_collected: RwLock<Arc<CollectedMetrics>>,
|
||||
}
|
||||
|
||||
impl MetricsCollector {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))),
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(name = "metrics_collector", skip_all)]
|
||||
pub fn run_once(&self, cache_metrics: bool) -> Arc<CollectedMetrics> {
|
||||
let started = Instant::now();
|
||||
let metrics = metrics::gather();
|
||||
let collected = Arc::new(CollectedMetrics::new(metrics));
|
||||
if cache_metrics {
|
||||
let mut guard = self.last_collected.write().unwrap();
|
||||
*guard = collected.clone();
|
||||
}
|
||||
tracing::info!(
|
||||
"Collected {} metric families in {} ms",
|
||||
collected.metrics.len(),
|
||||
started.elapsed().as_millis()
|
||||
);
|
||||
collected
|
||||
}
|
||||
|
||||
pub fn last_collected(&self) -> Arc<CollectedMetrics> {
|
||||
self.last_collected.read().unwrap().clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MetricsCollector {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
// Interval for metrics collection. Currently hard-coded to be the same as the metrics scape interval from the obs agent
|
||||
pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
|
||||
|
||||
pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
|
||||
@@ -49,12 +49,6 @@ pub struct TenantShardId {
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardCount {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
pub const MIN: Self = Self(0);
|
||||
@@ -177,6 +171,12 @@ impl std::fmt::Display for ShardNumber {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardCount {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardSlug<'_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
|
||||
@@ -428,6 +428,12 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
shard_number: 0,
|
||||
};
|
||||
|
||||
let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
|
||||
should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
|
||||
sent_bytes: 0,
|
||||
last_recorded_time_us: 0,
|
||||
};
|
||||
|
||||
crate::bindings::WalproposerShmemState {
|
||||
propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
donor_name: [0; 64],
|
||||
@@ -441,6 +447,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
num_shards: 0,
|
||||
replica_promote: false,
|
||||
min_ps_feedback: empty_feedback,
|
||||
wal_rate_limiter: empty_wal_rate_limiter,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -114,6 +114,7 @@ twox-hash.workspace = true
|
||||
procfs.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
base64.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::error::Error as _;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -251,6 +251,70 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_timeline_compact(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
force_image_layer_creation: bool,
|
||||
must_force_image_layer_creation: bool,
|
||||
scheduled: bool,
|
||||
wait_until_done: bool,
|
||||
) -> Result<()> {
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact",
|
||||
self.mgmt_api_endpoint
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
|
||||
if force_image_layer_creation {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("force_image_layer_creation", "true");
|
||||
}
|
||||
|
||||
if must_force_image_layer_creation {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("must_force_image_layer_creation", "true");
|
||||
}
|
||||
|
||||
if scheduled {
|
||||
path.query_pairs_mut().append_pair("scheduled", "true");
|
||||
}
|
||||
if wait_until_done {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("wait_until_scheduled_compaction_done", "true");
|
||||
path.query_pairs_mut()
|
||||
.append_pair("wait_until_uploaded", "true");
|
||||
}
|
||||
self.request(Method::PUT, path, ()).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
pub async fn tenant_timeline_describe(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<TimelineInfo> {
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
|
||||
self.mgmt_api_endpoint
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
path.query_pairs_mut()
|
||||
.append_pair("include-image-consistent-lsn", "true");
|
||||
|
||||
let response: reqwest::Response = self.request(Method::GET, path, ()).await?;
|
||||
let body = response.json().await.map_err(Error::ReceiveBody)?;
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
pub async fn list_tenant_visible_size(&self) -> Result<BTreeMap<TenantShardId, u64>> {
|
||||
let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint);
|
||||
let resp = self.get(&uri).await?;
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
pub async fn tenant_scan_remote_storage(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
[package]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
testing = ["pageserver_api/testing"]
|
||||
@@ -10,35 +11,14 @@ testing = ["pageserver_api/testing"]
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
bytes.workspace = true
|
||||
compute_api.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
thiserror.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tonic.workspace = true
|
||||
tracing.workspace = true
|
||||
tokio = { version = "1.43.1", features = [
|
||||
"full",
|
||||
"macros",
|
||||
"net",
|
||||
"io-util",
|
||||
"rt",
|
||||
"rt-multi-thread",
|
||||
] }
|
||||
uuid = { version = "1", features = ["v4"] }
|
||||
tower = { version = "0.4", features = ["timeout", "util"] }
|
||||
rand = "0.8"
|
||||
tokio-util = { version = "0.7", features = ["compat"] }
|
||||
hyper-util = "0.1.9"
|
||||
hyper = "1.6.0"
|
||||
metrics.workspace = true
|
||||
priority-queue = "2.3.1"
|
||||
scopeguard.workspace = true
|
||||
async-trait = { version = "0.1" }
|
||||
tokio-stream = "0.1"
|
||||
dashmap = "5"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
compute_api.workspace = true
|
||||
|
||||
|
||||
pageserver_page_api.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -6,18 +6,18 @@ use anyhow::anyhow;
|
||||
use arc_swap::ArcSwap;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt as _, StreamExt as _};
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tracing::instrument;
|
||||
|
||||
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
|
||||
use crate::retry::Retry;
|
||||
use crate::split::GetPageSplitter;
|
||||
use compute_api::spec::PageserverProtocol;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
pub use pageserver_api::shard::ShardStripeSize;
|
||||
|
||||
/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
|
||||
/// when full.
|
||||
///
|
||||
@@ -63,6 +63,8 @@ pub struct PageserverClient {
|
||||
timeline_id: TimelineId,
|
||||
/// The JWT auth token for this tenant, if any.
|
||||
auth_token: Option<String>,
|
||||
/// The compression to use, if any.
|
||||
compression: Option<CompressionEncoding>,
|
||||
/// The shards for this tenant.
|
||||
shards: ArcSwap<Shards>,
|
||||
/// The retry configuration.
|
||||
@@ -77,12 +79,20 @@ impl PageserverClient {
|
||||
timeline_id: TimelineId,
|
||||
shard_spec: ShardSpec,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let shards = Shards::new(tenant_id, timeline_id, shard_spec, auth_token.clone())?;
|
||||
let shards = Shards::new(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_spec,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
)?;
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
auth_token,
|
||||
compression,
|
||||
shards: ArcSwap::new(Arc::new(shards)),
|
||||
retry: Retry,
|
||||
})
|
||||
@@ -94,11 +104,33 @@ impl PageserverClient {
|
||||
/// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
|
||||
/// properly spun down and dropped afterwards.
|
||||
pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
|
||||
// Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races
|
||||
// with concurrent updates, but that involves creating a new `Shards` on every attempt,
|
||||
// which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere
|
||||
// in the stack, and if they're violated then we already have problems elsewhere, so a
|
||||
// best-effort but possibly-racy check is okay here.
|
||||
let old = self.shards.load_full();
|
||||
if shard_spec.count < old.count {
|
||||
return Err(anyhow!(
|
||||
"can't reduce shard count from {} to {}",
|
||||
old.count,
|
||||
shard_spec.count
|
||||
));
|
||||
}
|
||||
if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size {
|
||||
return Err(anyhow!(
|
||||
"can't change stripe size from {} to {}",
|
||||
old.stripe_size,
|
||||
shard_spec.stripe_size
|
||||
));
|
||||
}
|
||||
|
||||
let shards = Shards::new(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
shard_spec,
|
||||
self.auth_token.clone(),
|
||||
self.compression,
|
||||
)?;
|
||||
self.shards.store(Arc::new(shards));
|
||||
Ok(())
|
||||
@@ -111,7 +143,7 @@ impl PageserverClient {
|
||||
req: page_api::CheckRelExistsRequest,
|
||||
) -> tonic::Result<page_api::CheckRelExistsResponse> {
|
||||
self.retry
|
||||
.with(async || {
|
||||
.with(async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
client.check_rel_exists(req).await
|
||||
@@ -126,7 +158,7 @@ impl PageserverClient {
|
||||
req: page_api::GetDbSizeRequest,
|
||||
) -> tonic::Result<page_api::GetDbSizeResponse> {
|
||||
self.retry
|
||||
.with(async || {
|
||||
.with(async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
client.get_db_size(req).await
|
||||
@@ -134,8 +166,9 @@ impl PageserverClient {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
|
||||
/// splits requests that straddle shard boundaries, and assembles the responses.
|
||||
/// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
|
||||
/// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle
|
||||
/// shard boundaries, and assembles the responses.
|
||||
///
|
||||
/// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
|
||||
/// errors. All responses will have `GetPageStatusCode::Ok`.
|
||||
@@ -155,6 +188,10 @@ impl PageserverClient {
|
||||
if req.block_numbers.is_empty() {
|
||||
return Err(tonic::Status::invalid_argument("no block number"));
|
||||
}
|
||||
// The request attempt must be 0. The client will increment it internally.
|
||||
if req.request_id.attempt != 0 {
|
||||
return Err(tonic::Status::invalid_argument("request attempt must be 0"));
|
||||
}
|
||||
|
||||
// The shards may change while we're fetching pages. We execute the request using a stable
|
||||
// view of the shards (especially important for requests that span shards), but retry the
|
||||
@@ -165,7 +202,11 @@ impl PageserverClient {
|
||||
// TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
|
||||
// once we figure out how to handle these.
|
||||
self.retry
|
||||
.with(async || Self::get_page_with_shards(req.clone(), &self.shards.load_full()).await)
|
||||
.with(async |attempt| {
|
||||
let mut req = req.clone();
|
||||
req.request_id.attempt = attempt as u32;
|
||||
Self::get_page_with_shards(req, &self.shards.load_full()).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -177,7 +218,7 @@ impl PageserverClient {
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// Fast path: request is for a single shard.
|
||||
if let Some(shard_id) =
|
||||
GetPageSplitter::is_single_shard(&req, shards.count, shards.stripe_size)
|
||||
GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
|
||||
{
|
||||
return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
|
||||
}
|
||||
@@ -194,10 +235,10 @@ impl PageserverClient {
|
||||
}
|
||||
|
||||
while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
|
||||
splitter.add_response(shard_id, shard_response);
|
||||
splitter.add_response(shard_id, shard_response)?;
|
||||
}
|
||||
|
||||
splitter.assemble_response()
|
||||
splitter.get_response()
|
||||
}
|
||||
|
||||
/// Fetches pages on the given shard. Does not retry internally.
|
||||
@@ -205,9 +246,8 @@ impl PageserverClient {
|
||||
req: page_api::GetPageRequest,
|
||||
shard: &Shard,
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
let expected = req.block_numbers.len();
|
||||
let stream = shard.stream(req.request_class.is_bulk()).await;
|
||||
let resp = stream.send(req).await?;
|
||||
let resp = stream.send(req.clone()).await?;
|
||||
|
||||
// Convert per-request errors into a tonic::Status.
|
||||
if resp.status_code != page_api::GetPageStatusCode::Ok {
|
||||
@@ -217,11 +257,27 @@ impl PageserverClient {
|
||||
));
|
||||
}
|
||||
|
||||
// Check that we received the expected number of pages.
|
||||
let actual = resp.page_images.len();
|
||||
if expected != actual {
|
||||
return Err(tonic::Status::data_loss(format!(
|
||||
"expected {expected} pages, got {actual}",
|
||||
// Check that we received the expected pages.
|
||||
if req.rel != resp.rel {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {} returned wrong relation, expected {} got {}",
|
||||
shard.id, req.rel, resp.rel
|
||||
)));
|
||||
}
|
||||
if !req
|
||||
.block_numbers
|
||||
.iter()
|
||||
.copied()
|
||||
.eq(resp.pages.iter().map(|p| p.block_number))
|
||||
{
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {} returned wrong pages, expected {:?} got {:?}",
|
||||
shard.id,
|
||||
req.block_numbers,
|
||||
resp.pages
|
||||
.iter()
|
||||
.map(|page| page.block_number)
|
||||
.collect::<Vec<_>>()
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -235,7 +291,7 @@ impl PageserverClient {
|
||||
req: page_api::GetRelSizeRequest,
|
||||
) -> tonic::Result<page_api::GetRelSizeResponse> {
|
||||
self.retry
|
||||
.with(async || {
|
||||
.with(async |_| {
|
||||
// Relation metadata is only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
client.get_rel_size(req).await
|
||||
@@ -250,7 +306,7 @@ impl PageserverClient {
|
||||
req: page_api::GetSlruSegmentRequest,
|
||||
) -> tonic::Result<page_api::GetSlruSegmentResponse> {
|
||||
self.retry
|
||||
.with(async || {
|
||||
.with(async |_| {
|
||||
// SLRU segments are only available on shard 0.
|
||||
let mut client = self.shards.load_full().get_zero().client().await?;
|
||||
client.get_slru_segment(req).await
|
||||
@@ -344,13 +400,21 @@ impl Shards {
|
||||
timeline_id: TimelineId,
|
||||
shard_spec: ShardSpec,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
// NB: the shard spec has already been validated when constructed.
|
||||
let mut shards = HashMap::with_capacity(shard_spec.urls.len());
|
||||
for (shard_id, url) in shard_spec.urls {
|
||||
shards.insert(
|
||||
shard_id,
|
||||
Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?,
|
||||
Shard::new(
|
||||
url,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
)?,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -386,6 +450,8 @@ impl Shards {
|
||||
/// * Bulk client pool: unbounded.
|
||||
/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
|
||||
struct Shard {
|
||||
/// The shard ID.
|
||||
id: ShardIndex,
|
||||
/// Unary gRPC client pool.
|
||||
client_pool: Arc<ClientPool>,
|
||||
/// GetPage stream pool.
|
||||
@@ -402,6 +468,7 @@ impl Shard {
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Common channel pool for unary and stream requests. Bounded by client/stream pools.
|
||||
let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
|
||||
@@ -413,6 +480,7 @@ impl Shard {
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
Some(MAX_UNARY_CLIENTS),
|
||||
);
|
||||
|
||||
@@ -425,6 +493,7 @@ impl Shard {
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token.clone(),
|
||||
compression,
|
||||
None, // unbounded, limited by stream pool
|
||||
),
|
||||
Some(MAX_STREAMS),
|
||||
@@ -440,6 +509,7 @@ impl Shard {
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
compression,
|
||||
None, // unbounded, limited by stream pool
|
||||
),
|
||||
Some(MAX_BULK_STREAMS),
|
||||
@@ -447,6 +517,7 @@ impl Shard {
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
id: shard_id,
|
||||
client_pool,
|
||||
stream_pool,
|
||||
bulk_stream_pool,
|
||||
|
||||
@@ -3,4 +3,5 @@ mod pool;
|
||||
mod retry;
|
||||
mod split;
|
||||
|
||||
pub use client::{PageserverClient, ShardSpec, ShardStripeSize};
|
||||
pub use client::{PageserverClient, ShardSpec};
|
||||
pub use pageserver_api::shard::ShardStripeSize; // used in ShardSpec
|
||||
|
||||
@@ -40,6 +40,7 @@ use futures::StreamExt as _;
|
||||
use tokio::sync::mpsc::{Receiver, Sender};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
use tracing::{error, warn};
|
||||
|
||||
@@ -242,6 +243,8 @@ pub struct ClientPool {
|
||||
shard_id: ShardIndex,
|
||||
/// Authentication token, if any.
|
||||
auth_token: Option<String>,
|
||||
/// Compression to use.
|
||||
compression: Option<CompressionEncoding>,
|
||||
/// Channel pool to acquire channels from.
|
||||
channel_pool: Arc<ChannelPool>,
|
||||
/// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
|
||||
@@ -281,6 +284,7 @@ impl ClientPool {
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_token: Option<String>,
|
||||
compression: Option<CompressionEncoding>,
|
||||
max_clients: Option<NonZero<usize>>,
|
||||
) -> Arc<Self> {
|
||||
let pool = Arc::new(Self {
|
||||
@@ -288,6 +292,7 @@ impl ClientPool {
|
||||
timeline_id,
|
||||
shard_id,
|
||||
auth_token,
|
||||
compression,
|
||||
channel_pool,
|
||||
idle: Mutex::default(),
|
||||
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
|
||||
@@ -331,7 +336,7 @@ impl ClientPool {
|
||||
self.timeline_id,
|
||||
self.shard_id,
|
||||
self.auth_token.clone(),
|
||||
None,
|
||||
self.compression,
|
||||
)?;
|
||||
|
||||
Ok(ClientGuard {
|
||||
@@ -574,20 +579,22 @@ impl StreamPool {
|
||||
// Acquire a client from the pool and create a stream.
|
||||
let mut client = client_pool.get().await?;
|
||||
|
||||
let (req_tx, req_rx) = mpsc::channel(1);
|
||||
let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
|
||||
// NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
|
||||
// theoretically deadlock if both the client and server block on sends (since we're not
|
||||
// reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
|
||||
// low queue depths, but it was seen to happen with the libpq protocol so better safe than
|
||||
// sorry. It should never buffer more than the queue depth anyway, but using an unbounded
|
||||
// channel guarantees that it will never block.
|
||||
let (req_tx, req_rx) = mpsc::unbounded_channel();
|
||||
let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
|
||||
let mut resp_stream = client.get_pages(req_stream).await?;
|
||||
|
||||
// Track caller response channels by request ID. If the task returns early, these response
|
||||
// channels will be dropped and the waiting callers will receive an error.
|
||||
//
|
||||
// TODO: on timeouts, retries will send additional requests for the same request ID, which
|
||||
// get piled up behind the already sent request. We should, at the very least, add deadlines
|
||||
// for the request such that it's cancelled on the server side. This also means that only
|
||||
// the last caller gets the response, and it may get a response for an earlier attempt. This
|
||||
// needs rethinking.
|
||||
//
|
||||
// TODO: consider allocating separate request IDs for each retry.
|
||||
// NB: this will leak entries if the server doesn't respond to a request (by request ID).
|
||||
// It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and
|
||||
// block further use. But we could consider reaping closed channels after some time.
|
||||
let mut callers = HashMap::new();
|
||||
|
||||
// Process requests and responses.
|
||||
@@ -601,10 +608,17 @@ impl StreamPool {
|
||||
};
|
||||
|
||||
// Store the response channel by request ID.
|
||||
if callers.contains_key(&req.request_id) {
|
||||
// Error on request ID duplicates. Ignore callers that went away.
|
||||
_ = resp_tx.send(Err(tonic::Status::invalid_argument(
|
||||
format!("duplicate request ID: {}", req.request_id),
|
||||
)));
|
||||
continue;
|
||||
}
|
||||
callers.insert(req.request_id, resp_tx);
|
||||
|
||||
// Send the request on the stream. Bail out if the send fails.
|
||||
req_tx.send(req).await.map_err(|_| {
|
||||
// Send the request on the stream. Bail out if the stream is closed.
|
||||
req_tx.send(req).map_err(|_| {
|
||||
tonic::Status::unavailable("stream closed")
|
||||
})?;
|
||||
}
|
||||
@@ -616,10 +630,9 @@ impl StreamPool {
|
||||
return Ok(())
|
||||
};
|
||||
|
||||
// Send the response to the caller. Ignore errors if the caller went away. This
|
||||
// may have happened with e.g. a timeout retry, where multiple requests may have
|
||||
// been sent for the same ID.
|
||||
// Send the response to the caller. Ignore errors if the caller went away.
|
||||
let Some(resp_tx) = callers.remove(&resp.request_id) else {
|
||||
warn!("received response for unknown request ID: {}", resp.request_id);
|
||||
continue;
|
||||
};
|
||||
_ = resp_tx.send(Ok(resp));
|
||||
@@ -686,6 +699,15 @@ impl Drop for StreamGuard {
|
||||
|
||||
// Release the queue depth reservation on drop. This can prematurely decrement it if dropped
|
||||
// before the response is received, but that's okay.
|
||||
//
|
||||
// TODO: actually, it's probably not okay. Queue depth release should be moved into the
|
||||
// stream task, such that it continues to account for the queue depth slot until the server
|
||||
// responds. Otherwise, if a slow request times out and keeps blocking the stream, the
|
||||
// server will keep waiting on it and we can pile on subsequent requests (including the
|
||||
// timeout retry) in the same stream and get blocked. But we may also want to avoid blocking
|
||||
// requests on e.g. LSN waits and layer downloads, instead returning early to free up the
|
||||
// stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line
|
||||
// blocking. TBD.
|
||||
let mut streams = pool.streams.lock().unwrap();
|
||||
let entry = streams.get_mut(&self.id).expect("unknown stream");
|
||||
assert!(entry.idle_since.is_none(), "active stream marked idle");
|
||||
|
||||
@@ -8,7 +8,6 @@ use utils::backoff::exponential_backoff_duration;
|
||||
/// A retry handler for Pageserver gRPC requests.
|
||||
///
|
||||
/// This is used instead of backoff::retry for better control and observability.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct Retry;
|
||||
|
||||
impl Retry {
|
||||
@@ -24,14 +23,14 @@ impl Retry {
|
||||
/// If true, log successful requests. For debugging.
|
||||
const LOG_SUCCESS: bool = false;
|
||||
|
||||
/// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
|
||||
/// using the current tracing span for context.
|
||||
/// Runs the given async closure with timeouts and retries (exponential backoff), passing the
|
||||
/// attempt number starting at 0. Logs errors, using the current tracing span for context.
|
||||
///
|
||||
/// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
|
||||
/// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
|
||||
pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
|
||||
where
|
||||
F: FnMut() -> O,
|
||||
F: FnMut(usize) -> O, // takes attempt number, starting at 0
|
||||
O: Future<Output = tonic::Result<T>>,
|
||||
{
|
||||
let started = Instant::now();
|
||||
@@ -48,7 +47,7 @@ impl Retry {
|
||||
}
|
||||
|
||||
let request_started = Instant::now();
|
||||
tokio::time::timeout(Self::REQUEST_TIMEOUT, f())
|
||||
tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries))
|
||||
.await
|
||||
.map_err(|_| {
|
||||
tonic::Status::deadline_exceeded(format!(
|
||||
@@ -132,7 +131,6 @@ impl Retry {
|
||||
tonic::Code::Aborted => true,
|
||||
tonic::Code::Cancelled => true,
|
||||
tonic::Code::DeadlineExceeded => true, // maybe transient slowness
|
||||
tonic::Code::Internal => true, // maybe transient failure?
|
||||
tonic::Code::ResourceExhausted => true,
|
||||
tonic::Code::Unavailable => true,
|
||||
|
||||
@@ -140,6 +138,10 @@ impl Retry {
|
||||
tonic::Code::AlreadyExists => false,
|
||||
tonic::Code::DataLoss => false,
|
||||
tonic::Code::FailedPrecondition => false,
|
||||
// NB: don't retry Internal. It is intended for serious errors such as invariant
|
||||
// violations, and is also used for client-side invariant checks that would otherwise
|
||||
// result in retry loops.
|
||||
tonic::Code::Internal => false,
|
||||
tonic::Code::InvalidArgument => false,
|
||||
tonic::Code::NotFound => false,
|
||||
tonic::Code::OutOfRange => false,
|
||||
|
||||
@@ -5,27 +5,24 @@ use bytes::Bytes;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::shard::{ShardCount, ShardIndex};
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
|
||||
/// TODO: add tests for this.
|
||||
pub struct GetPageSplitter {
|
||||
/// The original request ID. Used for all shard requests.
|
||||
request_id: page_api::RequestID,
|
||||
/// Split requests by shard index.
|
||||
requests: HashMap<ShardIndex, page_api::GetPageRequest>,
|
||||
/// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble
|
||||
/// the response pages in the same order as the original request.
|
||||
/// The response being assembled. Preallocated with empty pages, to be filled in.
|
||||
response: page_api::GetPageResponse,
|
||||
/// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
|
||||
/// to assemble the response pages in the same order as the original request.
|
||||
block_shards: Vec<ShardIndex>,
|
||||
/// Page responses by shard index. Will be assembled into a single response.
|
||||
responses: HashMap<ShardIndex, Vec<Bytes>>,
|
||||
}
|
||||
|
||||
impl GetPageSplitter {
|
||||
/// Checks if the given request only touches a single shard, and returns the shard ID. This is
|
||||
/// the common case, so we check first in order to avoid unnecessary allocations and overhead.
|
||||
/// The caller must ensure that the request has at least one block number, or this will panic.
|
||||
pub fn is_single_shard(
|
||||
pub fn for_single_shard(
|
||||
req: &page_api::GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
@@ -35,8 +32,12 @@ impl GetPageSplitter {
|
||||
return Some(ShardIndex::unsharded());
|
||||
}
|
||||
|
||||
// Find the base shard index for the first page, and compare with the rest.
|
||||
let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages"));
|
||||
// Find the first page's shard, for comparison. If there are no pages, just return the first
|
||||
// shard (caller likely checked already, otherwise the server will reject it).
|
||||
let Some(&first_page) = req.block_numbers.first() else {
|
||||
return Some(ShardIndex::new(ShardNumber(0), count));
|
||||
};
|
||||
let key = rel_block_to_key(req.rel, first_page);
|
||||
let shard_number = key_to_shard_number(count, stripe_size, &key);
|
||||
|
||||
req.block_numbers
|
||||
@@ -57,19 +58,19 @@ impl GetPageSplitter {
|
||||
) -> Self {
|
||||
// The caller should make sure we don't split requests unnecessarily.
|
||||
debug_assert!(
|
||||
Self::is_single_shard(&req, count, stripe_size).is_none(),
|
||||
Self::for_single_shard(&req, count, stripe_size).is_none(),
|
||||
"unnecessary request split"
|
||||
);
|
||||
|
||||
// Split the requests by shard index.
|
||||
let mut requests = HashMap::with_capacity(2); // common case
|
||||
let mut block_shards = Vec::with_capacity(req.block_numbers.len());
|
||||
for blkno in req.block_numbers {
|
||||
for &blkno in &req.block_numbers {
|
||||
let key = rel_block_to_key(req.rel, blkno);
|
||||
let shard_number = key_to_shard_number(count, stripe_size, &key);
|
||||
let shard_id = ShardIndex::new(shard_number, count);
|
||||
|
||||
let shard_req = requests
|
||||
requests
|
||||
.entry(shard_id)
|
||||
.or_insert_with(|| page_api::GetPageRequest {
|
||||
request_id: req.request_id,
|
||||
@@ -77,20 +78,39 @@ impl GetPageSplitter {
|
||||
rel: req.rel,
|
||||
read_lsn: req.read_lsn,
|
||||
block_numbers: Vec::new(),
|
||||
});
|
||||
shard_req.block_numbers.push(blkno);
|
||||
})
|
||||
.block_numbers
|
||||
.push(blkno);
|
||||
block_shards.push(shard_id);
|
||||
}
|
||||
|
||||
Self {
|
||||
// Construct a response to be populated by shard responses. Preallocate empty page slots
|
||||
// with the expected block numbers.
|
||||
let response = page_api::GetPageResponse {
|
||||
request_id: req.request_id,
|
||||
responses: HashMap::with_capacity(requests.len()),
|
||||
status_code: page_api::GetPageStatusCode::Ok,
|
||||
reason: None,
|
||||
rel: req.rel,
|
||||
pages: req
|
||||
.block_numbers
|
||||
.into_iter()
|
||||
.map(|block_number| {
|
||||
page_api::Page {
|
||||
block_number,
|
||||
image: Bytes::new(), // empty page slot to be filled in
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
Self {
|
||||
requests,
|
||||
response,
|
||||
block_shards,
|
||||
}
|
||||
}
|
||||
|
||||
/// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations.
|
||||
/// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
|
||||
pub fn drain_requests(
|
||||
&mut self,
|
||||
) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
|
||||
@@ -99,70 +119,91 @@ impl GetPageSplitter {
|
||||
|
||||
/// Adds a response from the given shard. The response must match the request ID and have an OK
|
||||
/// status code. A response must not already exist for the given shard ID.
|
||||
pub fn add_response(&mut self, shard_id: ShardIndex, response: page_api::GetPageResponse) {
|
||||
// NB: this is called below a `Retry::with()`, so unrecoverable errors should not use a
|
||||
// retryable status code (e.g. `Internal`).
|
||||
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn add_response(
|
||||
&mut self,
|
||||
shard_id: ShardIndex,
|
||||
response: page_api::GetPageResponse,
|
||||
) -> tonic::Result<()> {
|
||||
// The caller should already have converted status codes into tonic::Status.
|
||||
assert_eq!(
|
||||
response.status_code,
|
||||
page_api::GetPageStatusCode::Ok,
|
||||
"non-OK response"
|
||||
);
|
||||
if response.status_code != page_api::GetPageStatusCode::Ok {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"unexpected non-OK response for shard {shard_id}: {} {}",
|
||||
response.status_code,
|
||||
response.reason.unwrap_or_default()
|
||||
)));
|
||||
}
|
||||
|
||||
// The stream pool ensures the response matches the request ID.
|
||||
assert_eq!(response.request_id, self.request_id, "response ID mismatch");
|
||||
if response.request_id != self.response.request_id {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"response ID mismatch for shard {shard_id}: expected {}, got {}",
|
||||
self.response.request_id, response.request_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Add the response data to the map.
|
||||
let old = self.responses.insert(shard_id, response.page_images);
|
||||
// Place the shard response pages into the assembled response, in request order.
|
||||
let mut pages = response.pages.into_iter();
|
||||
|
||||
// We only dispatch one request per shard.
|
||||
assert!(old.is_none(), "duplicate response for shard {shard_id}");
|
||||
for (i, &s) in self.block_shards.iter().enumerate() {
|
||||
if shard_id != s {
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(slot) = self.response.pages.get_mut(i) else {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"no block_shards slot {i} for shard {shard_id}"
|
||||
)));
|
||||
};
|
||||
let Some(page) = pages.next() else {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"missing page {} in shard {shard_id} response",
|
||||
slot.block_number
|
||||
)));
|
||||
};
|
||||
if page.block_number != slot.block_number {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {shard_id} returned wrong page at index {i}, expected {} got {}",
|
||||
slot.block_number, page.block_number
|
||||
)));
|
||||
}
|
||||
if !slot.image.is_empty() {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {shard_id} returned duplicate page {} at index {i}",
|
||||
slot.block_number
|
||||
)));
|
||||
}
|
||||
|
||||
*slot = page;
|
||||
}
|
||||
|
||||
// Make sure we've consumed all pages from the shard response.
|
||||
if let Some(extra_page) = pages.next() {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"shard {shard_id} returned extra page: {}",
|
||||
extra_page.block_number
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Assembles the shard responses into a single response. Responses must be present for all
|
||||
/// relevant shards, and the total number of pages must match the original request.
|
||||
/// Fetches the final, assembled response.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn assemble_response(self) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// NB: this is called below a `Retry::with()`, so unrecoverable errors should not use a
|
||||
// retryable status code (e.g. `Internal`).
|
||||
|
||||
let mut response = page_api::GetPageResponse {
|
||||
request_id: self.request_id,
|
||||
status_code: page_api::GetPageStatusCode::Ok,
|
||||
reason: None,
|
||||
page_images: Vec::with_capacity(self.block_shards.len()),
|
||||
};
|
||||
|
||||
// Set up per-shard page iterators we can pull from.
|
||||
let mut shard_responses = HashMap::with_capacity(self.responses.len());
|
||||
for (shard_id, responses) in self.responses {
|
||||
shard_responses.insert(shard_id, responses.into_iter());
|
||||
}
|
||||
|
||||
// Reassemble the responses in the same order as the original request.
|
||||
for shard_id in &self.block_shards {
|
||||
let page = shard_responses
|
||||
.get_mut(shard_id)
|
||||
.ok_or_else(|| {
|
||||
tonic::Status::data_loss(format!("missing response for shard {shard_id}"))
|
||||
})?
|
||||
.next()
|
||||
.ok_or_else(|| {
|
||||
tonic::Status::data_loss(format!("missing page from shard {shard_id}"))
|
||||
})?;
|
||||
response.page_images.push(page);
|
||||
}
|
||||
|
||||
// Make sure there are no additional pages.
|
||||
for (shard_id, mut pages) in shard_responses {
|
||||
if pages.next().is_some() {
|
||||
return Err(tonic::Status::out_of_range(format!(
|
||||
"extra pages returned from shard {shard_id}"
|
||||
pub fn get_response(self) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// Check that the response is complete.
|
||||
for (i, page) in self.response.pages.iter().enumerate() {
|
||||
if page.image.is_empty() {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"missing page {} for shard {}",
|
||||
page.block_number,
|
||||
self.block_shards
|
||||
.get(i)
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| "?".to_string())
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(response)
|
||||
Ok(self.response)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
remote_storage = { path = "../../libs/remote_storage" }
|
||||
postgres_ffi.workspace = true
|
||||
serde.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
85
pageserver/ctl/src/download_remote_object.rs
Normal file
85
pageserver/ctl/src/download_remote_object.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
/// Download a specific object from remote storage to a local file.
|
||||
///
|
||||
/// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment
|
||||
/// variable, in the same TOML format that the pageserver itself understands. This allows the
|
||||
/// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3,
|
||||
/// Azure Blob Storage and local files), as long as the credentials are available via the
|
||||
/// standard environment variables expected by the underlying SDKs.
|
||||
///
|
||||
/// Examples for setting the environment variable:
|
||||
///
|
||||
/// ```bash
|
||||
/// # AWS S3 (region can also be provided via AWS_REGION)
|
||||
/// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }'
|
||||
///
|
||||
/// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY)
|
||||
/// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }'
|
||||
/// ```
|
||||
#[derive(Parser)]
|
||||
pub(crate) struct DownloadRemoteObjectCmd {
|
||||
/// Key / path of the object to download (relative to the remote storage prefix).
|
||||
///
|
||||
/// Examples:
|
||||
/// "wal/3aa8f.../00000001000000000000000A"
|
||||
/// "pageserver/v1/tenants/<tenant_id>/timelines/<timeline_id>/layer_12345"
|
||||
pub remote_path: String,
|
||||
|
||||
/// Path of the local file to create. Existing file will be overwritten.
|
||||
///
|
||||
/// Examples:
|
||||
/// "./segment"
|
||||
/// "/tmp/layer_12345.parquet"
|
||||
pub output_file: Utf8PathBuf,
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> {
|
||||
use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig};
|
||||
|
||||
// Fetch remote storage configuration from the environment
|
||||
let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| {
|
||||
anyhow::anyhow!(
|
||||
"'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config"
|
||||
)
|
||||
})?;
|
||||
|
||||
let config = RemoteStorageConfig::from_toml_str(&config_str)?;
|
||||
|
||||
// Initialise remote storage client
|
||||
let storage = GenericRemoteStorage::from_config(&config).await?;
|
||||
|
||||
// RemotePath must be relative – leading slashes confuse the parser.
|
||||
let remote_path_str = cmd.remote_path.trim_start_matches('/');
|
||||
let remote_path = RemotePath::from_string(remote_path_str)?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
println!(
|
||||
"Downloading '{remote_path}' from remote storage bucket {:?} ...",
|
||||
config.storage.bucket_name()
|
||||
);
|
||||
|
||||
// Start the actual download
|
||||
let download = storage
|
||||
.download(&remote_path, &DownloadOpts::default(), &cancel)
|
||||
.await?;
|
||||
|
||||
// Stream to file
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let tmp_path = cmd.output_file.with_extension("tmp");
|
||||
let mut file = tokio::fs::File::create(&tmp_path).await?;
|
||||
tokio::io::copy(&mut reader, &mut file).await?;
|
||||
file.sync_all().await?;
|
||||
// Atomically move into place
|
||||
tokio::fs::rename(&tmp_path, &cmd.output_file).await?;
|
||||
|
||||
println!(
|
||||
"Downloaded to '{}'. Last modified: {:?}, etag: {}",
|
||||
cmd.output_file, download.last_modified, download.etag
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,10 +1,180 @@
|
||||
use anyhow::Context;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::{Context, Ok};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver::tenant::{
|
||||
IndexPart,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
remote_timeline_client::{index::LayerFileMetadata, remote_layer_path},
|
||||
storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak},
|
||||
};
|
||||
use pageserver_api::key::Key;
|
||||
use serde::Serialize;
|
||||
use std::collections::BTreeMap;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::TenantShardId,
|
||||
};
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
pub(crate) enum IndexPartCmd {
|
||||
Dump { path: Utf8PathBuf },
|
||||
Dump {
|
||||
path: Utf8PathBuf,
|
||||
},
|
||||
/// Find all layers that need to be searched to construct the given page at the given LSN.
|
||||
Search {
|
||||
#[arg(long)]
|
||||
tenant_id: String,
|
||||
#[arg(long)]
|
||||
timeline_id: String,
|
||||
#[arg(long)]
|
||||
path: Utf8PathBuf,
|
||||
#[arg(long)]
|
||||
key: String,
|
||||
#[arg(long)]
|
||||
lsn: String,
|
||||
},
|
||||
/// List all visible delta and image layers at the latest LSN.
|
||||
ListVisibleLayers {
|
||||
#[arg(long)]
|
||||
path: Utf8PathBuf,
|
||||
},
|
||||
}
|
||||
|
||||
fn create_layer_map_from_index_part(
|
||||
index_part: &IndexPart,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
{
|
||||
let mut updates = layer_map.batch_update();
|
||||
for (key, value) in index_part.layer_metadata.iter() {
|
||||
updates.insert_historic(PersistentLayerDesc::from_filename(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
key.clone(),
|
||||
value.file_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
layer_map
|
||||
}
|
||||
|
||||
async fn search_layers(
|
||||
tenant_id: &str,
|
||||
timeline_id: &str,
|
||||
path: &Utf8PathBuf,
|
||||
key: &str,
|
||||
lsn: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::from_str(tenant_id).unwrap();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let timeline_id = TimelineId::from_str(timeline_id).unwrap();
|
||||
let index_json = {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
IndexPart::from_json_bytes(&bytes).unwrap()
|
||||
};
|
||||
let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id);
|
||||
let key = Key::from_hex(key)?;
|
||||
|
||||
let lsn = Lsn::from_str(lsn).unwrap();
|
||||
let mut end_lsn = lsn;
|
||||
loop {
|
||||
let result = layer_map.search(key, end_lsn);
|
||||
match result {
|
||||
Some(SearchResult { layer, lsn_floor }) => {
|
||||
let disk_layer = match layer {
|
||||
ReadableLayerWeak::PersistentLayer(layer) => layer,
|
||||
ReadableLayerWeak::InMemoryLayer(_) => {
|
||||
anyhow::bail!("unexpected in-memory layer")
|
||||
}
|
||||
};
|
||||
|
||||
let metadata = index_json
|
||||
.layer_metadata
|
||||
.get(&disk_layer.layer_name())
|
||||
.unwrap();
|
||||
println!(
|
||||
"{}",
|
||||
remote_layer_path(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
metadata.shard,
|
||||
&disk_layer.layer_name(),
|
||||
metadata.generation
|
||||
)
|
||||
);
|
||||
end_lsn = lsn_floor;
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
struct VisibleLayers {
|
||||
pub total_images: u64,
|
||||
pub total_image_bytes: u64,
|
||||
pub total_deltas: u64,
|
||||
pub total_delta_bytes: u64,
|
||||
pub layer_metadata: BTreeMap<LayerName, LayerFileMetadata>,
|
||||
}
|
||||
|
||||
impl VisibleLayers {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
layer_metadata: BTreeMap::new(),
|
||||
total_images: 0,
|
||||
total_image_bytes: 0,
|
||||
total_deltas: 0,
|
||||
total_delta_bytes: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) {
|
||||
match name {
|
||||
LayerName::Image(_) => {
|
||||
self.total_images += 1;
|
||||
self.total_image_bytes += layer.file_size;
|
||||
}
|
||||
LayerName::Delta(_) => {
|
||||
self.total_deltas += 1;
|
||||
self.total_delta_bytes += layer.file_size;
|
||||
}
|
||||
}
|
||||
self.layer_metadata.insert(name, layer);
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_visible_layers(path: &Utf8PathBuf) -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let bytes = tokio::fs::read(path).await.context("read file")?;
|
||||
let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
|
||||
let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id);
|
||||
let mut visible_layers = VisibleLayers::new();
|
||||
let (layers, _key_space) = layer_map.get_visibility(Vec::new());
|
||||
for (layer, visibility) in layers {
|
||||
if visibility == LayerVisibilityHint::Visible {
|
||||
visible_layers.add_layer(
|
||||
layer.layer_name(),
|
||||
index_part
|
||||
.layer_metadata
|
||||
.get(&layer.layer_name())
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
}
|
||||
let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?;
|
||||
println!("{output}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
@@ -16,5 +186,13 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
||||
println!("{output}");
|
||||
Ok(())
|
||||
}
|
||||
IndexPartCmd::Search {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
path,
|
||||
key,
|
||||
lsn,
|
||||
} => search_layers(tenant_id, timeline_id, path, key, lsn).await,
|
||||
IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//!
|
||||
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
|
||||
|
||||
mod download_remote_object;
|
||||
mod draw_timeline_dir;
|
||||
mod index_part;
|
||||
mod key;
|
||||
@@ -16,6 +17,7 @@ use std::time::{Duration, SystemTime};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::{Parser, Subcommand};
|
||||
use download_remote_object::DownloadRemoteObjectCmd;
|
||||
use index_part::IndexPartCmd;
|
||||
use layers::LayerCmd;
|
||||
use page_trace::PageTraceCmd;
|
||||
@@ -63,6 +65,7 @@ enum Commands {
|
||||
/// Debug print a hex key found from logs
|
||||
Key(key::DescribeKeyCommand),
|
||||
PageTrace(PageTraceCmd),
|
||||
DownloadRemoteObject(DownloadRemoteObjectCmd),
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
@@ -185,6 +188,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
|
||||
Commands::DownloadRemoteObject(cmd) => {
|
||||
download_remote_object::main(&cmd).await?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -153,7 +153,7 @@ message GetDbSizeResponse {
|
||||
message GetPageRequest {
|
||||
// A request ID. Will be included in the response. Should be unique for
|
||||
// in-flight requests on the stream.
|
||||
uint64 request_id = 1;
|
||||
RequestID request_id = 1;
|
||||
// The request class.
|
||||
GetPageClass request_class = 2;
|
||||
// The LSN to read at.
|
||||
@@ -177,6 +177,14 @@ message GetPageRequest {
|
||||
repeated uint32 block_number = 5;
|
||||
}
|
||||
|
||||
// A Request ID. Should be unique for in-flight requests on a stream. Included in the response.
|
||||
message RequestID {
|
||||
// The base request ID.
|
||||
uint64 id = 1;
|
||||
// The request attempt. Starts at 0, incremented on each retry.
|
||||
uint32 attempt = 2;
|
||||
}
|
||||
|
||||
// A GetPageRequest class. Primarily intended for observability, but may also be
|
||||
// used for prioritization in the future.
|
||||
enum GetPageClass {
|
||||
@@ -199,13 +207,26 @@ enum GetPageClass {
|
||||
// the entire batch is ready, so no one can make use of the individual pages.
|
||||
message GetPageResponse {
|
||||
// The original request's ID.
|
||||
uint64 request_id = 1;
|
||||
// The response status code.
|
||||
RequestID request_id = 1;
|
||||
// The response status code. If not OK, the rel and page fields will be empty.
|
||||
GetPageStatusCode status_code = 2;
|
||||
// A string describing the status, if any.
|
||||
string reason = 3;
|
||||
// The 8KB page images, in the same order as the request. Empty if status_code != OK.
|
||||
repeated bytes page_image = 4;
|
||||
// The relation that the pages belong to.
|
||||
RelTag rel = 4;
|
||||
// The page(s), in the same order as the request.
|
||||
repeated Page page = 5;
|
||||
}
|
||||
|
||||
// A page.
|
||||
//
|
||||
// TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block
|
||||
// numbers and images, but given the 8KB page size it's probably negligible. Benchmark it anyway.
|
||||
message Page {
|
||||
// The page number.
|
||||
uint32 block_number = 1;
|
||||
// The materialized page image, as an 8KB byte vector.
|
||||
bytes image = 2;
|
||||
}
|
||||
|
||||
// A GetPageResponse status code.
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use anyhow::anyhow;
|
||||
use anyhow::Context as _;
|
||||
use futures::future::ready;
|
||||
use futures::{Stream, StreamExt as _, TryStreamExt as _};
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio_util::io::StreamReader;
|
||||
@@ -34,9 +35,7 @@ impl Client {
|
||||
E: TryInto<Endpoint> + Send + Sync + 'static,
|
||||
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
|
||||
{
|
||||
let endpoint: Endpoint = endpoint
|
||||
.try_into()
|
||||
.map_err(|err| anyhow!("invalid endpoint: {err}"))?;
|
||||
let endpoint: Endpoint = endpoint.try_into().context("invalid endpoint")?;
|
||||
let channel = endpoint.connect().await?;
|
||||
Self::new(
|
||||
channel,
|
||||
@@ -112,7 +111,7 @@ impl Client {
|
||||
) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
|
||||
let reqs = reqs.map(proto::GetPageRequest::from);
|
||||
let resps = self.inner.get_pages(reqs).await?.into_inner();
|
||||
Ok(resps.map_ok(GetPageResponse::from))
|
||||
Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into()))))
|
||||
}
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
|
||||
@@ -358,7 +358,10 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
|
||||
return Err(ProtocolError::Missing("block_number"));
|
||||
}
|
||||
Ok(Self {
|
||||
request_id: pb.request_id,
|
||||
request_id: pb
|
||||
.request_id
|
||||
.ok_or(ProtocolError::Missing("request_id"))?
|
||||
.into(),
|
||||
request_class: pb.request_class.into(),
|
||||
read_lsn: pb
|
||||
.read_lsn
|
||||
@@ -373,7 +376,7 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
|
||||
impl From<GetPageRequest> for proto::GetPageRequest {
|
||||
fn from(request: GetPageRequest) -> Self {
|
||||
Self {
|
||||
request_id: request.request_id,
|
||||
request_id: Some(request.request_id.into()),
|
||||
request_class: request.request_class.into(),
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
rel: Some(request.rel.into()),
|
||||
@@ -382,8 +385,51 @@ impl From<GetPageRequest> for proto::GetPageRequest {
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPage request ID.
|
||||
pub type RequestID = u64;
|
||||
/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream.
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
pub struct RequestID {
|
||||
/// The base request ID.
|
||||
pub id: u64,
|
||||
// The request attempt. Starts at 0, incremented on each retry.
|
||||
pub attempt: u32,
|
||||
}
|
||||
|
||||
impl RequestID {
|
||||
/// Creates a new RequestID with the given ID and an initial attempt of 0.
|
||||
pub fn new(id: u64) -> Self {
|
||||
Self { id, attempt: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for RequestID {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}.{}", self.id, self.attempt)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<proto::RequestId> for RequestID {
|
||||
fn from(pb: proto::RequestId) -> Self {
|
||||
Self {
|
||||
id: pb.id,
|
||||
attempt: pb.attempt,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for RequestID {
|
||||
fn from(id: u64) -> Self {
|
||||
Self::new(id)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RequestID> for proto::RequestId {
|
||||
fn from(request_id: RequestID) -> Self {
|
||||
Self {
|
||||
id: request_id.id,
|
||||
attempt: request_id.attempt,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPage request class.
|
||||
#[derive(Clone, Copy, Debug, strum_macros::Display)]
|
||||
@@ -458,32 +504,41 @@ impl From<GetPageClass> for i32 {
|
||||
pub struct GetPageResponse {
|
||||
/// The original request's ID.
|
||||
pub request_id: RequestID,
|
||||
/// The response status code.
|
||||
/// The response status code. If not OK, the `rel` and `pages` fields will be empty.
|
||||
pub status_code: GetPageStatusCode,
|
||||
/// A string describing the status, if any.
|
||||
pub reason: Option<String>,
|
||||
/// The 8KB page images, in the same order as the request. Empty if status != OK.
|
||||
pub page_images: Vec<Bytes>,
|
||||
/// The relation that the pages belong to.
|
||||
pub rel: RelTag,
|
||||
// The page(s), in the same order as the request.
|
||||
pub pages: Vec<Page>,
|
||||
}
|
||||
|
||||
impl From<proto::GetPageResponse> for GetPageResponse {
|
||||
fn from(pb: proto::GetPageResponse) -> Self {
|
||||
Self {
|
||||
request_id: pb.request_id,
|
||||
impl TryFrom<proto::GetPageResponse> for GetPageResponse {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::GetPageResponse) -> Result<Self, ProtocolError> {
|
||||
Ok(Self {
|
||||
request_id: pb
|
||||
.request_id
|
||||
.ok_or(ProtocolError::Missing("request_id"))?
|
||||
.into(),
|
||||
status_code: pb.status_code.into(),
|
||||
reason: Some(pb.reason).filter(|r| !r.is_empty()),
|
||||
page_images: pb.page_image,
|
||||
}
|
||||
rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
|
||||
pages: pb.page.into_iter().map(Page::from).collect(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetPageResponse> for proto::GetPageResponse {
|
||||
fn from(response: GetPageResponse) -> Self {
|
||||
Self {
|
||||
request_id: response.request_id,
|
||||
request_id: Some(response.request_id.into()),
|
||||
status_code: response.status_code.into(),
|
||||
reason: response.reason.unwrap_or_default(),
|
||||
page_image: response.page_images,
|
||||
rel: Some(response.rel.into()),
|
||||
page: response.pages.into_iter().map(proto::Page::from).collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -516,11 +571,39 @@ impl GetPageResponse {
|
||||
request_id,
|
||||
status_code,
|
||||
reason: Some(status.message().to_string()),
|
||||
page_images: Vec::new(),
|
||||
rel: RelTag::default(),
|
||||
pages: Vec::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// A page.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Page {
|
||||
/// The page number.
|
||||
pub block_number: u32,
|
||||
/// The materialized page image, as an 8KB byte vector.
|
||||
pub image: Bytes,
|
||||
}
|
||||
|
||||
impl From<proto::Page> for Page {
|
||||
fn from(pb: proto::Page) -> Self {
|
||||
Self {
|
||||
block_number: pb.block_number,
|
||||
image: pb.image,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Page> for proto::Page {
|
||||
fn from(page: Page) -> Self {
|
||||
Self {
|
||||
block_number: page.block_number,
|
||||
image: page.image,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPage response status code.
|
||||
///
|
||||
/// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
|
||||
|
||||
@@ -30,9 +30,9 @@ metrics.workspace = true
|
||||
tonic.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
pageserver_client_grpc.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
utils = { path = "../../libs/utils/" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -10,12 +10,14 @@ use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{Stream, StreamExt as _};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client_grpc::{self as client_grpc, ShardSpec};
|
||||
use pageserver_page_api as page_api;
|
||||
use rand::prelude::*;
|
||||
use tokio::task::JoinSet;
|
||||
@@ -41,6 +43,10 @@ pub(crate) struct Args {
|
||||
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
/// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic
|
||||
/// no-frills `page_api::Client`. Only valid with grpc:// connstrings.
|
||||
#[clap(long)]
|
||||
rich_client: bool,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
@@ -360,6 +366,7 @@ async fn main_impl(
|
||||
let client: Box<dyn Client> = match scheme.as_str() {
|
||||
"postgresql" | "postgres" => {
|
||||
assert!(!args.compression, "libpq does not support compression");
|
||||
assert!(!args.rich_client, "rich client requires grpc://");
|
||||
Box::new(
|
||||
LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
|
||||
.await
|
||||
@@ -367,6 +374,16 @@ async fn main_impl(
|
||||
)
|
||||
}
|
||||
|
||||
"grpc" if args.rich_client => Box::new(
|
||||
RichGrpcClient::new(
|
||||
&args.page_service_connstring,
|
||||
worker_id.timeline,
|
||||
args.compression,
|
||||
)
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
|
||||
"grpc" => Box::new(
|
||||
GrpcClient::new(
|
||||
&args.page_service_connstring,
|
||||
@@ -685,7 +702,7 @@ impl Client for GrpcClient {
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
let req = page_api::GetPageRequest {
|
||||
request_id: req_id,
|
||||
request_id: req_id.into(),
|
||||
request_class: page_api::GetPageClass::Normal,
|
||||
read_lsn: page_api::ReadLsn {
|
||||
request_lsn: req_lsn,
|
||||
@@ -705,6 +722,79 @@ impl Client for GrpcClient {
|
||||
"unexpected status code: {}",
|
||||
resp.status_code,
|
||||
);
|
||||
Ok((resp.request_id, resp.page_images))
|
||||
Ok((
|
||||
resp.request_id.id,
|
||||
resp.pages.into_iter().map(|p| p.image).collect(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// A rich gRPC Pageserver client.
|
||||
struct RichGrpcClient {
|
||||
inner: Arc<client_grpc::PageserverClient>,
|
||||
requests: FuturesUnordered<
|
||||
Pin<Box<dyn Future<Output = anyhow::Result<page_api::GetPageResponse>> + Send>>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl RichGrpcClient {
|
||||
async fn new(
|
||||
connstring: &str,
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let inner = Arc::new(client_grpc::PageserverClient::new(
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
ShardSpec::new(
|
||||
[(ShardIndex::unsharded(), connstring.to_string())].into(),
|
||||
None,
|
||||
)?,
|
||||
None,
|
||||
compression.then_some(tonic::codec::CompressionEncoding::Zstd),
|
||||
)?);
|
||||
Ok(Self {
|
||||
inner,
|
||||
requests: FuturesUnordered::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Client for RichGrpcClient {
|
||||
async fn send_get_page(
|
||||
&mut self,
|
||||
req_id: u64,
|
||||
req_lsn: Lsn,
|
||||
mod_lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
let req = page_api::GetPageRequest {
|
||||
request_id: req_id.into(),
|
||||
request_class: page_api::GetPageClass::Normal,
|
||||
read_lsn: page_api::ReadLsn {
|
||||
request_lsn: req_lsn,
|
||||
not_modified_since_lsn: Some(mod_lsn),
|
||||
},
|
||||
rel,
|
||||
block_numbers: blks,
|
||||
};
|
||||
let inner = self.inner.clone();
|
||||
self.requests.push(Box::pin(async move {
|
||||
inner
|
||||
.get_page(req)
|
||||
.await
|
||||
.map_err(|err| anyhow::anyhow!("{err}"))
|
||||
}));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
|
||||
let resp = self.requests.next().await.unwrap()?;
|
||||
Ok((
|
||||
resp.request_id.id,
|
||||
resp.pages.into_iter().map(|p| p.image).collect(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,8 +29,8 @@ use pageserver::task_mgr::{
|
||||
};
|
||||
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
|
||||
page_cache, page_service, task_mgr, virtual_file,
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener,
|
||||
MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -41,6 +41,7 @@ use tracing_utils::OtelGuard;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::crashsafe::syncfs;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
|
||||
use utils::sentry_init::init_sentry;
|
||||
use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};
|
||||
|
||||
@@ -763,6 +764,41 @@ fn start_pageserver(
|
||||
(http_task, https_task)
|
||||
};
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let metrics_collection_task = {
|
||||
let cancel = shutdown_pageserver.child_token();
|
||||
let task = crate::BACKGROUND_RUNTIME.spawn({
|
||||
let cancel = cancel.clone();
|
||||
let background_jobs_barrier = background_jobs_barrier.clone();
|
||||
async move {
|
||||
if conf.force_metric_collection_on_scrape {
|
||||
return;
|
||||
}
|
||||
|
||||
// first wait until background jobs are cleared to launch.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return; },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL);
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("cancelled metrics collection task, exiting...");
|
||||
break;
|
||||
},
|
||||
_ = interval.tick() => {}
|
||||
}
|
||||
tokio::task::spawn_blocking(|| {
|
||||
METRICS_COLLECTOR.run_once(true);
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
MetricsCollectionTask(CancellableTask { task, cancel })
|
||||
};
|
||||
/* END_HADRON */
|
||||
|
||||
let consumption_metrics_tasks = {
|
||||
let cancel = shutdown_pageserver.child_token();
|
||||
let task = crate::BACKGROUND_RUNTIME.spawn({
|
||||
@@ -844,6 +880,7 @@ fn start_pageserver(
|
||||
https_endpoint_listener,
|
||||
page_service,
|
||||
page_service_grpc,
|
||||
metrics_collection_task,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
@@ -889,8 +926,11 @@ async fn create_remote_storage_client(
|
||||
"Simulating remote failures for first {} attempts of each op",
|
||||
conf.test_remote_failures
|
||||
);
|
||||
remote_storage =
|
||||
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
|
||||
remote_storage = GenericRemoteStorage::unreliable_wrapper(
|
||||
remote_storage,
|
||||
conf.test_remote_failures,
|
||||
conf.test_remote_failures_probability,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(remote_storage)
|
||||
|
||||
@@ -28,7 +28,6 @@ use reqwest::Url;
|
||||
use storage_broker::Uri;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::logging::{LogFormat, SecretString};
|
||||
use utils::serde_percent::Percent;
|
||||
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -146,9 +145,13 @@ pub struct PageServerConf {
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
|
||||
// The number of allowed failures in remote storage operations.
|
||||
pub test_remote_failures: u64,
|
||||
// The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
|
||||
// Use 100 for 100% failure, 0 for no failure.
|
||||
pub test_remote_failures_probability: u64,
|
||||
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
|
||||
@@ -249,6 +252,14 @@ pub struct PageServerConf {
|
||||
pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
|
||||
|
||||
pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
|
||||
|
||||
/// Defines what is a big tenant for the purpose of image layer generation.
|
||||
/// See Timeline::should_check_if_image_layers_required
|
||||
pub image_layer_generation_large_timeline_threshold: Option<u64>,
|
||||
|
||||
/// Controls whether to collect all metrics on each scrape or to return potentially stale
|
||||
/// results.
|
||||
pub force_metric_collection_on_scrape: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -393,6 +404,7 @@ impl PageServerConf {
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
test_remote_failures_probability,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api,
|
||||
@@ -428,6 +440,8 @@ impl PageServerConf {
|
||||
posthog_config,
|
||||
timeline_import_config,
|
||||
basebackup_cache_config,
|
||||
image_layer_generation_large_timeline_threshold,
|
||||
force_metric_collection_on_scrape,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -460,17 +474,9 @@ impl PageServerConf {
|
||||
metric_collection_endpoint,
|
||||
metric_collection_bucket,
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction: Some(disk_usage_based_eviction.unwrap_or(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: Default::default(),
|
||||
},
|
||||
)),
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
test_remote_failures_probability,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api: control_plane_api
|
||||
@@ -494,6 +500,8 @@ impl PageServerConf {
|
||||
dev_mode,
|
||||
timeline_import_config,
|
||||
basebackup_cache_config,
|
||||
image_layer_generation_large_timeline_threshold,
|
||||
force_metric_collection_on_scrape,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
@@ -635,7 +643,7 @@ impl PageServerConf {
|
||||
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
||||
let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
|
||||
|
||||
let config_toml = pageserver_api::config::ConfigToml {
|
||||
let mut config_toml = pageserver_api::config::ConfigToml {
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
wal_redo_timeout: Duration::from_secs(60),
|
||||
pg_distrib_dir: Some(pg_distrib_dir),
|
||||
@@ -647,6 +655,15 @@ impl PageServerConf {
|
||||
control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Test authors tend to forget about the default 10min initial lease deadline
|
||||
// when writing tests, which turns their immediate gc requests via mgmt API
|
||||
// into no-ops. Override the binary default here, such that there is no initial
|
||||
// lease deadline by default in tests. Tests that care can always override it
|
||||
// themselves.
|
||||
// Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
|
||||
config_toml.tenant_config.lsn_lease_length = Duration::from_secs(0);
|
||||
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
|
||||
}
|
||||
}
|
||||
@@ -710,8 +727,9 @@ mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::config::{DiskUsageEvictionTaskConfig, EvictionOrder};
|
||||
use rstest::rstest;
|
||||
use utils::id::NodeId;
|
||||
use utils::{id::NodeId, serde_percent::Percent};
|
||||
|
||||
use super::PageServerConf;
|
||||
|
||||
@@ -811,19 +829,69 @@ mod tests {
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_disk_usage_based_eviction_is_valid() {
|
||||
let input = r#"
|
||||
#[rstest]
|
||||
#[
|
||||
case::omit_the_whole_config(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: Default::default(),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: true,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
"#;
|
||||
"#,
|
||||
)]
|
||||
#[
|
||||
case::omit_enabled_field(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 1_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: true,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 1000000000, period = "60s" }
|
||||
"#,
|
||||
)]
|
||||
#[case::disabled(
|
||||
DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(80).unwrap(),
|
||||
min_avail_bytes: 2_000_000_000,
|
||||
period: Duration::from_secs(60),
|
||||
eviction_order: EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
enabled: false,
|
||||
},
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
disk_usage_based_eviction = { enabled = false }
|
||||
"#
|
||||
)]
|
||||
fn test_config_disk_usage_based_eviction_is_valid(
|
||||
#[case] expected_disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
|
||||
#[case] input: &str,
|
||||
) {
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("disk_usage_based_eviction is valid");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap();
|
||||
let disk_usage_based_eviction = config.disk_usage_based_eviction.unwrap();
|
||||
assert_eq!(disk_usage_based_eviction.max_usage_pct.get(), 80);
|
||||
assert_eq!(disk_usage_based_eviction.min_avail_bytes, 2_000_000_000);
|
||||
assert_eq!(disk_usage_based_eviction.period, Duration::from_secs(60));
|
||||
assert_eq!(disk_usage_based_eviction.eviction_order, Default::default());
|
||||
let disk_usage_based_eviction = config.disk_usage_based_eviction;
|
||||
assert_eq!(
|
||||
expected_disk_usage_based_eviction,
|
||||
disk_usage_based_eviction
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,7 +171,8 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
background_jobs_barrier: completion::Barrier,
|
||||
) -> Option<DiskUsageEvictionTask> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
let task_config = &conf.disk_usage_based_eviction;
|
||||
if !task_config.enabled {
|
||||
info!("disk usage based eviction task not configured");
|
||||
return None;
|
||||
};
|
||||
@@ -458,6 +459,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
match next {
|
||||
Ok(Ok(file_size)) => {
|
||||
METRICS.layers_evicted.inc();
|
||||
/*BEGIN_HADRON */
|
||||
METRICS.bytes_evicted.inc_by(file_size);
|
||||
/*END_HADRON */
|
||||
usage_assumed.add_available_bytes(file_size);
|
||||
}
|
||||
Ok(Err((
|
||||
@@ -1265,6 +1269,7 @@ mod filesystem_level_usage {
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: pageserver_api::config::EvictionOrder::default(),
|
||||
enabled: true,
|
||||
},
|
||||
total_bytes: 100_000,
|
||||
avail_bytes: 0,
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, atomic::AtomicBool},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
@@ -355,11 +359,17 @@ impl PerTenantProperties {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TenantFeatureResolver {
|
||||
inner: FeatureResolver,
|
||||
tenant_id: TenantId,
|
||||
cached_tenant_properties: Arc<ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>>,
|
||||
cached_tenant_properties: ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>,
|
||||
|
||||
// Add feature flag on the critical path below.
|
||||
//
|
||||
// If a feature flag will be used on the critical path, we will update it in the tenant housekeeping loop insetad of
|
||||
// resolving directly by calling `evaluate_multivariate` or `evaluate_boolean`. Remember to update the flag in the
|
||||
// housekeeping loop. The user should directly read this atomic flag instead of using the set of evaluate functions.
|
||||
pub feature_test_remote_size_flag: AtomicBool,
|
||||
}
|
||||
|
||||
impl TenantFeatureResolver {
|
||||
@@ -367,7 +377,8 @@ impl TenantFeatureResolver {
|
||||
Self {
|
||||
inner,
|
||||
tenant_id,
|
||||
cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
|
||||
cached_tenant_properties: ArcSwap::new(Arc::new(HashMap::new())),
|
||||
feature_test_remote_size_flag: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,12 +407,14 @@ impl TenantFeatureResolver {
|
||||
self.inner.is_feature_flag_boolean(flag_key)
|
||||
}
|
||||
|
||||
pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) {
|
||||
let mut remote_size_mb = None;
|
||||
/// Refresh the cached properties and flags on the critical path.
|
||||
pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
|
||||
let mut remote_size_mb = Some(0.0);
|
||||
for timeline in tenant_shard.list_timelines() {
|
||||
let size = timeline.metrics.resident_physical_size_get();
|
||||
if size == 0 {
|
||||
remote_size_mb = None;
|
||||
break;
|
||||
}
|
||||
if let Some(ref mut remote_size_mb) = remote_size_mb {
|
||||
*remote_size_mb += size as f64 / 1024.0 / 1024.0;
|
||||
@@ -410,5 +423,12 @@ impl TenantFeatureResolver {
|
||||
self.cached_tenant_properties.store(Arc::new(
|
||||
PerTenantProperties { remote_size_mb }.into_posthog_properties(),
|
||||
));
|
||||
|
||||
// BEGIN: Update the feature flag on the critical path.
|
||||
self.feature_test_remote_size_flag.store(
|
||||
self.evaluate_boolean("test-remote-size-flag").is_ok(),
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
);
|
||||
// END: Update the feature flag on the critical path.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,26 +116,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -618,7 +598,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
@@ -685,6 +665,17 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
|
||||
parameters:
|
||||
@@ -767,7 +758,7 @@ paths:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/:
|
||||
/v1/tenant:
|
||||
get:
|
||||
description: Get tenants list
|
||||
responses:
|
||||
@@ -847,7 +838,7 @@ paths:
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
|
||||
/v1/tenant/{tenant_id}/config/:
|
||||
/v1/tenant/{tenant_id}/config:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
//! Management HTTP API
|
||||
//!
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
@@ -61,6 +63,7 @@ use crate::context;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::feature_resolver::FeatureResolver;
|
||||
use crate::metrics::LOCAL_DATA_LOSS_SUSPECTED;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationConf;
|
||||
@@ -78,8 +81,8 @@ use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerNa
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
|
||||
use crate::tenant::timeline::{
|
||||
CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
|
||||
WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
|
||||
CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout,
|
||||
WaitLsnWaiter, import_pgdata,
|
||||
};
|
||||
use crate::tenant::{
|
||||
GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
|
||||
@@ -2499,12 +2502,10 @@ async fn timeline_checkpoint_handler(
|
||||
.compact(&cancel, flags, &ctx)
|
||||
.await
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e),
|
||||
CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
if e.is_cancel() {
|
||||
ApiError::ShuttingDown
|
||||
} else {
|
||||
ApiError::InternalServerError(e.into_anyhow())
|
||||
}
|
||||
)?;
|
||||
}
|
||||
@@ -3215,6 +3216,30 @@ async fn get_utilization(
|
||||
.map_err(ApiError::InternalServerError)
|
||||
}
|
||||
|
||||
/// HADRON
|
||||
async fn list_tenant_visible_size_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let mut map = BTreeMap::new();
|
||||
for (tenant_shard_id, slot) in state.tenant_manager.list() {
|
||||
match slot {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
let visible_size = tenant.get_visible_size();
|
||||
map.insert(tenant_shard_id, visible_size);
|
||||
}
|
||||
TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, map)
|
||||
}
|
||||
|
||||
async fn list_aux_files(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -3630,6 +3655,17 @@ async fn activate_post_import_handler(
|
||||
.await
|
||||
}
|
||||
|
||||
// [Hadron] Reset gauge metrics that are used to raised alerts. We need this API as a stop-gap measure to reset alerts
|
||||
// after we manually rectify situations such as local SSD data loss. We will eventually automate this.
|
||||
async fn hadron_reset_alert_gauges(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
LOCAL_DATA_LOSS_SUSPECTED.set(0);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Read the end of a tar archive.
|
||||
///
|
||||
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
|
||||
@@ -3682,6 +3718,23 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn force_refresh_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant
|
||||
.feature_resolver
|
||||
.refresh_properties_and_flags(&tenant);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_evaluate_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -3698,7 +3751,7 @@ async fn tenant_evaluate_feature_flag(
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
|
||||
// TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
|
||||
// and we don't need to worry about it for now.
|
||||
let properties = tenant.feature_resolver.collect_properties();
|
||||
if as_type.as_deref() == Some("boolean") {
|
||||
@@ -3911,9 +3964,14 @@ pub fn make_router(
|
||||
.expect("construct launch timestamp header middleware"),
|
||||
);
|
||||
|
||||
let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape;
|
||||
|
||||
let prometheus_metrics_handler_wrapper =
|
||||
move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
|
||||
|
||||
Ok(router
|
||||
.data(state)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper))
|
||||
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
|
||||
.get("/profile/heap", |r| request_span(r, profile_heap_handler))
|
||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||
@@ -4119,6 +4177,7 @@ pub fn make_router(
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|
||||
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
|
||||
@@ -4147,6 +4206,9 @@ pub fn make_router(
|
||||
.get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
|
||||
api_handler(r, tenant_evaluate_feature_flag)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| {
|
||||
api_handler(r, force_refresh_feature_flag)
|
||||
})
|
||||
.put("/v1/feature_flag/:flag_key", |r| {
|
||||
testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
|
||||
})
|
||||
@@ -4156,5 +4218,8 @@ pub fn make_router(
|
||||
.post("/v1/feature_flag_spec", |r| {
|
||||
api_handler(r, update_feature_flag_spec)
|
||||
})
|
||||
.post("/hadron-internal/reset_alert_gauges", |r| {
|
||||
api_handler(r, hadron_reset_alert_gauges)
|
||||
})
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct HttpsEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
// HADRON
|
||||
pub struct MetricsCollectionTask(pub CancellableTask);
|
||||
|
||||
impl CancellableTask {
|
||||
pub async fn shutdown(self) {
|
||||
self.cancel.cancel();
|
||||
@@ -87,6 +90,7 @@ pub async fn shutdown_pageserver(
|
||||
https_listener: Option<HttpsEndpointListener>,
|
||||
page_service: page_service::Listener,
|
||||
grpc_task: Option<CancellableTask>,
|
||||
metrics_collection_task: MetricsCollectionTask,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
tenant_manager: &TenantManager,
|
||||
@@ -211,6 +215,14 @@ pub async fn shutdown_pageserver(
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
|
||||
// HADRON
|
||||
timed(
|
||||
metrics_collection_task.0.shutdown(),
|
||||
"shutdown metrics collections metrics",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
timed(
|
||||
consumption_metrics_worker.0.shutdown(),
|
||||
"shutdown consumption metrics",
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::cell::Cell;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::RawFd;
|
||||
@@ -102,7 +103,18 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Buckets for background operation duration in seconds, like compaction, GC, size calculation.
|
||||
/* BEGIN_HADRON */
|
||||
pub(crate) static STORAGE_ACTIVE_COUNT_PER_TIMELINE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_active_storage_operations_count",
|
||||
"Count of active storage operations with operation, tenant and timeline dimensions",
|
||||
&["operation", "tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
/*END_HADRON */
|
||||
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
@@ -2810,6 +2822,49 @@ pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
|
||||
|
||||
pub(crate) static LOCAL_DATA_LOSS_SUSPECTED: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_local_data_loss_suspected",
|
||||
"Non-zero value indicates that pageserver local data loss is suspected (and highly likely)."
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Counter keeping track of misrouted PageStream requests. Spelling out PageStream requests here to distinguish
|
||||
// it from other types of reqeusts (SK wal replication, http requests, etc.). PageStream requests are used by
|
||||
// Postgres compute to fetch data from pageservers.
|
||||
// A misrouted PageStream request is registered if the pageserver cannot find the tenant identified in the
|
||||
// request, or if the pageserver is not the "primary" serving the tenant shard. These error almost always identify
|
||||
// issues with compute configuration, caused by either the compute node itself being stuck in the wrong
|
||||
// configuration or Storage Controller reconciliation bugs. Misrouted requests are expected during tenant migration
|
||||
// and/or during recovery following a pageserver failure, but persistently high rates of misrouted requests
|
||||
// are indicative of bugs (and unavailability).
|
||||
pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_misrouted_pagestream_requests_total",
|
||||
"Number of pageserver pagestream requests that were routed to the wrong pageserver"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories:
|
||||
// - success
|
||||
// - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors)
|
||||
// - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. error due to pageserver shutdown, malformed requests etc.)
|
||||
pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_pagestream_handler_results_total",
|
||||
"Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)",
|
||||
&["outcome"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Constants for pageserver_pagestream_handler_results_total's outcome labels
|
||||
pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success";
|
||||
pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error";
|
||||
pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error";
|
||||
|
||||
// Metrics collected on WAL redo operations
|
||||
//
|
||||
// We collect the time spent in actual WAL redo ('redo'), and time waiting
|
||||
@@ -3048,13 +3103,19 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
start: Instant,
|
||||
stopped: Cell<bool>,
|
||||
}
|
||||
|
||||
impl StorageTimeMetricsTimer {
|
||||
fn new(metrics: StorageTimeMetrics) -> Self {
|
||||
/*BEGIN_HADRON */
|
||||
// record the active operation as the timer starts
|
||||
metrics.timeline_active_count.inc();
|
||||
/*END_HADRON */
|
||||
Self {
|
||||
metrics,
|
||||
start: Instant::now(),
|
||||
stopped: Cell::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3070,6 +3131,10 @@ impl StorageTimeMetricsTimer {
|
||||
self.metrics.timeline_sum.inc_by(seconds);
|
||||
self.metrics.timeline_count.inc();
|
||||
self.metrics.global_histogram.observe(seconds);
|
||||
/* BEGIN_HADRON*/
|
||||
self.stopped.set(true);
|
||||
self.metrics.timeline_active_count.dec();
|
||||
/*END_HADRON */
|
||||
duration
|
||||
}
|
||||
|
||||
@@ -3080,6 +3145,16 @@ impl StorageTimeMetricsTimer {
|
||||
}
|
||||
}
|
||||
|
||||
/*BEGIN_HADRON */
|
||||
impl Drop for StorageTimeMetricsTimer {
|
||||
fn drop(&mut self) {
|
||||
if !self.stopped.get() {
|
||||
self.metrics.timeline_active_count.dec();
|
||||
}
|
||||
}
|
||||
}
|
||||
/*END_HADRON */
|
||||
|
||||
pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
|
||||
|
||||
impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
|
||||
@@ -3105,6 +3180,10 @@ pub(crate) struct StorageTimeMetrics {
|
||||
timeline_sum: Counter,
|
||||
/// Number of oeprations, per operation, tenant_id and timeline_id
|
||||
timeline_count: IntCounter,
|
||||
/*BEGIN_HADRON */
|
||||
/// Number of active operations per operation, tenant_id, and timeline_id
|
||||
timeline_active_count: IntGauge,
|
||||
/*END_HADRON */
|
||||
/// Global histogram having only the "operation" label.
|
||||
global_histogram: Histogram,
|
||||
}
|
||||
@@ -3124,6 +3203,11 @@ impl StorageTimeMetrics {
|
||||
let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
|
||||
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
|
||||
.unwrap();
|
||||
/*BEGIN_HADRON */
|
||||
let timeline_active_count = STORAGE_ACTIVE_COUNT_PER_TIMELINE
|
||||
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
|
||||
.unwrap();
|
||||
/*END_HADRON */
|
||||
let global_histogram = STORAGE_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[operation])
|
||||
.unwrap();
|
||||
@@ -3131,6 +3215,7 @@ impl StorageTimeMetrics {
|
||||
StorageTimeMetrics {
|
||||
timeline_sum,
|
||||
timeline_count,
|
||||
timeline_active_count,
|
||||
global_histogram,
|
||||
}
|
||||
}
|
||||
@@ -3544,6 +3629,14 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
/* BEGIN_HADRON */
|
||||
let _ = STORAGE_ACTIVE_COUNT_PER_TIMELINE.remove_label_values(&[
|
||||
op,
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
/*END_HADRON */
|
||||
}
|
||||
|
||||
for op in StorageIoSizeOperation::VARIANTS {
|
||||
@@ -4336,6 +4429,9 @@ pub(crate) mod disk_usage_based_eviction {
|
||||
pub(crate) layers_collected: IntCounter,
|
||||
pub(crate) layers_selected: IntCounter,
|
||||
pub(crate) layers_evicted: IntCounter,
|
||||
/*BEGIN_HADRON */
|
||||
pub(crate) bytes_evicted: IntCounter,
|
||||
/*END_HADRON */
|
||||
}
|
||||
|
||||
impl Default for Metrics {
|
||||
@@ -4372,12 +4468,21 @@ pub(crate) mod disk_usage_based_eviction {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
/*BEGIN_HADRON */
|
||||
let bytes_evicted = register_int_counter!(
|
||||
"pageserver_disk_usage_based_eviction_evicted_bytes_total",
|
||||
"Amount of bytes successfully evicted"
|
||||
)
|
||||
.unwrap();
|
||||
/*END_HADRON */
|
||||
|
||||
Self {
|
||||
tenant_collection_time,
|
||||
tenant_layer_count,
|
||||
layers_collected,
|
||||
layers_selected,
|
||||
layers_evicted,
|
||||
bytes_evicted,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4497,6 +4602,7 @@ pub fn preinitialize_metrics(
|
||||
&CIRCUIT_BREAKERS_UNBROKEN,
|
||||
&PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
|
||||
&WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS,
|
||||
&MISROUTED_PAGESTREAM_REQUESTS,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
@@ -4534,6 +4640,7 @@ pub fn preinitialize_metrics(
|
||||
|
||||
// gauges
|
||||
WALRECEIVER_ACTIVE_MANAGERS.get();
|
||||
LOCAL_DATA_LOSS_SUSPECTED.get();
|
||||
|
||||
// histograms
|
||||
[
|
||||
|
||||
@@ -50,6 +50,7 @@ use tokio::io::{AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _, Bu
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tonic::service::Interceptor as _;
|
||||
use tonic::transport::server::TcpConnectInfo;
|
||||
use tracing::*;
|
||||
use utils::auth::{Claims, Scope, SwappableJwtAuth};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
@@ -69,7 +70,7 @@ use crate::context::{
|
||||
};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
SmgrOpTimer, TimelineMetrics,
|
||||
MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
|
||||
};
|
||||
use crate::pgdatadir_mapping::{LsnRange, Version};
|
||||
use crate::span::{
|
||||
@@ -90,7 +91,8 @@ use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation};
|
||||
/// is not yet in state [`TenantState::Active`].
|
||||
///
|
||||
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
/// HADRON: reduced timeout and we will retry in Cache::get().
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
/// Threshold at which to log slow GetPage requests.
|
||||
const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
|
||||
@@ -1127,6 +1129,7 @@ impl PageServerHandler {
|
||||
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
MISROUTED_PAGESTREAM_REQUESTS.inc();
|
||||
return respond_error!(
|
||||
span,
|
||||
PageStreamError::Reconnect(
|
||||
@@ -1438,20 +1441,57 @@ impl PageServerHandler {
|
||||
let (response_msg, ctx) = match handler_result {
|
||||
Err(e) => match &e.err {
|
||||
PageStreamError::Shutdown => {
|
||||
// BEGIN HADRON
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR])
|
||||
.inc();
|
||||
// END HADRON
|
||||
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
// connection.
|
||||
span.in_scope(|| info!("dropping connection due to shutdown"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
PageStreamError::Reconnect(reason) => {
|
||||
span.in_scope(|| info!("handler requested reconnect: {reason}"));
|
||||
PageStreamError::Reconnect(_reason) => {
|
||||
span.in_scope(|| {
|
||||
// BEGIN HADRON
|
||||
// We can get here because the compute node is pointing at the wrong PS. We
|
||||
// already have a metric to keep track of this so suppressing this log to
|
||||
// reduce log spam. The information in this log message is not going to be that
|
||||
// helpful given the volume of logs that can be generated.
|
||||
// info!("handler requested reconnect: {reason}")
|
||||
// END HADRON
|
||||
});
|
||||
// BEGIN HADRON
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[
|
||||
metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
|
||||
])
|
||||
.inc();
|
||||
// END HADRON
|
||||
return Err(QueryError::Reconnect);
|
||||
}
|
||||
PageStreamError::Read(_)
|
||||
| PageStreamError::LsnTimeout(_)
|
||||
| PageStreamError::NotFound(_)
|
||||
| PageStreamError::BadRequest(_) => {
|
||||
// BEGIN HADRON
|
||||
if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err {
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[
|
||||
metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
|
||||
])
|
||||
.inc();
|
||||
} else {
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[
|
||||
metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR,
|
||||
])
|
||||
.inc();
|
||||
}
|
||||
// END HADRON
|
||||
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
@@ -1469,7 +1509,15 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
},
|
||||
Ok((response_msg, _op_timer_already_observed, ctx)) => (response_msg, Some(ctx)),
|
||||
Ok((response_msg, _op_timer_already_observed, ctx)) => {
|
||||
// BEGIN HADRON
|
||||
PAGESTREAM_HANDLER_RESULTS_TOTAL
|
||||
.with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS])
|
||||
.inc();
|
||||
// END HADRON
|
||||
|
||||
(response_msg, Some(ctx))
|
||||
}
|
||||
};
|
||||
|
||||
let ctx = ctx.map(|req_ctx| {
|
||||
@@ -3292,9 +3340,12 @@ impl GrpcPageServiceHandler {
|
||||
}
|
||||
|
||||
/// Generates a PagestreamRequest header from a ReadLsn and request ID.
|
||||
fn make_hdr(read_lsn: page_api::ReadLsn, req_id: u64) -> PagestreamRequest {
|
||||
fn make_hdr(
|
||||
read_lsn: page_api::ReadLsn,
|
||||
req_id: Option<page_api::RequestID>,
|
||||
) -> PagestreamRequest {
|
||||
PagestreamRequest {
|
||||
reqid: req_id,
|
||||
reqid: req_id.map(|r| r.id).unwrap_or_default(),
|
||||
request_lsn: read_lsn.request_lsn,
|
||||
not_modified_since: read_lsn
|
||||
.not_modified_since_lsn
|
||||
@@ -3352,6 +3403,8 @@ impl GrpcPageServiceHandler {
|
||||
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
|
||||
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
|
||||
///
|
||||
/// TODO: verify that the requested pages belong to this shard.
|
||||
///
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
/// split them up in the client or server.
|
||||
@@ -3404,7 +3457,7 @@ impl GrpcPageServiceHandler {
|
||||
|
||||
batch.push(BatchedGetPageRequest {
|
||||
req: PagestreamGetPageRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, req.request_id),
|
||||
hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)),
|
||||
rel: req.rel,
|
||||
blkno,
|
||||
},
|
||||
@@ -3434,12 +3487,16 @@ impl GrpcPageServiceHandler {
|
||||
request_id: req.request_id,
|
||||
status_code: page_api::GetPageStatusCode::Ok,
|
||||
reason: None,
|
||||
page_images: Vec::with_capacity(results.len()),
|
||||
rel: req.rel,
|
||||
pages: Vec::with_capacity(results.len()),
|
||||
};
|
||||
|
||||
for result in results {
|
||||
match result {
|
||||
Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.page_images.push(r.page),
|
||||
Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page {
|
||||
block_number: r.req.blkno,
|
||||
image: r.page,
|
||||
}),
|
||||
Ok((resp, _, _)) => {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"unexpected response: {resp:?}"
|
||||
@@ -3482,7 +3539,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
span_record!(rel=%req.rel, lsn=%req.read_lsn);
|
||||
|
||||
let req = PagestreamExistsRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, 0),
|
||||
hdr: Self::make_hdr(req.read_lsn, None),
|
||||
rel: req.rel,
|
||||
};
|
||||
|
||||
@@ -3632,7 +3689,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
|
||||
|
||||
let req = PagestreamDbSizeRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, 0),
|
||||
hdr: Self::make_hdr(req.read_lsn, None),
|
||||
dbnode: req.db_oid,
|
||||
};
|
||||
|
||||
@@ -3691,20 +3748,24 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
break;
|
||||
},
|
||||
};
|
||||
let req = if let Some(req) = req? {
|
||||
req
|
||||
} else {
|
||||
break;
|
||||
};
|
||||
let req_id = req.request_id;
|
||||
let Some(req) = req? else { break };
|
||||
let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
|
||||
|
||||
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await;
|
||||
yield match result {
|
||||
Ok(resp) => resp,
|
||||
// Convert per-request errors to GetPageResponses as appropriate, or terminate
|
||||
// the stream with a tonic::Status.
|
||||
Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
|
||||
// the stream with a tonic::Status. Log the error regardless, since
|
||||
// ObservabilityLayer can't automatically log stream errors.
|
||||
Err(status) => {
|
||||
// TODO: it would be nice if we could propagate the get_page() fields here.
|
||||
span.in_scope(|| {
|
||||
warn!("request failed with {:?}: {}", status.code(), status.message());
|
||||
});
|
||||
page_api::GetPageResponse::try_from_status(status, req_id)?.into()
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -3728,7 +3789,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
span_record!(rel=%req.rel, lsn=%req.read_lsn);
|
||||
|
||||
let req = PagestreamNblocksRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, 0),
|
||||
hdr: Self::make_hdr(req.read_lsn, None),
|
||||
rel: req.rel,
|
||||
};
|
||||
|
||||
@@ -3761,7 +3822,7 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);
|
||||
|
||||
let req = PagestreamGetSlruSegmentRequest {
|
||||
hdr: Self::make_hdr(req.read_lsn, 0),
|
||||
hdr: Self::make_hdr(req.read_lsn, None),
|
||||
kind: req.kind as u8,
|
||||
segno: req.segno,
|
||||
};
|
||||
@@ -3842,40 +3903,85 @@ impl<S: tonic::server::NamedService> tonic::server::NamedService for Observabili
|
||||
const NAME: &'static str = S::NAME; // propagate inner service name
|
||||
}
|
||||
|
||||
impl<S, B> tower::Service<http::Request<B>> for ObservabilityLayerService<S>
|
||||
impl<S, Req, Resp> tower::Service<http::Request<Req>> for ObservabilityLayerService<S>
|
||||
where
|
||||
S: tower::Service<http::Request<B>>,
|
||||
S: tower::Service<http::Request<Req>, Response = http::Response<Resp>> + Send,
|
||||
S::Future: Send + 'static,
|
||||
{
|
||||
type Response = S::Response;
|
||||
type Error = S::Error;
|
||||
type Future = BoxFuture<'static, Result<Self::Response, Self::Error>>;
|
||||
|
||||
fn call(&mut self, mut req: http::Request<B>) -> Self::Future {
|
||||
fn call(&mut self, mut req: http::Request<Req>) -> Self::Future {
|
||||
// Record the request start time as a request extension.
|
||||
//
|
||||
// TODO: we should start a timer here instead, but it currently requires a timeline handle
|
||||
// and SmgrQueryType, which we don't have yet. Refactor it to provide it later.
|
||||
req.extensions_mut().insert(ReceivedAt(Instant::now()));
|
||||
|
||||
// Create a basic tracing span. Enter the span for the current thread (to use it for inner
|
||||
// sync code like interceptors), and instrument the future (to use it for inner async code
|
||||
// like the page service itself).
|
||||
// Extract the peer address and gRPC method.
|
||||
let peer = req
|
||||
.extensions()
|
||||
.get::<TcpConnectInfo>()
|
||||
.and_then(|info| info.remote_addr())
|
||||
.map(|addr| addr.to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
let method = req
|
||||
.uri()
|
||||
.path()
|
||||
.split('/')
|
||||
.nth(2)
|
||||
.unwrap_or(req.uri().path())
|
||||
.to_string();
|
||||
|
||||
// Create a basic tracing span.
|
||||
//
|
||||
// The instrument() call below is not sufficient. It only affects the returned future, and
|
||||
// only takes effect when the caller polls it. Any sync code executed when we call
|
||||
// self.inner.call() below (such as interceptors) runs outside of the returned future, and
|
||||
// is not affected by it. We therefore have to enter the span on the current thread too.
|
||||
// Enter the span for the current thread and instrument the future. It is not sufficient to
|
||||
// only instrument the future, since it only takes effect after the future is returned and
|
||||
// polled, not when the inner service is called below (e.g. during interceptor execution).
|
||||
let span = info_span!(
|
||||
"grpc:pageservice",
|
||||
// Set by TenantMetadataInterceptor.
|
||||
// These will be populated by TenantMetadataInterceptor.
|
||||
tenant_id = field::Empty,
|
||||
timeline_id = field::Empty,
|
||||
shard_id = field::Empty,
|
||||
// NB: empty fields must be listed first above. Otherwise, the field names will be
|
||||
// clobbered when the empty fields are populated. They will be output last regardless.
|
||||
%peer,
|
||||
%method,
|
||||
);
|
||||
let _guard = span.enter();
|
||||
|
||||
Box::pin(self.inner.call(req).instrument(span.clone()))
|
||||
// Construct a future for calling the inner service, but don't await it. This avoids having
|
||||
// to clone the inner service into the future below.
|
||||
let call = self.inner.call(req);
|
||||
|
||||
async move {
|
||||
// Await the inner service call.
|
||||
let result = call.await;
|
||||
|
||||
// Log gRPC error statuses. This won't include request info from handler spans, but it
|
||||
// will catch all errors (even those emitted before handler spans are constructed). Only
|
||||
// unary request errors are logged here, not streaming response errors.
|
||||
if let Ok(ref resp) = result
|
||||
&& let Some(status) = tonic::Status::from_header_map(resp.headers())
|
||||
&& status.code() != tonic::Code::Ok
|
||||
{
|
||||
// TODO: it would be nice if we could propagate the handler span's request fields
|
||||
// here. This could e.g. be done by attaching the request fields to
|
||||
// tonic::Status::metadata via a proc macro.
|
||||
warn!(
|
||||
"request failed with {:?}: {}",
|
||||
status.code(),
|
||||
status.message()
|
||||
);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
.instrument(span.clone())
|
||||
.boxed()
|
||||
}
|
||||
|
||||
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
|
||||
|
||||
@@ -141,6 +141,23 @@ pub(crate) enum CollectKeySpaceError {
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl CollectKeySpaceError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
CollectKeySpaceError::Decode(_) => false,
|
||||
CollectKeySpaceError::PageRead(e) => e.is_cancel(),
|
||||
CollectKeySpaceError::Cancelled => true,
|
||||
}
|
||||
}
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
CollectKeySpaceError::Decode(e) => anyhow::Error::new(e),
|
||||
CollectKeySpaceError::PageRead(e) => anyhow::Error::new(e),
|
||||
CollectKeySpaceError::Cancelled => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for CollectKeySpaceError {
|
||||
fn from(err: PageReconstructError) -> Self {
|
||||
match err {
|
||||
|
||||
@@ -34,7 +34,7 @@ use once_cell::sync::Lazy;
|
||||
pub use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{self, RelSizeMigration};
|
||||
use pageserver_api::models::{
|
||||
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
|
||||
CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem,
|
||||
WalRedoManagerStatus,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
|
||||
@@ -142,6 +142,9 @@ mod gc_block;
|
||||
mod gc_result;
|
||||
pub(crate) mod throttle;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod debug;
|
||||
|
||||
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -180,6 +183,7 @@ pub(super) struct AttachedTenantConf {
|
||||
|
||||
impl AttachedTenantConf {
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: pageserver_api::models::TenantConfig,
|
||||
location: AttachedLocationConfig,
|
||||
) -> Self {
|
||||
@@ -191,9 +195,7 @@ impl AttachedTenantConf {
|
||||
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
|
||||
Some(
|
||||
tokio::time::Instant::now()
|
||||
+ tenant_conf
|
||||
.lsn_lease_length
|
||||
.unwrap_or(LsnLease::DEFAULT_LENGTH),
|
||||
+ TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf),
|
||||
)
|
||||
} else {
|
||||
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
|
||||
@@ -208,10 +210,13 @@ impl AttachedTenantConf {
|
||||
}
|
||||
}
|
||||
|
||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||
fn try_from(
|
||||
conf: &'static PageServerConf,
|
||||
location_conf: LocationConf,
|
||||
) -> anyhow::Result<Self> {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
|
||||
Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf))
|
||||
}
|
||||
LocationMode::Secondary(_) => {
|
||||
anyhow::bail!(
|
||||
@@ -386,7 +391,7 @@ pub struct TenantShard {
|
||||
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
|
||||
pub(crate) feature_resolver: TenantFeatureResolver,
|
||||
pub(crate) feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
impl std::fmt::Debug for TenantShard {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
@@ -3286,7 +3291,9 @@ impl TenantShard {
|
||||
// Ignore this, we likely raced with unarchival.
|
||||
OffloadError::NotArchived => Ok(()),
|
||||
OffloadError::AlreadyInProgress => Ok(()),
|
||||
err => Err(err),
|
||||
OffloadError::Cancelled => Err(CompactionError::new_cancelled()),
|
||||
// don't break the anyhow chain
|
||||
OffloadError::Other(err) => Err(CompactionError::Other(err)),
|
||||
})?;
|
||||
}
|
||||
|
||||
@@ -3314,27 +3321,13 @@ impl TenantShard {
|
||||
|
||||
/// Trips the compaction circuit breaker if appropriate.
|
||||
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
|
||||
match err {
|
||||
err if err.is_cancel() => {}
|
||||
CompactionError::ShuttingDown => (),
|
||||
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
|
||||
// shouldn't block compaction.
|
||||
CompactionError::Offload(_) => {}
|
||||
CompactionError::CollectKeySpaceError(err) => {
|
||||
// CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::Other(err) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::AlreadyRunning(_) => {}
|
||||
if err.is_cancel() {
|
||||
return;
|
||||
}
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
|
||||
/// Cancel scheduled compaction tasks
|
||||
@@ -3411,7 +3404,7 @@ impl TenantShard {
|
||||
}
|
||||
|
||||
// Update the feature resolver with the latest tenant-spcific data.
|
||||
self.feature_resolver.update_cached_tenant_properties(self);
|
||||
self.feature_resolver.refresh_properties_and_flags(self);
|
||||
}
|
||||
|
||||
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
|
||||
@@ -4178,6 +4171,15 @@ impl TenantShard {
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
// HADRON
|
||||
pub fn get_image_creation_timeout(&self) -> Option<Duration> {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf.image_layer_force_creation_period.or(self
|
||||
.conf
|
||||
.default_tenant_conf
|
||||
.image_layer_force_creation_period)
|
||||
}
|
||||
|
||||
pub fn get_pitr_interval(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
@@ -4205,10 +4207,16 @@ impl TenantShard {
|
||||
}
|
||||
|
||||
pub fn get_lsn_lease_length(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf)
|
||||
}
|
||||
|
||||
pub fn get_lsn_lease_length_impl(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: &pageserver_api::models::TenantConfig,
|
||||
) -> Duration {
|
||||
tenant_conf
|
||||
.lsn_lease_length
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
.unwrap_or(conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
pub fn get_timeline_offloading_enabled(&self) -> bool {
|
||||
@@ -4494,10 +4502,10 @@ impl TenantShard {
|
||||
gc_block: Default::default(),
|
||||
l0_flush_global_state,
|
||||
basebackup_cache,
|
||||
feature_resolver: TenantFeatureResolver::new(
|
||||
feature_resolver: Arc::new(TenantFeatureResolver::new(
|
||||
feature_resolver,
|
||||
tenant_shard_id.tenant_id,
|
||||
),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5711,6 +5719,16 @@ impl TenantShard {
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// HADRON
|
||||
/// Return the visible size of all timelines in this tenant.
|
||||
pub(crate) fn get_visible_size(&self) -> u64 {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines
|
||||
.values()
|
||||
.map(|t| t.metrics.visible_physical_size_gauge.get())
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
|
||||
/// manifest in `Self::remote_tenant_manifest`.
|
||||
///
|
||||
@@ -6009,22 +6027,24 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn do_try_load(
|
||||
pub(crate) async fn do_try_load_with_redo(
|
||||
&self,
|
||||
walredo_mgr: Arc<WalRedoManager>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
|
||||
|
||||
let tenant = Arc::new(TenantShard::new(
|
||||
TenantState::Attaching,
|
||||
self.conf,
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
self.tenant_conf.clone(),
|
||||
self.generation,
|
||||
ShardParameters::default(),
|
||||
))
|
||||
AttachedTenantConf::try_from(
|
||||
self.conf,
|
||||
LocationConf::attached_single(
|
||||
self.tenant_conf.clone(),
|
||||
self.generation,
|
||||
ShardParameters::default(),
|
||||
),
|
||||
)
|
||||
.unwrap(),
|
||||
self.shard_identity,
|
||||
Some(walredo_mgr),
|
||||
@@ -6049,6 +6069,14 @@ pub(crate) mod harness {
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
pub(crate) async fn do_try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
self.do_try_load_with_redo(walredo_mgr, ctx).await
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
|
||||
self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
|
||||
}
|
||||
@@ -6125,7 +6153,7 @@ mod tests {
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::keyspace::KeySpaceRandomAccum;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease};
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
#[cfg(feature = "testing")]
|
||||
use rand::SeedableRng;
|
||||
@@ -6675,17 +6703,13 @@ mod tests {
|
||||
tline.freeze_and_flush().await.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -9384,17 +9408,21 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
#[tokio::test]
|
||||
async fn test_lsn_lease() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
|
||||
.await
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
// set a non-zero lease length to test the feature
|
||||
tenant
|
||||
.update_tenant_config(|mut conf| {
|
||||
conf.lsn_lease_length = Some(LsnLease::DEFAULT_LENGTH);
|
||||
Ok(conf)
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
let end_lsn = Lsn(0x100);
|
||||
|
||||
366
pageserver/src/tenant/debug.rs
Normal file
366
pageserver/src/tenant/debug.rs
Normal file
@@ -0,0 +1,366 @@
|
||||
use std::{ops::Range, str::FromStr, sync::Arc};
|
||||
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use base64::{Engine as _, engine::general_purpose::STANDARD};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
keyspace::KeySpace,
|
||||
shard::{ShardIdentity, ShardStripeSize},
|
||||
};
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn};
|
||||
use tracing::Instrument;
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardIndex, ShardNumber},
|
||||
};
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::ValueReconstructState,
|
||||
walredo::harness::RedoHarness,
|
||||
};
|
||||
|
||||
use super::{
|
||||
WalRedoManager, WalredoManagerId,
|
||||
harness::TenantHarness,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
storage_layer::{AsLayerDesc, IoConcurrency, Layer, LayerName, ValuesReconstructState},
|
||||
};
|
||||
|
||||
fn process_page_image(next_record_lsn: Lsn, is_fpw: bool, img_bytes: Bytes) -> Bytes {
|
||||
// To match the logic in libs/wal_decoder/src/serialized_batch.rs
|
||||
let mut new_image: BytesMut = img_bytes.into();
|
||||
if is_fpw && !page_is_new(&new_image) {
|
||||
page_set_lsn(&mut new_image, next_record_lsn);
|
||||
}
|
||||
assert_eq!(new_image.len(), BLCKSZ as usize);
|
||||
new_image.freeze()
|
||||
}
|
||||
|
||||
async fn redo_wals(input: &str, key: Key) -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let redo_harness = RedoHarness::new()?;
|
||||
let span = redo_harness.span();
|
||||
let tenant_conf = pageserver_api::models::TenantConfig {
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
let tenant = TenantHarness::create_custom(
|
||||
"search_key",
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
ShardIdentity::unsharded(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?
|
||||
.do_try_load_with_redo(
|
||||
Arc::new(WalRedoManager::Prod(
|
||||
WalredoManagerId::next(),
|
||||
redo_harness.manager,
|
||||
)),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
|
||||
.await?;
|
||||
let contents = tokio::fs::read_to_string(input)
|
||||
.await
|
||||
.map_err(|e| anyhow::Error::msg(format!("Failed to read input file {input}: {e}")))
|
||||
.unwrap();
|
||||
let lines = contents.lines();
|
||||
let mut last_wal_lsn: Option<Lsn> = None;
|
||||
let state = {
|
||||
let mut state = ValueReconstructState::default();
|
||||
let mut is_fpw = false;
|
||||
let mut is_first_line = true;
|
||||
for line in lines {
|
||||
if is_first_line {
|
||||
is_first_line = false;
|
||||
if line.trim() == "FPW" {
|
||||
is_fpw = true;
|
||||
}
|
||||
continue; // Skip the first line.
|
||||
}
|
||||
// Each input line is in the "<next_record_lsn>,<base64>" format.
|
||||
let (lsn_str, payload_b64) = line
|
||||
.split_once(',')
|
||||
.expect("Invalid input format: expected '<lsn>,<base64>'");
|
||||
|
||||
// Parse the LSN and decode the payload.
|
||||
let lsn = Lsn::from_str(lsn_str.trim()).expect("Invalid LSN format");
|
||||
let bytes = Bytes::from(
|
||||
STANDARD
|
||||
.decode(payload_b64.trim())
|
||||
.expect("Invalid base64 payload"),
|
||||
);
|
||||
|
||||
// The first line is considered the base image, the rest are WAL records.
|
||||
if state.img.is_none() {
|
||||
state.img = Some((lsn, process_page_image(lsn, is_fpw, bytes)));
|
||||
} else {
|
||||
let wal_record = NeonWalRecord::Postgres {
|
||||
will_init: false,
|
||||
rec: bytes,
|
||||
};
|
||||
state.records.push((lsn, wal_record));
|
||||
last_wal_lsn.replace(lsn);
|
||||
}
|
||||
}
|
||||
state
|
||||
};
|
||||
|
||||
assert!(state.img.is_some(), "No base image found");
|
||||
assert!(!state.records.is_empty(), "No WAL records found");
|
||||
let result = timeline
|
||||
.reconstruct_value(key, last_wal_lsn.unwrap(), state, RedoAttemptType::ReadPage)
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
|
||||
eprintln!("final image: {:?}", STANDARD.encode(result));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn search_key(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
dir: String,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let shard_index = ShardIndex {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(4),
|
||||
};
|
||||
|
||||
let redo_harness = RedoHarness::new()?;
|
||||
let span = redo_harness.span();
|
||||
let tenant_conf = pageserver_api::models::TenantConfig {
|
||||
..Default::default()
|
||||
};
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
let tenant = TenantHarness::create_custom(
|
||||
"search_key",
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
ShardIdentity::new(
|
||||
shard_index.shard_number,
|
||||
shard_index.shard_count,
|
||||
ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?
|
||||
.do_try_load_with_redo(
|
||||
Arc::new(WalRedoManager::Prod(
|
||||
WalredoManagerId::next(),
|
||||
redo_harness.manager,
|
||||
)),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
|
||||
.await?;
|
||||
|
||||
let mut delta_layers: Vec<Layer> = Vec::new();
|
||||
let mut img_layer: Option<Layer> = Option::None;
|
||||
let mut dir = tokio::fs::read_dir(dir).await?;
|
||||
loop {
|
||||
let entry = dir.next_entry().await?;
|
||||
if entry.is_none() || !entry.as_ref().unwrap().file_type().await?.is_file() {
|
||||
break;
|
||||
}
|
||||
let path = Utf8PathBuf::from_path_buf(entry.unwrap().path()).unwrap();
|
||||
let layer_name = match LayerName::from_str(path.file_name().unwrap()) {
|
||||
Ok(name) => name,
|
||||
Err(_) => {
|
||||
eprintln!("Skipped invalid layer: {path}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let layer = Layer::for_resident(
|
||||
tenant.conf,
|
||||
&timeline,
|
||||
path.clone(),
|
||||
layer_name,
|
||||
LayerFileMetadata::new(
|
||||
tokio::fs::metadata(path.clone()).await?.len(),
|
||||
Generation::new(1),
|
||||
shard_index,
|
||||
),
|
||||
);
|
||||
if layer.layer_desc().is_delta() {
|
||||
delta_layers.push(layer.into());
|
||||
} else if img_layer.is_none() {
|
||||
img_layer = Some(layer.into());
|
||||
} else {
|
||||
anyhow::bail!("Found multiple image layers");
|
||||
}
|
||||
}
|
||||
// sort delta layers based on the descending order of LSN
|
||||
delta_layers.sort_by(|a, b| {
|
||||
b.layer_desc()
|
||||
.get_lsn_range()
|
||||
.start
|
||||
.cmp(&a.layer_desc().get_lsn_range().start)
|
||||
});
|
||||
|
||||
let mut state = ValuesReconstructState::new(IoConcurrency::Sequential);
|
||||
|
||||
let key_space = KeySpace::single(Range {
|
||||
start: key,
|
||||
end: key.next(),
|
||||
});
|
||||
let lsn_range = Range {
|
||||
start: img_layer
|
||||
.as_ref()
|
||||
.map_or(Lsn(0x00), |img| img.layer_desc().image_layer_lsn()),
|
||||
end: lsn,
|
||||
};
|
||||
for delta_layer in delta_layers.iter() {
|
||||
delta_layer
|
||||
.get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
img_layer
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, result) in std::mem::take(&mut state.keys) {
|
||||
let state = result.collect_pending_ios().await?;
|
||||
if state.img.is_some() {
|
||||
eprintln!(
|
||||
"image: {}: {:x?}",
|
||||
state.img.as_ref().unwrap().0,
|
||||
STANDARD.encode(state.img.as_ref().unwrap().1.clone())
|
||||
);
|
||||
}
|
||||
for delta in state.records.iter() {
|
||||
match &delta.1 {
|
||||
NeonWalRecord::Postgres { will_init, rec } => {
|
||||
eprintln!(
|
||||
"delta: {}: will_init: {}, {:x?}",
|
||||
delta.0,
|
||||
will_init,
|
||||
STANDARD.encode(rec)
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
eprintln!("delta: {}: {:x?}", delta.0, delta.1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let result = timeline
|
||||
.reconstruct_value(key, lsn_range.end, state, RedoAttemptType::ReadPage)
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
eprintln!("final image: {lsn} : {result:?}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Redo all WALs against the base image in the input file. Return the base64 encoded final image.
|
||||
/// Each line in the input file must be in the form "<lsn>,<base64>" where:
|
||||
/// * `<lsn>` is a PostgreSQL LSN in hexadecimal notation, e.g. `0/16ABCDE`.
|
||||
/// * `<base64>` is the base64‐encoded page image (first line) or WAL record (subsequent lines).
|
||||
///
|
||||
/// The first line provides the base image of a page. The LSN is the LSN of "next record" following
|
||||
/// the record containing the FPI. For example, if the FPI was extracted from a WAL record occuping
|
||||
/// [0/1, 0/200) in the WAL stream, the LSN appearing along side the page image here should be 0/200.
|
||||
///
|
||||
/// The subsequent lines are WAL records, ordered from the oldest to the newest. The LSN is the
|
||||
/// record LSN of the WAL record, not the "next record" LSN. For example, if the WAL record here
|
||||
/// occupies [0/1, 0/200) in the WAL stream, the LSN appearing along side the WAL record here should
|
||||
/// be 0/1.
|
||||
#[derive(Parser)]
|
||||
struct RedoWalsCmd {
|
||||
#[clap(long)]
|
||||
input: String,
|
||||
#[clap(long)]
|
||||
key: String,
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_redo_wals() -> anyhow::Result<()> {
|
||||
let args = std::env::args().collect_vec();
|
||||
let pos = args
|
||||
.iter()
|
||||
.position(|arg| arg == "--")
|
||||
.unwrap_or(args.len());
|
||||
let slice = &args[pos..args.len()];
|
||||
let cmd = match RedoWalsCmd::try_parse_from(slice) {
|
||||
Ok(cmd) => cmd,
|
||||
Err(err) => {
|
||||
eprintln!("{err}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let key = Key::from_hex(&cmd.key).unwrap();
|
||||
redo_wals(&cmd.input, key).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Search for a page at the given LSN in all layers of the data_dir.
|
||||
/// Return the base64-encoded image and all WAL records, as well as the final reconstructed image.
|
||||
#[derive(Parser)]
|
||||
struct SearchKeyCmd {
|
||||
#[clap(long)]
|
||||
tenant_id: String,
|
||||
#[clap(long)]
|
||||
timeline_id: String,
|
||||
#[clap(long)]
|
||||
data_dir: String,
|
||||
#[clap(long)]
|
||||
key: String,
|
||||
#[clap(long)]
|
||||
lsn: String,
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_search_key() -> anyhow::Result<()> {
|
||||
let args = std::env::args().collect_vec();
|
||||
let pos = args
|
||||
.iter()
|
||||
.position(|arg| arg == "--")
|
||||
.unwrap_or(args.len());
|
||||
let slice = &args[pos..args.len()];
|
||||
let cmd = match SearchKeyCmd::try_parse_from(slice) {
|
||||
Ok(cmd) => cmd,
|
||||
Err(err) => {
|
||||
eprintln!("{err}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let tenant_id = TenantId::from_str(&cmd.tenant_id).unwrap();
|
||||
let timeline_id = TimelineId::from_str(&cmd.timeline_id).unwrap();
|
||||
let key = Key::from_hex(&cmd.key).unwrap();
|
||||
let lsn = Lsn::from_str(&cmd.lsn).unwrap();
|
||||
search_key(tenant_id, timeline_id, cmd.data_dir, key, lsn).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -43,7 +43,7 @@ use crate::controller_upcall_client::{
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::metrics::{LOCAL_DATA_LOSS_SUSPECTED, TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
@@ -538,6 +538,21 @@ pub async fn init_tenant_mgr(
|
||||
// Determine which tenants are to be secondary or attached, and in which generation
|
||||
let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?;
|
||||
|
||||
// Hadron local SSD check: Raise an alert if our local filesystem does not contain any tenants but the re-attach request returned tenants.
|
||||
// This can happen if the PS suffered a Kubernetes node failure resulting in loss of all local data, but recovered quickly on another node
|
||||
// so the Storage Controller has not had the time to move tenants out.
|
||||
let data_loss_suspected = if let Some(tenant_modes) = &tenant_modes {
|
||||
tenant_configs.is_empty() && !tenant_modes.is_empty()
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if data_loss_suspected {
|
||||
tracing::error!(
|
||||
"Local data loss suspected: no tenants found on local filesystem, but re-attach request returned tenants"
|
||||
);
|
||||
}
|
||||
LOCAL_DATA_LOSS_SUSPECTED.set(if data_loss_suspected { 1 } else { 0 });
|
||||
|
||||
tracing::info!(
|
||||
"Attaching {} tenants at startup, warming up {} at a time",
|
||||
tenant_configs.len(),
|
||||
@@ -664,7 +679,7 @@ pub async fn init_tenant_mgr(
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
@@ -842,8 +857,11 @@ impl TenantManager {
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
AttachedTenantConf::try_from(
|
||||
self.conf,
|
||||
new_location_config.clone(),
|
||||
)
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
@@ -1046,7 +1064,7 @@ impl TenantManager {
|
||||
// Testing hack: if we are configured with no control plane, then drop the generation
|
||||
// from upserts. This enables creating generation-less tenants even though neon_local
|
||||
// always uses generations when calling the location conf API.
|
||||
let attached_conf = AttachedTenantConf::try_from(new_location_config)
|
||||
let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config)
|
||||
.map_err(UpsertLocationError::BadRequest)?;
|
||||
|
||||
let tenant = tenant_spawn(
|
||||
@@ -1250,7 +1268,7 @@ impl TenantManager {
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config)?,
|
||||
AttachedTenantConf::try_from(self.conf, config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
@@ -2131,7 +2149,7 @@ impl TenantManager {
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?,
|
||||
AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?,
|
||||
shard_identity,
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
|
||||
@@ -141,11 +141,29 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
|
||||
let fs_size = usize::try_from(fs_size)
|
||||
.with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?;
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
let mut metadata = None;
|
||||
match storage {
|
||||
// Pass the file path as a storage metadata to minimize changes to neon.
|
||||
// Otherwise, we need to change the upload interface.
|
||||
GenericRemoteStorage::AzureBlob(s) => {
|
||||
let block_size_mb = s.put_block_size_mb.unwrap_or(0);
|
||||
if block_size_mb > 0 && fs_size > block_size_mb * 1024 * 1024 {
|
||||
metadata = Some(remote_storage::StorageMetadata::from([(
|
||||
"databricks_azure_put_block",
|
||||
local_path.as_str(),
|
||||
)]));
|
||||
}
|
||||
}
|
||||
GenericRemoteStorage::LocalFs(_) => {}
|
||||
GenericRemoteStorage::AwsS3(_) => {}
|
||||
GenericRemoteStorage::Unreliable(_) => {}
|
||||
};
|
||||
/* END_HADRON */
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
storage
|
||||
.upload(reader, fs_size, remote_path, None, cancel)
|
||||
.upload(reader, fs_size, remote_path, metadata, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{local_path}'"))
|
||||
}
|
||||
|
||||
@@ -225,7 +225,7 @@ impl fmt::Display for ImageLayerName {
|
||||
/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
|
||||
/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
|
||||
/// and [`crate::tenant::storage_layer::layer::local_layer_path`])
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)]
|
||||
pub enum LayerName {
|
||||
Image(ImageLayerName),
|
||||
Delta(DeltaLayerName),
|
||||
|
||||
@@ -17,23 +17,35 @@ use tracing::*;
|
||||
use utils::backoff::exponential_backoff_duration;
|
||||
use utils::completion::Barrier;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::sync::gate::GateError;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
|
||||
use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
|
||||
use crate::tenant::blob_io::WriteBlobError;
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::compaction::CompactionOutcome;
|
||||
use crate::tenant::{TenantShard, TenantState};
|
||||
use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
|
||||
|
||||
/// Semaphore limiting concurrent background tasks (across all tenants).
|
||||
///
|
||||
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
|
||||
static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
|
||||
let total_threads = TOKIO_WORKER_THREADS.get();
|
||||
|
||||
/*BEGIN_HADRON*/
|
||||
// ideally we should run at least one compaction task per tenant in order to (1) maximize
|
||||
// compaction throughput (2) avoid head-of-line blocking of large compactions. However doing
|
||||
// that may create too many compaction tasks with lots of memory overheads. So we limit the
|
||||
// number of compaction tasks based on the available CPU core count.
|
||||
// Need to revisit.
|
||||
// let tasks_per_thread = std::env::var("BG_TASKS_PER_THREAD")
|
||||
// .ok()
|
||||
// .and_then(|s| s.parse().ok())
|
||||
// .unwrap_or(4);
|
||||
// let permits = usize::max(1, total_threads * tasks_per_thread);
|
||||
// // assert!(permits < total_threads, "need threads for other work");
|
||||
/*END_HADRON*/
|
||||
|
||||
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(permits < total_threads, "need threads for other work");
|
||||
@@ -295,48 +307,12 @@ pub(crate) fn log_compaction_error(
|
||||
task_cancelled: bool,
|
||||
degrade_to_warning: bool,
|
||||
) {
|
||||
use CompactionError::*;
|
||||
let is_cancel = err.is_cancel();
|
||||
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
|
||||
let level = match err {
|
||||
e if e.is_cancel() => return,
|
||||
ShuttingDown => return,
|
||||
Offload(_) => Level::ERROR,
|
||||
AlreadyRunning(_) => Level::ERROR,
|
||||
CollectKeySpaceError(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
let root_cause = err.root_cause();
|
||||
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let buffered_writer_flush_task_canelled = root_cause
|
||||
.downcast_ref::<FlushTaskError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let write_blob_cancelled = root_cause
|
||||
.downcast_ref::<WriteBlobError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let gate_closed = root_cause
|
||||
.downcast_ref::<GateError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let is_stopping = upload_queue
|
||||
|| timeline
|
||||
|| buffered_writer_flush_task_canelled
|
||||
|| write_blob_cancelled
|
||||
|| gate_closed;
|
||||
|
||||
if is_stopping {
|
||||
Level::INFO
|
||||
} else {
|
||||
Level::ERROR
|
||||
}
|
||||
}
|
||||
let level = if is_cancel || task_cancelled {
|
||||
Level::INFO
|
||||
} else {
|
||||
Level::ERROR
|
||||
};
|
||||
|
||||
if let Some((error_count, sleep_duration)) = retry_info {
|
||||
|
||||
@@ -40,7 +40,6 @@ use layer_manager::{
|
||||
Shutdown,
|
||||
};
|
||||
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
|
||||
use pageserver_api::key::{
|
||||
@@ -119,7 +118,6 @@ use crate::pgdatadir_mapping::{
|
||||
MAX_AUX_FILE_V2_DELTAS, MetricsUpdate,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::AttachmentMode;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
@@ -202,7 +200,7 @@ pub struct TimelineResources {
|
||||
pub l0_compaction_trigger: Arc<Notify>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
pub basebackup_cache: Arc<BasebackupCache>,
|
||||
pub feature_resolver: TenantFeatureResolver,
|
||||
pub feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
@@ -353,6 +351,13 @@ pub struct Timeline {
|
||||
last_image_layer_creation_check_at: AtomicLsn,
|
||||
last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,
|
||||
|
||||
// HADRON
|
||||
/// If a key range has writes with LSN > force_image_creation_lsn, then we should force image layer creation
|
||||
/// on this key range.
|
||||
force_image_creation_lsn: AtomicLsn,
|
||||
/// The last time instant when force_image_creation_lsn is computed.
|
||||
force_image_creation_lsn_computed_at: std::sync::Mutex<Option<Instant>>,
|
||||
|
||||
/// Current logical size of the "datadir", at the last LSN.
|
||||
current_logical_size: LogicalSize,
|
||||
|
||||
@@ -450,7 +455,7 @@ pub struct Timeline {
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
feature_resolver: TenantFeatureResolver,
|
||||
feature_resolver: Arc<TenantFeatureResolver>,
|
||||
}
|
||||
|
||||
pub(crate) enum PreviousHeatmap {
|
||||
@@ -587,6 +592,28 @@ pub(crate) enum PageReconstructError {
|
||||
MissingKey(Box<MissingKeyError>),
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
PageReconstructError::Other(_) => false,
|
||||
PageReconstructError::AncestorLsnTimeout(e) => e.is_cancel(),
|
||||
PageReconstructError::Cancelled => true,
|
||||
PageReconstructError::WalRedo(_) => false,
|
||||
PageReconstructError::MissingKey(_) => false,
|
||||
}
|
||||
}
|
||||
#[allow(dead_code)] // we use the is_cancel + into_anyhow pattern in quite a few places, this one will follow soon enough
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
PageReconstructError::Other(e) => e,
|
||||
PageReconstructError::AncestorLsnTimeout(e) => e.into_anyhow(),
|
||||
PageReconstructError::Cancelled => anyhow::Error::new(self),
|
||||
PageReconstructError::WalRedo(e) => e,
|
||||
PageReconstructError::MissingKey(_) => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for PageReconstructError {
|
||||
fn from(value: anyhow::Error) -> Self {
|
||||
// with walingest.rs many PageReconstructError are wrapped in as anyhow::Error
|
||||
@@ -740,17 +767,6 @@ impl std::fmt::Display for MissingKeyError {
|
||||
}
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
/// Returns true if this error indicates a tenant/timeline shutdown alike situation
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use PageReconstructError::*;
|
||||
match self {
|
||||
Cancelled => true,
|
||||
Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum CreateImageLayersError {
|
||||
#[error("timeline shutting down")]
|
||||
@@ -953,13 +969,35 @@ pub enum WaitLsnError {
|
||||
Timeout(String),
|
||||
}
|
||||
|
||||
impl WaitLsnError {
|
||||
pub(crate) fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
WaitLsnError::Shutdown => true,
|
||||
WaitLsnError::BadState(timeline_state) => match timeline_state {
|
||||
TimelineState::Loading => false,
|
||||
TimelineState::Active => false,
|
||||
TimelineState::Stopping => true,
|
||||
TimelineState::Broken { .. } => false,
|
||||
},
|
||||
WaitLsnError::Timeout(_) => false,
|
||||
}
|
||||
}
|
||||
pub(crate) fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
WaitLsnError::Shutdown => anyhow::Error::new(self),
|
||||
WaitLsnError::BadState(_) => anyhow::Error::new(self),
|
||||
WaitLsnError::Timeout(_) => anyhow::Error::new(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<WaitLsnError> for tonic::Status {
|
||||
fn from(err: WaitLsnError) -> Self {
|
||||
use tonic::Code;
|
||||
let code = match &err {
|
||||
WaitLsnError::Timeout(_) => Code::Internal,
|
||||
WaitLsnError::BadState(_) => Code::Internal,
|
||||
WaitLsnError::Shutdown => Code::Unavailable,
|
||||
let code = if err.is_cancel() {
|
||||
Code::Unavailable
|
||||
} else {
|
||||
Code::Internal
|
||||
};
|
||||
tonic::Status::new(code, err.to_string())
|
||||
}
|
||||
@@ -971,7 +1009,7 @@ impl From<WaitLsnError> for tonic::Status {
|
||||
impl From<CreateImageLayersError> for CompactionError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
|
||||
CreateImageLayersError::Cancelled => CompactionError::new_cancelled(),
|
||||
CreateImageLayersError::Other(e) => {
|
||||
CompactionError::Other(e.context("create image layers"))
|
||||
}
|
||||
@@ -1086,6 +1124,26 @@ enum ImageLayerCreationOutcome {
|
||||
Skip,
|
||||
}
|
||||
|
||||
enum RepartitionError {
|
||||
Other(anyhow::Error),
|
||||
CollectKeyspace(CollectKeySpaceError),
|
||||
}
|
||||
|
||||
impl RepartitionError {
|
||||
fn is_cancel(&self) -> bool {
|
||||
match self {
|
||||
RepartitionError::Other(_) => false,
|
||||
RepartitionError::CollectKeyspace(e) => e.is_cancel(),
|
||||
}
|
||||
}
|
||||
fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
RepartitionError::Other(e) => e,
|
||||
RepartitionError::CollectKeyspace(e) => e.into_anyhow(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -1772,30 +1830,31 @@ impl Timeline {
|
||||
existing_lease.clone()
|
||||
}
|
||||
Entry::Vacant(vacant) => {
|
||||
// Reject already GC-ed LSN if we are in AttachedSingle and
|
||||
// not blocked by the lsn lease deadline.
|
||||
// Never allow a lease to be requested for an LSN below the applied GC cutoff. The data could have been deleted.
|
||||
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}",
|
||||
lsn,
|
||||
*latest_gc_cutoff_lsn
|
||||
);
|
||||
}
|
||||
|
||||
// We allow create lease for those below the planned gc cutoff if we are still within the grace period
|
||||
// of GC blocking.
|
||||
let validate = {
|
||||
let conf = self.tenant_conf.load();
|
||||
conf.location.attach_mode == AttachmentMode::Single
|
||||
&& !conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
!conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
};
|
||||
|
||||
if init || validate {
|
||||
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}",
|
||||
lsn,
|
||||
*latest_gc_cutoff_lsn
|
||||
);
|
||||
}
|
||||
if lsn < planned_cutoff {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}",
|
||||
lsn,
|
||||
planned_cutoff
|
||||
);
|
||||
}
|
||||
// Do not allow initial lease creation to be below the planned gc cutoff. The client (compute_ctl) determines
|
||||
// whether it is a initial lease creation or a renewal.
|
||||
if (init || validate) && lsn < planned_cutoff {
|
||||
bail!(
|
||||
"tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}",
|
||||
lsn,
|
||||
planned_cutoff
|
||||
);
|
||||
}
|
||||
|
||||
let dt: DateTime<Utc> = valid_until.into();
|
||||
@@ -2065,22 +2124,7 @@ impl Timeline {
|
||||
match &result {
|
||||
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
|
||||
Err(e) if e.is_cancel() => {}
|
||||
Err(CompactionError::ShuttingDown) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::AlreadyRunning(_)) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::Other(_)) => {
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
Err(CompactionError::CollectKeySpaceError(_)) => {
|
||||
// Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
// Don't change the current value on offload failure or shutdown. We don't want to
|
||||
// abruptly stall nor resume L0 flushes in these cases.
|
||||
Err(CompactionError::Offload(_)) => {}
|
||||
Err(_) => self.compaction_failed.store(true, AtomicOrdering::Relaxed),
|
||||
};
|
||||
|
||||
result
|
||||
@@ -2809,6 +2853,18 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
// HADRON
|
||||
fn get_image_creation_timeout(&self) -> Option<Duration> {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.image_layer_force_creation_period
|
||||
.or(self
|
||||
.conf
|
||||
.default_tenant_conf
|
||||
.image_layer_force_creation_period)
|
||||
}
|
||||
|
||||
fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
|
||||
let tenant_conf = &self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -3078,7 +3134,9 @@ impl Timeline {
|
||||
repartition_threshold: 0,
|
||||
last_image_layer_creation_check_at: AtomicLsn::new(0),
|
||||
last_image_layer_creation_check_instant: Mutex::new(None),
|
||||
|
||||
// HADRON
|
||||
force_image_creation_lsn: AtomicLsn::new(0),
|
||||
force_image_creation_lsn_computed_at: std::sync::Mutex::new(None),
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_latest_cache: RwLock::new(HashMap::new()),
|
||||
rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
|
||||
@@ -3129,7 +3187,7 @@ impl Timeline {
|
||||
|
||||
basebackup_cache: resources.basebackup_cache,
|
||||
|
||||
feature_resolver: resources.feature_resolver,
|
||||
feature_resolver: resources.feature_resolver.clone(),
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
@@ -4970,7 +5028,7 @@ impl Timeline {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
@@ -4999,6 +5057,7 @@ impl Timeline {
|
||||
.create_image_layers(
|
||||
&partitions,
|
||||
self.initdb_lsn,
|
||||
None,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
LastImageLayerCreationStatus::Initial,
|
||||
@@ -5220,18 +5279,18 @@ impl Timeline {
|
||||
partition_size: u64,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> {
|
||||
let Ok(mut guard) = self.partitioning.try_write_guard() else {
|
||||
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
|
||||
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
|
||||
// and hence before the compaction task starts.
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
return Err(RepartitionError::Other(anyhow!(
|
||||
"repartition() called concurrently"
|
||||
)));
|
||||
};
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
|
||||
if lsn < *partition_lsn {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
return Err(RepartitionError::Other(anyhow!(
|
||||
"repartition() called with LSN going backwards, this should not happen"
|
||||
)));
|
||||
}
|
||||
@@ -5252,7 +5311,10 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
|
||||
let (dense_ks, sparse_ks) = self
|
||||
.collect_keyspace(lsn, ctx)
|
||||
.await
|
||||
.map_err(RepartitionError::CollectKeyspace)?;
|
||||
let dense_partitioning = dense_ks.partition(
|
||||
&self.shard_identity,
|
||||
partition_size,
|
||||
@@ -5267,14 +5329,19 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Is it time to create a new image layer for the given partition? True if we want to generate.
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
async fn time_for_new_image_layer(
|
||||
&self,
|
||||
partition: &KeySpace,
|
||||
lsn: Lsn,
|
||||
force_image_creation_lsn: Option<Lsn>,
|
||||
) -> bool {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let Ok(layers) = guard.layer_map() else {
|
||||
return false;
|
||||
};
|
||||
|
||||
let mut min_image_lsn: Lsn = Lsn::MAX;
|
||||
let mut max_deltas = 0;
|
||||
for part_range in &partition.ranges {
|
||||
let image_coverage = layers.image_coverage(part_range, lsn);
|
||||
@@ -5309,9 +5376,22 @@ impl Timeline {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
min_image_lsn = min(min_image_lsn, img_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
// HADRON
|
||||
if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 {
|
||||
info!(
|
||||
"forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}",
|
||||
partition.ranges[0].start,
|
||||
partition.ranges[0].end,
|
||||
min_image_lsn,
|
||||
force_image_creation_lsn.unwrap()
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
debug!(
|
||||
max_deltas,
|
||||
"none of the partitioned ranges had >= {threshold} deltas"
|
||||
@@ -5537,7 +5617,7 @@ impl Timeline {
|
||||
/// suffer from the lack of image layers
|
||||
/// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval
|
||||
fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
|
||||
const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
|
||||
let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;
|
||||
|
||||
let last_checks_at = self.last_image_layer_creation_check_at.load();
|
||||
let distance = lsn
|
||||
@@ -5551,12 +5631,12 @@ impl Timeline {
|
||||
let mut time_based_decision = false;
|
||||
let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
|
||||
if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
|
||||
let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
|
||||
{
|
||||
self.get_checkpoint_timeout()
|
||||
} else {
|
||||
Duration::from_secs(3600 * 48)
|
||||
};
|
||||
let check_required_after =
|
||||
if Some(Into::<u64>::into(&logical_size)) >= large_timeline_threshold {
|
||||
self.get_checkpoint_timeout()
|
||||
} else {
|
||||
Duration::from_secs(3600 * 48)
|
||||
};
|
||||
|
||||
time_based_decision = match *last_check_instant {
|
||||
Some(last_check) => {
|
||||
@@ -5584,10 +5664,12 @@ impl Timeline {
|
||||
/// true = we have generate all image layers, false = we preempt the process for L0 compaction.
|
||||
///
|
||||
/// `partition_mode` is only for logging purpose and is not used anywhere in this function.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn create_image_layers(
|
||||
self: &Arc<Timeline>,
|
||||
partitioning: &KeyPartitioning,
|
||||
lsn: Lsn,
|
||||
force_image_creation_lsn: Option<Lsn>,
|
||||
mode: ImageLayerCreationMode,
|
||||
ctx: &RequestContext,
|
||||
last_status: LastImageLayerCreationStatus,
|
||||
@@ -5691,7 +5773,11 @@ impl Timeline {
|
||||
} else if let ImageLayerCreationMode::Try = mode {
|
||||
// check_for_image_layers = false -> skip
|
||||
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
|
||||
if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
|
||||
if !check_for_image_layers
|
||||
|| !self
|
||||
.time_for_new_image_layer(partition, lsn, force_image_creation_lsn)
|
||||
.await
|
||||
{
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
@@ -6012,57 +6098,88 @@ impl Drop for Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Top-level failure to compact.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum CompactionError {
|
||||
#[error("The timeline or pageserver is shutting down")]
|
||||
ShuttingDown,
|
||||
/// Compaction tried to offload a timeline and failed
|
||||
#[error("Failed to offload timeline: {0}")]
|
||||
Offload(OffloadError),
|
||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
||||
#[error("Failed to collect keyspace: {0}")]
|
||||
CollectKeySpaceError(#[from] CollectKeySpaceError),
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
#[error("Compaction already running: {0}")]
|
||||
AlreadyRunning(&'static str),
|
||||
}
|
||||
pub(crate) use compaction_error::CompactionError;
|
||||
/// In a private mod to enforce that [`CompactionError::is_cancel`] is used
|
||||
/// instead of `match`ing on [`CompactionError::ShuttingDown`].
|
||||
mod compaction_error {
|
||||
use utils::sync::gate::GateError;
|
||||
|
||||
impl CompactionError {
|
||||
/// Errors that can be ignored, i.e., cancel and shutdown.
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::ShuttingDown
|
||||
| Self::AlreadyRunning(_)
|
||||
| Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
|
||||
| Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::Cancelled
|
||||
))
|
||||
| Self::Offload(OffloadError::Cancelled)
|
||||
)
|
||||
use crate::{
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
tenant::{PageReconstructError, blob_io::WriteBlobError, upload_queue::NotInitialized},
|
||||
virtual_file::owned_buffers_io::write::FlushTaskError,
|
||||
};
|
||||
|
||||
/// Top-level failure to compact. Use [`Self::is_cancel`].
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum CompactionError {
|
||||
/// Use [`Self::is_cancel`] instead of checking for this variant.
|
||||
#[error("The timeline or pageserver is shutting down")]
|
||||
#[allow(private_interfaces)]
|
||||
ShuttingDown(ForbidMatching), // private ForbidMatching enforces use of [`Self::is_cancel`].
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
/// Critical errors that indicate data corruption.
|
||||
pub fn is_critical(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::CollectKeySpaceError(
|
||||
CollectKeySpaceError::Decode(_)
|
||||
| CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
#[derive(Debug)]
|
||||
struct ForbidMatching;
|
||||
|
||||
impl From<OffloadError> for CompactionError {
|
||||
fn from(e: OffloadError) -> Self {
|
||||
match e {
|
||||
OffloadError::Cancelled => Self::ShuttingDown,
|
||||
_ => Self::Offload(e),
|
||||
impl CompactionError {
|
||||
pub fn new_cancelled() -> Self {
|
||||
Self::ShuttingDown(ForbidMatching)
|
||||
}
|
||||
/// Errors that can be ignored, i.e., cancel and shutdown.
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
let other = match self {
|
||||
CompactionError::ShuttingDown(_) => return true,
|
||||
CompactionError::Other(other) => other,
|
||||
};
|
||||
|
||||
// The write path of compaction in particular often lacks differentiated
|
||||
// handling errors stemming from cancellation from other errors.
|
||||
// So, if requested, we also check the ::Other variant by downcasting.
|
||||
// The list below has been found empirically from flaky tests and production logs.
|
||||
// The process is simple: on ::Other(), compaction will print the enclosed
|
||||
// anyhow::Error in debug mode, i.e., with backtrace. That backtrace contains the
|
||||
// line where the write path / compaction code does undifferentiated error handling
|
||||
// from a non-anyhow type to an anyhow type. Add the type to the list of downcasts
|
||||
// below, following the same is_cancel() pattern.
|
||||
|
||||
let root_cause = other.root_cause();
|
||||
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let buffered_writer_flush_task_canelled = root_cause
|
||||
.downcast_ref::<FlushTaskError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let write_blob_cancelled = root_cause
|
||||
.downcast_ref::<WriteBlobError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
let gate_closed = root_cause
|
||||
.downcast_ref::<GateError>()
|
||||
.is_some_and(|e| e.is_cancel());
|
||||
upload_queue
|
||||
|| timeline
|
||||
|| buffered_writer_flush_task_canelled
|
||||
|| write_blob_cancelled
|
||||
|| gate_closed
|
||||
}
|
||||
pub fn into_anyhow(self) -> anyhow::Error {
|
||||
match self {
|
||||
CompactionError::ShuttingDown(ForbidMatching) => anyhow::Error::new(self),
|
||||
CompactionError::Other(e) => e,
|
||||
}
|
||||
}
|
||||
pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
|
||||
if err.is_cancel() {
|
||||
Self::new_cancelled()
|
||||
} else {
|
||||
Self::Other(err.into_anyhow())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6074,7 +6191,7 @@ impl From<super::upload_queue::NotInitialized> for CompactionError {
|
||||
CompactionError::Other(anyhow::anyhow!(value))
|
||||
}
|
||||
super::upload_queue::NotInitialized::ShuttingDown
|
||||
| super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown,
|
||||
| super::upload_queue::NotInitialized::Stopped => CompactionError::new_cancelled(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6084,7 +6201,7 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {
|
||||
match e {
|
||||
super::storage_layer::layer::DownloadError::TimelineShutdown
|
||||
| super::storage_layer::layer::DownloadError::DownloadCancelled => {
|
||||
CompactionError::ShuttingDown
|
||||
CompactionError::new_cancelled()
|
||||
}
|
||||
super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
|
||||
| super::storage_layer::layer::DownloadError::DownloadRequired
|
||||
@@ -6103,14 +6220,14 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {
|
||||
|
||||
impl From<layer_manager::Shutdown> for CompactionError {
|
||||
fn from(_: layer_manager::Shutdown) -> Self {
|
||||
CompactionError::ShuttingDown
|
||||
CompactionError::new_cancelled()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<super::storage_layer::errors::PutError> for CompactionError {
|
||||
fn from(e: super::storage_layer::errors::PutError) -> Self {
|
||||
if e.is_cancel() {
|
||||
CompactionError::ShuttingDown
|
||||
CompactionError::new_cancelled()
|
||||
} else {
|
||||
CompactionError::Other(e.into_anyhow())
|
||||
}
|
||||
@@ -6209,7 +6326,7 @@ impl Timeline {
|
||||
let mut guard = tokio::select! {
|
||||
guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -6765,7 +6882,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Reconstruct a value, using the given base image and WAL records in 'data'.
|
||||
async fn reconstruct_value(
|
||||
pub(crate) async fn reconstruct_value(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
|
||||
@@ -4,10 +4,11 @@
|
||||
//!
|
||||
//! The old legacy algorithm is implemented directly in `timeline.rs`.
|
||||
|
||||
use std::cmp::min;
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use super::layer_manager::LayerManagerLockHolder;
|
||||
use super::{
|
||||
@@ -16,7 +17,8 @@ use super::{
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::DeltaEntry;
|
||||
use crate::pgdatadir_mapping::CollectKeySpaceError;
|
||||
use crate::tenant::timeline::{DeltaEntry, RepartitionError};
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use anyhow::{Context, anyhow};
|
||||
use bytes::Bytes;
|
||||
@@ -32,6 +34,7 @@ use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
|
||||
use pageserver_compaction::interface::*;
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use serde::Serialize;
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -44,6 +47,7 @@ use wal_decoder::models::value::Value;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::gc_block::GcBlock;
|
||||
@@ -64,7 +68,7 @@ use crate::tenant::timeline::{
|
||||
DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
|
||||
ResidentLayer, drop_layer_manager_rlock,
|
||||
};
|
||||
use crate::tenant::{DeltaLayer, MaybeOffloaded};
|
||||
use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError};
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
|
||||
@@ -571,7 +575,7 @@ impl GcCompactionQueue {
|
||||
}
|
||||
match res {
|
||||
Ok(res) => Ok(res),
|
||||
Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
|
||||
Err(e) if e.is_cancel() => Err(e),
|
||||
Err(_) => {
|
||||
// There are some cases where traditional gc might collect some layer
|
||||
// files causing gc-compaction cannot read the full history of the key.
|
||||
@@ -591,9 +595,9 @@ impl GcCompactionQueue {
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else {
|
||||
return Err(CompactionError::AlreadyRunning(
|
||||
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.",
|
||||
));
|
||||
return Err(CompactionError::Other(anyhow::anyhow!(
|
||||
"cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue."
|
||||
)));
|
||||
};
|
||||
let has_pending_tasks;
|
||||
let mut yield_for_l0 = false;
|
||||
@@ -1259,13 +1263,19 @@ impl Timeline {
|
||||
// Is the timeline being deleted?
|
||||
if self.is_stopping() {
|
||||
trace!("Dropping out of compaction on timeline shutdown");
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
|
||||
// Define partitioning schema if needed
|
||||
|
||||
// HADRON
|
||||
let force_image_creation_lsn = self
|
||||
.get_or_compute_force_image_creation_lsn(cancel, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
// 1. L0 Compact
|
||||
let l0_outcome = {
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
@@ -1273,6 +1283,7 @@ impl Timeline {
|
||||
.compact_level0(
|
||||
target_file_size,
|
||||
options.flags.contains(CompactFlags::ForceL0Compaction),
|
||||
force_image_creation_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -1375,6 +1386,7 @@ impl Timeline {
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
lsn,
|
||||
force_image_creation_lsn,
|
||||
mode,
|
||||
&image_ctx,
|
||||
self.last_image_layer_creation_status
|
||||
@@ -1417,22 +1429,33 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Suppress errors when cancelled.
|
||||
Err(_) if self.cancel.is_cancelled() => {}
|
||||
//
|
||||
// Log other errors but continue. Failure to repartition is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
|
||||
// key-value store, ignoring the datadir layout. Log the error but continue.
|
||||
//
|
||||
// TODO:
|
||||
// 1. shouldn't we return early here if we observe cancellation
|
||||
// 2. Experiment: can we stop checking self.cancel here?
|
||||
Err(_) if self.cancel.is_cancelled() => {} // TODO: try how we fare removing this branch
|
||||
Err(err) if err.is_cancel() => {}
|
||||
|
||||
// Alert on critical errors that indicate data corruption.
|
||||
Err(err) if err.is_critical() => {
|
||||
Err(RepartitionError::CollectKeyspace(
|
||||
e @ CollectKeySpaceError::Decode(_)
|
||||
| e @ CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
|
||||
),
|
||||
)) => {
|
||||
// Alert on critical errors that indicate data corruption.
|
||||
critical_timeline!(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
"could not compact, repartitioning keyspace failed: {err:?}"
|
||||
"could not compact, repartitioning keyspace failed: {e:?}"
|
||||
);
|
||||
}
|
||||
|
||||
// Log other errors. No partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
|
||||
// key-value store, ignoring the datadir layout. Log the error but continue.
|
||||
Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"),
|
||||
Err(e) => error!(
|
||||
"could not compact, repartitioning keyspace failed: {:?}",
|
||||
e.into_anyhow()
|
||||
),
|
||||
};
|
||||
|
||||
let partition_count = self.partitioning.read().0.0.parts.len();
|
||||
@@ -1460,6 +1483,63 @@ impl Timeline {
|
||||
Ok(CompactionOutcome::Done)
|
||||
}
|
||||
|
||||
/* BEGIN_HADRON */
|
||||
// Get the force image creation LSN. Compute it if the last computed LSN is too old.
|
||||
async fn get_or_compute_force_image_creation_lsn(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<Lsn>> {
|
||||
const FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes
|
||||
let image_layer_force_creation_period = self.get_image_creation_timeout();
|
||||
if image_layer_force_creation_period.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let image_layer_force_creation_period = image_layer_force_creation_period.unwrap();
|
||||
let force_image_creation_lsn_computed_at =
|
||||
*self.force_image_creation_lsn_computed_at.lock().unwrap();
|
||||
if force_image_creation_lsn_computed_at.is_none()
|
||||
|| force_image_creation_lsn_computed_at.unwrap().elapsed()
|
||||
> FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL
|
||||
{
|
||||
let now: SystemTime = SystemTime::now();
|
||||
let timestamp = now
|
||||
.checked_sub(image_layer_force_creation_period)
|
||||
.ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"image creation timeout is too large: {image_layer_force_creation_period:?}"
|
||||
)
|
||||
})?;
|
||||
let timestamp = to_pg_timestamp(timestamp);
|
||||
let force_image_creation_lsn = match self
|
||||
.find_lsn_for_timestamp(timestamp, cancel, ctx)
|
||||
.await?
|
||||
{
|
||||
LsnForTimestamp::Present(lsn) | LsnForTimestamp::Future(lsn) => lsn,
|
||||
_ => {
|
||||
let gc_lsn = *self.get_applied_gc_cutoff_lsn();
|
||||
tracing::info!(
|
||||
"no LSN found for timestamp {timestamp:?}, using latest GC cutoff LSN {}",
|
||||
gc_lsn
|
||||
);
|
||||
gc_lsn
|
||||
}
|
||||
};
|
||||
self.force_image_creation_lsn
|
||||
.store(force_image_creation_lsn);
|
||||
*self.force_image_creation_lsn_computed_at.lock().unwrap() = Some(Instant::now());
|
||||
tracing::info!(
|
||||
"computed force image creation LSN: {}",
|
||||
force_image_creation_lsn
|
||||
);
|
||||
Ok(Some(force_image_creation_lsn))
|
||||
} else {
|
||||
Ok(Some(self.force_image_creation_lsn.load()))
|
||||
}
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
/// Check for layers that are elegible to be rewritten:
|
||||
/// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
|
||||
/// we don't indefinitely retain keys in this shard that aren't needed.
|
||||
@@ -1612,7 +1692,7 @@ impl Timeline {
|
||||
|
||||
for (i, layer) in layers_to_rewrite.into_iter().enumerate() {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
|
||||
info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total);
|
||||
@@ -1710,7 +1790,7 @@ impl Timeline {
|
||||
Ok(()) => {},
|
||||
Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
|
||||
Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
},
|
||||
// Don't wait if there's L0 compaction to do. We don't need to update the outcome
|
||||
@@ -1789,6 +1869,7 @@ impl Timeline {
|
||||
self: &Arc<Self>,
|
||||
target_file_size: u64,
|
||||
force_compaction_ignore_threshold: bool,
|
||||
force_compaction_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let CompactLevel0Phase1Result {
|
||||
@@ -1809,6 +1890,7 @@ impl Timeline {
|
||||
stats,
|
||||
target_file_size,
|
||||
force_compaction_ignore_threshold,
|
||||
force_compaction_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(phase1_span)
|
||||
@@ -1831,6 +1913,7 @@ impl Timeline {
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
target_file_size: u64,
|
||||
force_compaction_ignore_threshold: bool,
|
||||
force_compaction_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
let begin = tokio::time::Instant::now();
|
||||
@@ -1860,11 +1943,28 @@ impl Timeline {
|
||||
return Ok(CompactLevel0Phase1Result::default());
|
||||
}
|
||||
} else {
|
||||
debug!(
|
||||
level0_deltas = level0_deltas.len(),
|
||||
threshold, "too few deltas to compact"
|
||||
);
|
||||
return Ok(CompactLevel0Phase1Result::default());
|
||||
// HADRON
|
||||
let min_lsn = level0_deltas
|
||||
.iter()
|
||||
.map(|a| a.get_lsn_range().start)
|
||||
.reduce(min);
|
||||
if force_compaction_lsn.is_some()
|
||||
&& min_lsn.is_some()
|
||||
&& min_lsn.unwrap() < force_compaction_lsn.unwrap()
|
||||
{
|
||||
info!(
|
||||
"forcing L0 compaction of {} L0 deltas. Min lsn: {}, force compaction lsn: {}",
|
||||
level0_deltas.len(),
|
||||
min_lsn.unwrap(),
|
||||
force_compaction_lsn.unwrap()
|
||||
);
|
||||
} else {
|
||||
debug!(
|
||||
level0_deltas = level0_deltas.len(),
|
||||
threshold, "too few deltas to compact"
|
||||
);
|
||||
return Ok(CompactLevel0Phase1Result::default());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1973,7 +2073,7 @@ impl Timeline {
|
||||
let mut all_keys = Vec::new();
|
||||
for l in deltas_to_compact.iter() {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
|
||||
let keys = delta
|
||||
@@ -2066,7 +2166,7 @@ impl Timeline {
|
||||
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
|
||||
stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
|
||||
@@ -2174,7 +2274,7 @@ impl Timeline {
|
||||
// avoid hitting the cancellation token on every key. in benches, we end up
|
||||
// shuffling an order of million keys per layer, this means we'll check it
|
||||
// around tens of times per layer.
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
|
||||
let same_key = prev_key == Some(key);
|
||||
@@ -2259,7 +2359,7 @@ impl Timeline {
|
||||
if writer.is_none() {
|
||||
if self.cancel.is_cancelled() {
|
||||
// to be somewhat responsive to cancellation, check for each new layer
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
// Create writer if not initiaized yet
|
||||
writer = Some(
|
||||
@@ -2515,10 +2615,13 @@ impl Timeline {
|
||||
// Is the timeline being deleted?
|
||||
if self.is_stopping() {
|
||||
trace!("Dropping out of compaction on timeline shutdown");
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
|
||||
let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
|
||||
let (dense_ks, _sparse_ks) = self
|
||||
.collect_keyspace(end_lsn, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::from_collect_keyspace)?;
|
||||
// TODO(chi): ignore sparse_keyspace for now, compact it in the future.
|
||||
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
|
||||
|
||||
@@ -3174,7 +3277,7 @@ impl Timeline {
|
||||
let gc_lock = async {
|
||||
tokio::select! {
|
||||
guard = self.gc_lock.lock() => Ok(guard),
|
||||
_ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
|
||||
_ = cancel.cancelled() => Err(CompactionError::new_cancelled()),
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3447,7 +3550,7 @@ impl Timeline {
|
||||
}
|
||||
total_layer_size += layer.layer_desc().file_size;
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
let should_yield = yield_for_l0
|
||||
&& self
|
||||
@@ -3594,7 +3697,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
return Err(CompactionError::new_cancelled());
|
||||
}
|
||||
|
||||
let should_yield = yield_for_l0
|
||||
|
||||
@@ -212,8 +212,12 @@
|
||||
//! to the parent shard during a shard split. Eventually, the shard split task will
|
||||
//! shut down the parent => case (1).
|
||||
|
||||
use std::collections::{HashMap, hash_map};
|
||||
use std::sync::{Arc, Mutex, Weak};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::hash_map;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::Weak;
|
||||
use std::time::Duration;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use tracing::{instrument, trace};
|
||||
@@ -333,6 +337,44 @@ enum RoutingResult<T: Types> {
|
||||
}
|
||||
|
||||
impl<T: Types> Cache<T> {
|
||||
/* BEGIN_HADRON */
|
||||
/// A wrapper of do_get to resolve the tenant shard for a get page request.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(crate) async fn get(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
const GET_MAX_RETRIES: usize = 10;
|
||||
const RETRY_BACKOFF: Duration = Duration::from_millis(100);
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
attempt += 1;
|
||||
match self
|
||||
.do_get(timeline_id, shard_selector, tenant_manager)
|
||||
.await
|
||||
{
|
||||
Ok(handle) => return Ok(handle),
|
||||
Err(e) => {
|
||||
// Retry on tenant manager error to handle tenant split more gracefully
|
||||
if attempt < GET_MAX_RETRIES {
|
||||
tracing::warn!(
|
||||
"Fail to resolve tenant shard in attempt {}: {:?}. Retrying...",
|
||||
attempt,
|
||||
e
|
||||
);
|
||||
tokio::time::sleep(RETRY_BACKOFF).await;
|
||||
continue;
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* END_HADRON */
|
||||
|
||||
/// See module-level comment for details.
|
||||
///
|
||||
/// Does NOT check for the shutdown state of [`Types::Timeline`].
|
||||
@@ -341,7 +383,7 @@ impl<T: Types> Cache<T> {
|
||||
/// and if so, return an error that causes the page service to
|
||||
/// close the connection.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(crate) async fn get(
|
||||
async fn do_get(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
@@ -879,6 +921,7 @@ mod tests {
|
||||
.await
|
||||
.err()
|
||||
.expect("documented behavior: can't get new handle after shutdown");
|
||||
|
||||
assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
|
||||
|
||||
cache
|
||||
|
||||
@@ -17,8 +17,6 @@ pub(crate) enum OffloadError {
|
||||
Cancelled,
|
||||
#[error("Timeline is not archived")]
|
||||
NotArchived,
|
||||
#[error(transparent)]
|
||||
RemoteStorage(anyhow::Error),
|
||||
#[error("Offload or deletion already in progress")]
|
||||
AlreadyInProgress,
|
||||
#[error("Unexpected offload error: {0}")]
|
||||
@@ -29,7 +27,7 @@ impl From<TenantManifestError> for OffloadError {
|
||||
fn from(e: TenantManifestError) -> Self {
|
||||
match e {
|
||||
TenantManifestError::Cancelled => Self::Cancelled,
|
||||
TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
|
||||
TenantManifestError::RemoteStorage(e) => Self::Other(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,12 +182,19 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
},
|
||||
|
||||
// If we've not received any updates from the broker from a while, are waiting for WAL
|
||||
// and have no safekeeper connection or connection candidates, then it might be that
|
||||
// the broker subscription is wedged. Drop the currrent subscription and re-subscribe
|
||||
// with the goal of unblocking it.
|
||||
_ = broker_reset_interval.tick() => {
|
||||
if wait_lsn_status.borrow().is_some() {
|
||||
tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...")
|
||||
}
|
||||
let awaiting_lsn = wait_lsn_status.borrow().is_some();
|
||||
let no_candidates = connection_manager_state.wal_stream_candidates.is_empty();
|
||||
let no_connection = connection_manager_state.wal_connection.is_none();
|
||||
|
||||
broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
if awaiting_lsn && no_candidates && no_connection {
|
||||
tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
|
||||
broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
}
|
||||
},
|
||||
|
||||
new_event = async {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user