mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 17:10:38 +00:00
Merge remote-tracking branch 'origin/problame/batching-sidecar-task' into problame/batching-metrics-improvements
This commit is contained in:
@@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> {
|
||||
// after setting up logging, log the effective IO engine choice and read path implementations
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
|
||||
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
|
||||
|
||||
// The tenants directory contains all the pageserver local disk state.
|
||||
// Create if not exists and make sure all the contents are durable before proceeding.
|
||||
|
||||
@@ -14,6 +14,7 @@ use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use std::env;
|
||||
use storage_broker::Uri;
|
||||
use utils::logging::SecretString;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
use reqwest::Url;
|
||||
@@ -144,6 +145,10 @@ pub struct PageServerConf {
|
||||
/// JWT token for use with the control plane API.
|
||||
pub control_plane_api_token: Option<SecretString>,
|
||||
|
||||
pub import_pgdata_upcall_api: Option<Url>,
|
||||
pub import_pgdata_upcall_api_token: Option<SecretString>,
|
||||
pub import_pgdata_aws_endpoint_url: Option<Url>,
|
||||
|
||||
/// If true, pageserver will make best-effort to operate without a control plane: only
|
||||
/// for use in major incidents.
|
||||
pub control_plane_emergency_mode: bool,
|
||||
@@ -183,7 +188,9 @@ pub struct PageServerConf {
|
||||
/// Optionally disable disk syncs (unsafe!)
|
||||
pub no_sync: bool,
|
||||
|
||||
pub page_service_pipelining: Option<pageserver_api::config::PageServicePipeliningConfig>,
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
|
||||
pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -326,6 +333,9 @@ impl PageServerConf {
|
||||
control_plane_api,
|
||||
control_plane_api_token,
|
||||
control_plane_emergency_mode,
|
||||
import_pgdata_upcall_api,
|
||||
import_pgdata_upcall_api_token,
|
||||
import_pgdata_aws_endpoint_url,
|
||||
heatmap_upload_concurrency,
|
||||
secondary_download_concurrency,
|
||||
ingest_batch_size,
|
||||
@@ -340,6 +350,7 @@ impl PageServerConf {
|
||||
virtual_file_io_engine,
|
||||
tenant_config,
|
||||
no_sync,
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
} = config_toml;
|
||||
|
||||
@@ -380,6 +391,10 @@ impl PageServerConf {
|
||||
image_compression,
|
||||
timeline_offloading,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
import_pgdata_upcall_api,
|
||||
import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
|
||||
import_pgdata_aws_endpoint_url,
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
@@ -1144,18 +1144,24 @@ pub(crate) mod mock {
|
||||
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
|
||||
executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
|
||||
cancel: CancellationToken,
|
||||
executed: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
impl ConsumerState {
|
||||
async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize {
|
||||
let mut executed = 0;
|
||||
|
||||
async fn consume(&mut self, remote_storage: &GenericRemoteStorage) {
|
||||
info!("Executing all pending deletions");
|
||||
|
||||
// Transform all executor messages to generic frontend messages
|
||||
while let Ok(msg) = self.executor_rx.try_recv() {
|
||||
loop {
|
||||
use either::Either;
|
||||
let msg = tokio::select! {
|
||||
left = self.executor_rx.recv() => Either::Left(left),
|
||||
right = self.rx.recv() => Either::Right(right),
|
||||
};
|
||||
match msg {
|
||||
DeleterMessage::Delete(objects) => {
|
||||
Either::Left(None) => break,
|
||||
Either::Right(None) => break,
|
||||
Either::Left(Some(DeleterMessage::Delete(objects))) => {
|
||||
for path in objects {
|
||||
match remote_storage.delete(&path, &self.cancel).await {
|
||||
Ok(_) => {
|
||||
@@ -1165,18 +1171,13 @@ pub(crate) mod mock {
|
||||
error!("Failed to delete {path}, leaking object! ({e})");
|
||||
}
|
||||
}
|
||||
executed += 1;
|
||||
self.executed.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
DeleterMessage::Flush(flush_op) => {
|
||||
Either::Left(Some(DeleterMessage::Flush(flush_op))) => {
|
||||
flush_op.notify();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while let Ok(msg) = self.rx.try_recv() {
|
||||
match msg {
|
||||
ListWriterQueueMessage::Delete(op) => {
|
||||
Either::Right(Some(ListWriterQueueMessage::Delete(op))) => {
|
||||
let mut objects = op.objects;
|
||||
for (layer, meta) in op.layers {
|
||||
objects.push(remote_layer_path(
|
||||
@@ -1198,33 +1199,27 @@ pub(crate) mod mock {
|
||||
error!("Failed to delete {path}, leaking object! ({e})");
|
||||
}
|
||||
}
|
||||
executed += 1;
|
||||
self.executed.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
ListWriterQueueMessage::Flush(op) => {
|
||||
Either::Right(Some(ListWriterQueueMessage::Flush(op))) => {
|
||||
op.notify();
|
||||
}
|
||||
ListWriterQueueMessage::FlushExecute(op) => {
|
||||
Either::Right(Some(ListWriterQueueMessage::FlushExecute(op))) => {
|
||||
// We have already executed all prior deletions because mock does them inline
|
||||
op.notify();
|
||||
}
|
||||
ListWriterQueueMessage::Recover(_) => {
|
||||
Either::Right(Some(ListWriterQueueMessage::Recover(_))) => {
|
||||
// no-op in mock
|
||||
}
|
||||
}
|
||||
info!("All pending deletions have been executed");
|
||||
}
|
||||
|
||||
executed
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MockDeletionQueue {
|
||||
tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
|
||||
executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
executed: Arc<AtomicUsize>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
consumer: std::sync::Mutex<ConsumerState>,
|
||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||
}
|
||||
|
||||
@@ -1235,29 +1230,34 @@ pub(crate) mod mock {
|
||||
|
||||
let executed = Arc::new(AtomicUsize::new(0));
|
||||
|
||||
let mut consumer = ConsumerState {
|
||||
rx,
|
||||
executor_rx,
|
||||
cancel: CancellationToken::new(),
|
||||
executed: executed.clone(),
|
||||
};
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
consumer.consume(remote_storage).await;
|
||||
}
|
||||
});
|
||||
|
||||
Self {
|
||||
tx,
|
||||
executor_tx,
|
||||
executed,
|
||||
remote_storage,
|
||||
consumer: std::sync::Mutex::new(ConsumerState {
|
||||
rx,
|
||||
executor_rx,
|
||||
cancel: CancellationToken::new(),
|
||||
}),
|
||||
lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::await_holding_lock)]
|
||||
pub async fn pump(&self) {
|
||||
if let Some(remote_storage) = &self.remote_storage {
|
||||
// Permit holding mutex across await, because this is only ever
|
||||
// called once at a time in tests.
|
||||
let mut locked = self.consumer.lock().unwrap();
|
||||
let count = locked.consume(remote_storage).await;
|
||||
self.executed.fetch_add(count, Ordering::Relaxed);
|
||||
}
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
self.executor_tx
|
||||
.send(DeleterMessage::Flush(FlushOp { tx }))
|
||||
.await
|
||||
.expect("Failed to send flush message");
|
||||
rx.await.ok();
|
||||
}
|
||||
|
||||
pub(crate) fn new_client(&self) -> DeletionQueueClient {
|
||||
|
||||
@@ -15,6 +15,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::backoff;
|
||||
use utils::pausable_failpoint;
|
||||
|
||||
use crate::metrics;
|
||||
|
||||
@@ -90,6 +91,7 @@ impl Deleter {
|
||||
/// Block until everything in accumulator has been executed
|
||||
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
|
||||
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
|
||||
pausable_failpoint!("deletion-queue-before-execute-pause");
|
||||
match self.remote_delete().await {
|
||||
Ok(()) => {
|
||||
// Note: we assume that the remote storage layer returns Ok(()) if some
|
||||
|
||||
@@ -623,6 +623,8 @@ paths:
|
||||
existing_initdb_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
import_pgdata:
|
||||
$ref: "#/components/schemas/TimelineCreateRequestImportPgdata"
|
||||
responses:
|
||||
"201":
|
||||
description: Timeline was created, or already existed with matching parameters
|
||||
@@ -979,6 +981,34 @@ components:
|
||||
$ref: "#/components/schemas/TenantConfig"
|
||||
effective_config:
|
||||
$ref: "#/components/schemas/TenantConfig"
|
||||
TimelineCreateRequestImportPgdata:
|
||||
type: object
|
||||
required:
|
||||
- location
|
||||
- idempotency_key
|
||||
properties:
|
||||
idempotency_key:
|
||||
type: string
|
||||
location:
|
||||
$ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocation"
|
||||
TimelineCreateRequestImportPgdataLocation:
|
||||
type: object
|
||||
properties:
|
||||
AwsS3:
|
||||
$ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocationAwsS3"
|
||||
TimelineCreateRequestImportPgdataLocationAwsS3:
|
||||
type: object
|
||||
properties:
|
||||
region:
|
||||
type: string
|
||||
bucket:
|
||||
type: string
|
||||
key:
|
||||
type: string
|
||||
required:
|
||||
- region
|
||||
- bucket
|
||||
- key
|
||||
TimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -40,6 +40,7 @@ use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::TimelineArchivalConfigRequest;
|
||||
use pageserver_api::models::TimelineCreateRequestMode;
|
||||
use pageserver_api::models::TimelineCreateRequestModeImportPgdata;
|
||||
use pageserver_api::models::TimelinesInfoAndOffloaded;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
use pageserver_api::models::TopTenantShardsRequest;
|
||||
@@ -55,6 +56,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::profile_cpu_handler;
|
||||
use utils::http::endpoint::prometheus_metrics_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::request::must_parse_query_param;
|
||||
@@ -80,6 +82,7 @@ use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::import_pgdata;
|
||||
use crate::tenant::timeline::offload::offload_timeline;
|
||||
use crate::tenant::timeline::offload::OffloadError;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
@@ -125,7 +128,7 @@ pub struct State {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
allowlist_routes: &'static [&'static str],
|
||||
remote_storage: GenericRemoteStorage,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
@@ -146,10 +149,13 @@ impl State {
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
let allowlist_routes = &[
|
||||
"/v1/status",
|
||||
"/v1/doc",
|
||||
"/swagger.yml",
|
||||
"/metrics",
|
||||
"/profile/cpu",
|
||||
];
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_manager,
|
||||
@@ -576,6 +582,35 @@ async fn timeline_create_handler(
|
||||
ancestor_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
}),
|
||||
TimelineCreateRequestMode::ImportPgdata {
|
||||
import_pgdata:
|
||||
TimelineCreateRequestModeImportPgdata {
|
||||
location,
|
||||
idempotency_key,
|
||||
},
|
||||
} => tenant::CreateTimelineParams::ImportPgdata(tenant::CreateTimelineParamsImportPgdata {
|
||||
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new(
|
||||
idempotency_key.0,
|
||||
),
|
||||
new_timeline_id,
|
||||
location: {
|
||||
use import_pgdata::index_part_format::Location;
|
||||
use pageserver_api::models::ImportPgdataLocation;
|
||||
match location {
|
||||
#[cfg(feature = "testing")]
|
||||
ImportPgdataLocation::LocalFs { path } => Location::LocalFs { path },
|
||||
ImportPgdataLocation::AwsS3 {
|
||||
region,
|
||||
bucket,
|
||||
key,
|
||||
} => Location::AwsS3 {
|
||||
region,
|
||||
bucket,
|
||||
key,
|
||||
},
|
||||
}
|
||||
},
|
||||
}),
|
||||
};
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
||||
@@ -3148,7 +3183,7 @@ pub fn make_router(
|
||||
if auth.is_some() {
|
||||
router = router.middleware(auth_middleware(|request| {
|
||||
let state = get_state(request);
|
||||
if state.allowlist_routes.contains(request.uri()) {
|
||||
if state.allowlist_routes.contains(&request.uri().path()) {
|
||||
None
|
||||
} else {
|
||||
state.auth.as_deref()
|
||||
@@ -3167,6 +3202,7 @@ pub fn make_router(
|
||||
Ok(router
|
||||
.data(state)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
|
||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||
.put("/v1/failpoints", |r| {
|
||||
testing_api_handler("manage failpoints", r, failpoints_handler)
|
||||
|
||||
@@ -356,6 +356,25 @@ async fn timed<Fut: std::future::Future>(
|
||||
}
|
||||
}
|
||||
|
||||
/// Like [`timed`], but the warning timeout only starts after `cancel` has been cancelled.
|
||||
async fn timed_after_cancellation<Fut: std::future::Future>(
|
||||
fut: Fut,
|
||||
name: &str,
|
||||
warn_at: std::time::Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> <Fut as std::future::Future>::Output {
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
timed(fut, name, warn_at).await
|
||||
}
|
||||
ret = &mut fut => {
|
||||
ret
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod timed_tests {
|
||||
use super::timed;
|
||||
|
||||
@@ -3,7 +3,7 @@ use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
||||
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
|
||||
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
|
||||
Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
|
||||
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -457,6 +457,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
|
||||
register_gauge_vec!(
|
||||
"pageserver_flush_wait_upload_seconds",
|
||||
"Time spent waiting for preceding uploads during layer flush",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_last_record_lsn",
|
||||
@@ -653,6 +662,35 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_relsize_cache_entries",
|
||||
"Number of entries in the relation size cache",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_relsize_cache_misses",
|
||||
"Relation size cache misses",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_relsize_cache_misses_old",
|
||||
"Relation size cache misses where the lookup LSN is older than the last relation update"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) mod initial_logical_size {
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -2097,6 +2135,7 @@ pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
|
||||
pub(crate) clear_vm_bits_unknown: IntCounterVec,
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||
@@ -2125,6 +2164,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
|
||||
"Total number of zero gap blocks written on relation extends"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
clear_vm_bits_unknown: register_int_counter_vec!(
|
||||
"pageserver_wal_ingest_clear_vm_bits_unknown",
|
||||
"Number of ignored ClearVmBits operations due to unknown pages/relations",
|
||||
&["entity"],
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
@@ -2327,6 +2372,7 @@ pub(crate) struct TimelineMetrics {
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub flush_wait_upload_time_gauge: Gauge,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
pub logical_size_histo: StorageTimeMetrics,
|
||||
@@ -2370,6 +2416,9 @@ impl TimelineMetrics {
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let compact_time_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::Compact,
|
||||
&tenant_id,
|
||||
@@ -2507,6 +2556,7 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
flush_time_histo,
|
||||
flush_wait_upload_time_gauge,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
logical_size_histo,
|
||||
@@ -2554,6 +2604,14 @@ impl TimelineMetrics {
|
||||
self.resident_physical_size_gauge.get()
|
||||
}
|
||||
|
||||
pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
|
||||
self.flush_wait_upload_time_gauge.add(duration);
|
||||
crate::metrics::FLUSH_WAIT_UPLOAD_TIME
|
||||
.get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id])
|
||||
.unwrap()
|
||||
.add(duration);
|
||||
}
|
||||
|
||||
pub(crate) fn shutdown(&self) {
|
||||
let was_shutdown = self
|
||||
.shutdown
|
||||
@@ -2570,6 +2628,7 @@ impl TimelineMetrics {
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
{
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
|
||||
@@ -7,7 +7,10 @@ use bytes::Buf;
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::config::{PageServicePipeliningConfig, PageServiceProtocolPipeliningMode};
|
||||
use pageserver_api::config::{
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedExecutionStrategy,
|
||||
};
|
||||
use pageserver_api::models::{self, TenantState};
|
||||
use pageserver_api::models::{
|
||||
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
@@ -36,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::sync::spsc_fold;
|
||||
use utils::{
|
||||
auth::{Claims, Scope, SwappableJwtAuth},
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -44,7 +48,6 @@ use utils::{
|
||||
};
|
||||
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -62,6 +65,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::{basebackup, timed_after_cancellation};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
@@ -158,7 +162,7 @@ pub async fn libpq_listener_main(
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
listener: tokio::net::TcpListener,
|
||||
auth_type: AuthType,
|
||||
pipelining_config: Option<PageServicePipeliningConfig>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
) -> Connections {
|
||||
@@ -217,7 +221,7 @@ async fn page_service_conn_main(
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
pipelining_config: Option<PageServicePipeliningConfig>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> ConnectionHandlerResult {
|
||||
@@ -319,7 +323,7 @@ struct PageServerHandler {
|
||||
/// None only while pagestream protocol is being processed.
|
||||
timeline_handles: Option<TimelineHandles>,
|
||||
|
||||
pipelining_config: Option<PageServicePipeliningConfig>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
}
|
||||
|
||||
struct TimelineHandles {
|
||||
@@ -574,7 +578,7 @@ impl PageServerHandler {
|
||||
pub fn new(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
pipelining_config: Option<PageServicePipeliningConfig>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
@@ -620,7 +624,7 @@ impl PageServerHandler {
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
parent_span: Span,
|
||||
) -> Result<Option<Box<BatchedFeMessage>>, QueryError>
|
||||
) -> Result<Option<BatchedFeMessage>, QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
|
||||
{
|
||||
@@ -722,7 +726,7 @@ impl PageServerHandler {
|
||||
span,
|
||||
error: $error,
|
||||
};
|
||||
Ok(Some(Box::new(error)))
|
||||
Ok(Some(error))
|
||||
}};
|
||||
}
|
||||
|
||||
@@ -773,7 +777,7 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Some(Box::new(batched_msg)))
|
||||
Ok(Some(batched_msg))
|
||||
}
|
||||
|
||||
/// Post-condition: `batch` is Some()
|
||||
@@ -781,26 +785,25 @@ impl PageServerHandler {
|
||||
#[allow(clippy::boxed_local)]
|
||||
fn pagestream_do_batch(
|
||||
max_batch_size: NonZeroUsize,
|
||||
batch: &mut Option<Box<BatchedFeMessage>>,
|
||||
this_msg: Box<BatchedFeMessage>,
|
||||
) -> Option<Box<BatchedFeMessage>> {
|
||||
batch: &mut Result<BatchedFeMessage, QueryError>,
|
||||
this_msg: Result<BatchedFeMessage, QueryError>,
|
||||
) -> Result<(), Result<BatchedFeMessage, QueryError>> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
|
||||
match (batch.as_deref_mut(), *this_msg) {
|
||||
// nothing batched yet
|
||||
(None, this_msg) => {
|
||||
*batch = Some(Box::new(this_msg));
|
||||
None
|
||||
}
|
||||
let this_msg = match this_msg {
|
||||
Ok(this_msg) => this_msg,
|
||||
Err(e) => return Err(Err(e)),
|
||||
};
|
||||
|
||||
match (&mut *batch, this_msg) {
|
||||
// something batched already, let's see if we can add this message to the batch
|
||||
(
|
||||
Some(BatchedFeMessage::GetPage {
|
||||
Ok(BatchedFeMessage::GetPage {
|
||||
span: _,
|
||||
shard: accum_shard,
|
||||
pages: ref mut accum_pages,
|
||||
effective_request_lsn: accum_lsn,
|
||||
}),
|
||||
// would be nice to have box pattern here
|
||||
BatchedFeMessage::GetPage {
|
||||
span: _,
|
||||
shard: this_shard,
|
||||
@@ -833,12 +836,12 @@ impl PageServerHandler {
|
||||
{
|
||||
// ok to batch
|
||||
accum_pages.extend(this_pages);
|
||||
None
|
||||
Ok(())
|
||||
}
|
||||
// something batched already but this message is unbatchable
|
||||
(Some(_), this_msg) => {
|
||||
(_, this_msg) => {
|
||||
// by default, don't continue batching
|
||||
Some(Box::new(this_msg)) // TODO: avoid re-box
|
||||
Err(Ok(this_msg))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -848,6 +851,7 @@ impl PageServerHandler {
|
||||
&mut self,
|
||||
pgb_writer: &mut PostgresBackend<IO>,
|
||||
batch: BatchedFeMessage,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
@@ -984,7 +988,7 @@ impl PageServerHandler {
|
||||
}
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = self.cancel.cancelled() => {
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
@@ -1041,8 +1045,8 @@ impl PageServerHandler {
|
||||
.expect("implementation error: timeline_handles should not be locked");
|
||||
|
||||
let request_span = info_span!("request", shard_id = tracing::field::Empty);
|
||||
let (pgb_reader, timeline_handles) = match self.pipelining_config.clone() {
|
||||
Some(pipelining_config) => {
|
||||
let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
|
||||
PageServicePipeliningConfig::Pipelined(pipelining_config) => {
|
||||
self.handle_pagerequests_pipelined(
|
||||
pgb,
|
||||
pgb_reader,
|
||||
@@ -1055,7 +1059,7 @@ impl PageServerHandler {
|
||||
)
|
||||
.await
|
||||
}
|
||||
None => {
|
||||
PageServicePipeliningConfig::Serial => {
|
||||
self.handle_pagerequests_serial(
|
||||
pgb,
|
||||
pgb_reader,
|
||||
@@ -1067,7 +1071,7 @@ impl PageServerHandler {
|
||||
)
|
||||
.await
|
||||
}
|
||||
}?;
|
||||
};
|
||||
|
||||
debug!("pagestream subprotocol shut down cleanly");
|
||||
|
||||
@@ -1077,7 +1081,7 @@ impl PageServerHandler {
|
||||
let replaced = self.timeline_handles.replace(timeline_handles);
|
||||
assert!(replaced.is_none());
|
||||
|
||||
Ok(())
|
||||
result
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -1090,33 +1094,50 @@ impl PageServerHandler {
|
||||
mut timeline_handles: TimelineHandles,
|
||||
request_span: Span,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(PostgresBackendReader<IO>, TimelineHandles), QueryError>
|
||||
) -> (
|
||||
(PostgresBackendReader<IO>, TimelineHandles),
|
||||
Result<(), QueryError>,
|
||||
)
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
|
||||
{
|
||||
loop {
|
||||
let cancel = self.cancel.clone();
|
||||
let err = loop {
|
||||
let msg = Self::pagestream_read_message(
|
||||
&mut pgb_reader,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&mut timeline_handles,
|
||||
&self.cancel,
|
||||
&cancel,
|
||||
ctx,
|
||||
request_span.clone(),
|
||||
)
|
||||
.await?;
|
||||
.await;
|
||||
let msg = match msg {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => break e,
|
||||
};
|
||||
let msg = match msg {
|
||||
Some(msg) => msg,
|
||||
None => {
|
||||
debug!("pagestream subprotocol end observed");
|
||||
return Ok((pgb_reader, timeline_handles));
|
||||
return ((pgb_reader, timeline_handles), Ok(()));
|
||||
}
|
||||
};
|
||||
self.pagesteam_handle_batched_message(pgb_writer, *msg, ctx)
|
||||
.await?;
|
||||
}
|
||||
let err = self
|
||||
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
|
||||
.await;
|
||||
match err {
|
||||
Ok(()) => {}
|
||||
Err(e) => break e,
|
||||
}
|
||||
};
|
||||
((pgb_reader, timeline_handles), Err(err))
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// May leak tokio tasks if not polled to completion.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn handle_pagerequests_pipelined<IO>(
|
||||
&mut self,
|
||||
@@ -1126,185 +1147,175 @@ impl PageServerHandler {
|
||||
timeline_id: TimelineId,
|
||||
mut timeline_handles: TimelineHandles,
|
||||
request_span: Span,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
pipelining_config: PageServicePipeliningConfigPipelined,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(PostgresBackendReader<IO>, TimelineHandles), QueryError>
|
||||
) -> (
|
||||
(PostgresBackendReader<IO>, TimelineHandles),
|
||||
Result<(), QueryError>,
|
||||
)
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
|
||||
{
|
||||
let PageServicePipeliningConfig {
|
||||
//
|
||||
// Pipelined pagestream handling consists of
|
||||
// - a Batcher that reads requests off the wire and
|
||||
// and batches them if possible,
|
||||
// - an Executor that processes the batched requests.
|
||||
//
|
||||
// The batch is built up inside an `spsc_fold` channel,
|
||||
// shared betwen Batcher (Sender) and Executor (Receiver).
|
||||
//
|
||||
// The Batcher continously folds client requests into the batch,
|
||||
// while the Executor can at any time take out what's in the batch
|
||||
// in order to process it.
|
||||
// This means the next batch builds up while the Executor
|
||||
// executes the last batch.
|
||||
//
|
||||
// CANCELLATION
|
||||
//
|
||||
// We run both Batcher and Executor futures to completion before
|
||||
// returning from this function.
|
||||
//
|
||||
// If Executor exits first, it signals cancellation to the Batcher
|
||||
// via a CancellationToken that is child of `self.cancel`.
|
||||
// If Batcher exits first, it signals cancellation to the Executor
|
||||
// by dropping the spsc_fold channel Sender.
|
||||
//
|
||||
// CLEAN SHUTDOWN
|
||||
//
|
||||
// Clean shutdown means that the client ends the COPYBOTH session.
|
||||
// In response to such a client message, the Batcher exits.
|
||||
// The Executor continues to run, draining the spsc_fold channel.
|
||||
// Once drained, the spsc_fold recv will fail with a distinct error
|
||||
// indicating that the sender disconnected.
|
||||
// The Executor exits with Ok(()) in response to that error.
|
||||
//
|
||||
// Server initiated shutdown is not clean shutdown, but instead
|
||||
// is an error Err(QueryError::Shutdown) that is propagated through
|
||||
// error propagation.
|
||||
//
|
||||
// ERROR PROPAGATION
|
||||
//
|
||||
// When the Batcher encounter an error, it sends it as a value
|
||||
// through the spsc_fold channel and exits afterwards.
|
||||
// When the Executor observes such an error in the channel,
|
||||
// it exits returning that error value.
|
||||
//
|
||||
// This design ensures that the Executor stage will still process
|
||||
// the batch that was in flight when the Batcher encountered an error,
|
||||
// thereby beahving identical to a serial implementation.
|
||||
|
||||
let PageServicePipeliningConfigPipelined {
|
||||
max_batch_size,
|
||||
protocol_pipelining_mode,
|
||||
execution,
|
||||
} = pipelining_config;
|
||||
|
||||
let (requests_tx, mut requests_rx) = tokio::sync::mpsc::channel(1);
|
||||
let read_messages = {
|
||||
let cancel = self.cancel.child_token();
|
||||
// Macro to _define_ a pipeline stage.
|
||||
macro_rules! pipeline_stage {
|
||||
($name:literal, $cancel:expr, $make_fut:expr) => {{
|
||||
let cancel: CancellationToken = $cancel;
|
||||
let stage_fut = $make_fut(cancel.clone());
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("exiting");
|
||||
}
|
||||
timed_after_cancellation(stage_fut, $name, Duration::from_millis(100), &cancel)
|
||||
.await
|
||||
}
|
||||
.instrument(tracing::info_span!($name))
|
||||
}};
|
||||
}
|
||||
|
||||
//
|
||||
// Batcher
|
||||
//
|
||||
|
||||
let cancel_batcher = self.cancel.child_token();
|
||||
let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
|
||||
let read_messages = pipeline_stage!(
|
||||
"read_messages",
|
||||
cancel_batcher.clone(),
|
||||
move |cancel_batcher| {
|
||||
let ctx = ctx.attached_child();
|
||||
async move {
|
||||
let mut pgb_reader = pgb_reader;
|
||||
let mut exit = false;
|
||||
while !exit {
|
||||
let read_res = Self::pagestream_read_message(
|
||||
&mut pgb_reader,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&mut timeline_handles,
|
||||
&cancel_batcher,
|
||||
&ctx,
|
||||
request_span.clone(),
|
||||
)
|
||||
.await;
|
||||
let Some(read_res) = read_res.transpose() else {
|
||||
debug!("client-initiated shutdown");
|
||||
break;
|
||||
};
|
||||
exit |= read_res.is_err();
|
||||
let could_send = batch_tx
|
||||
.send(read_res, |batch, res| {
|
||||
Self::pagestream_do_batch(max_batch_size, batch, res)
|
||||
})
|
||||
.await;
|
||||
exit |= could_send.is_err();
|
||||
}
|
||||
(pgb_reader, timeline_handles)
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
//
|
||||
// Executor
|
||||
//
|
||||
|
||||
let executor = pipeline_stage!("executor", self.cancel.clone(), move |cancel| {
|
||||
let ctx = ctx.attached_child();
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("exiting");
|
||||
}
|
||||
let mut pgb_reader = pgb_reader;
|
||||
let _cancel_batcher = cancel_batcher.drop_guard();
|
||||
loop {
|
||||
let msg = Self::pagestream_read_message(
|
||||
&mut pgb_reader,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&mut timeline_handles,
|
||||
&cancel,
|
||||
&ctx,
|
||||
request_span.clone(),
|
||||
)
|
||||
.await?;
|
||||
let msg = match msg {
|
||||
Some(msg) => msg,
|
||||
None => {
|
||||
debug!("pagestream subprotocol end observed");
|
||||
break;
|
||||
let maybe_batch = batch_rx.recv().await;
|
||||
let batch = match maybe_batch {
|
||||
Ok(batch) => batch,
|
||||
Err(spsc_fold::RecvError::SenderGone) => {
|
||||
debug!("upstream gone");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
match requests_tx.send(msg).await {
|
||||
Ok(()) => {}
|
||||
Err(tokio::sync::mpsc::error::SendError(_)) => {
|
||||
debug!("downstream is gone");
|
||||
break;
|
||||
let batch = match batch {
|
||||
Ok(batch) => batch,
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok((pgb_reader, timeline_handles))
|
||||
}
|
||||
}
|
||||
.instrument(tracing::info_span!("read_messages"));
|
||||
|
||||
enum BatchState {
|
||||
Building(Option<Box<BatchedFeMessage>>),
|
||||
UpstreamDead(Option<Box<BatchedFeMessage>>),
|
||||
}
|
||||
impl BatchState {
|
||||
fn must_building_mut(&mut self) -> &mut Option<Box<BatchedFeMessage>> {
|
||||
match self {
|
||||
Self::Building(maybe_batch) => maybe_batch,
|
||||
Self::UpstreamDead(_) => panic!("upstream dead"),
|
||||
}
|
||||
}
|
||||
}
|
||||
let (batch_tx, mut batch_rx) = tokio::sync::watch::channel(Arc::new(
|
||||
std::sync::Mutex::new(BatchState::Building(None)),
|
||||
));
|
||||
let notify_batcher = Arc::new(tokio::sync::Notify::new());
|
||||
let batcher = {
|
||||
let notify_batcher = notify_batcher.clone();
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("exiting");
|
||||
}
|
||||
loop {
|
||||
let maybe_req = requests_rx.recv().await;
|
||||
let Some(req) = maybe_req else {
|
||||
batch_tx.send_modify(|pending_batch| {
|
||||
let mut guard = pending_batch.lock().unwrap();
|
||||
match &mut *guard {
|
||||
BatchState::Building(batch) => {
|
||||
*guard = BatchState::UpstreamDead(batch.take());
|
||||
}
|
||||
BatchState::UpstreamDead(_) => panic!("twice"),
|
||||
}
|
||||
});
|
||||
break;
|
||||
};
|
||||
// don't read new requests before this one has been processed
|
||||
let mut req = Some(req);
|
||||
loop {
|
||||
let mut wait_notified = None;
|
||||
let batched = batch_tx.send_if_modified(|pending_batch| {
|
||||
let mut guard = pending_batch.lock().unwrap();
|
||||
let building = guard.must_building_mut();
|
||||
match Self::pagestream_do_batch(
|
||||
max_batch_size,
|
||||
building,
|
||||
req.take().unwrap(),
|
||||
) {
|
||||
Some(req_was_not_batched) => {
|
||||
req.replace(req_was_not_batched);
|
||||
wait_notified = Some(notify_batcher.notified());
|
||||
false
|
||||
}
|
||||
None => true,
|
||||
}
|
||||
});
|
||||
if batched {
|
||||
break;
|
||||
} else {
|
||||
wait_notified.unwrap().await;
|
||||
}
|
||||
}
|
||||
self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(tracing::info_span!("batcher"));
|
||||
});
|
||||
|
||||
let executor = async {
|
||||
let mut stop = false;
|
||||
while !stop {
|
||||
match batch_rx.changed().await {
|
||||
Ok(()) => {}
|
||||
Err(_) => {
|
||||
debug!("batch_rx observed disconnection of batcher");
|
||||
}
|
||||
};
|
||||
let maybe_batch = {
|
||||
let borrow = batch_rx.borrow();
|
||||
let mut guard = borrow.lock().unwrap();
|
||||
match &mut *guard {
|
||||
BatchState::Building(maybe_batch) => maybe_batch.take(),
|
||||
BatchState::UpstreamDead(maybe_batch) => {
|
||||
debug!("upstream dead");
|
||||
stop = true;
|
||||
maybe_batch.take()
|
||||
}
|
||||
}
|
||||
};
|
||||
let Some(batch) = maybe_batch else {
|
||||
break;
|
||||
};
|
||||
notify_batcher.notify_one();
|
||||
debug!("processing batch");
|
||||
self.pagesteam_handle_batched_message(pgb_writer, *batch, ctx)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
//
|
||||
// Execute the stages.
|
||||
//
|
||||
|
||||
let read_messages_res;
|
||||
let executor_res;
|
||||
match protocol_pipelining_mode {
|
||||
PageServiceProtocolPipeliningMode::ConcurrentFutures => {
|
||||
(read_messages_res, _, executor_res) =
|
||||
tokio::join!(read_messages, batcher, executor);
|
||||
match execution {
|
||||
PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
|
||||
tokio::join!(read_messages, executor)
|
||||
}
|
||||
PageServiceProtocolPipeliningMode::Tasks => {
|
||||
// cancelled via sensitivity to self.cancel
|
||||
let read_messages_task = tokio::task::spawn(read_messages);
|
||||
// cancelled when it observes read_messages_task disconnect the channel
|
||||
let batcher_task = tokio::task::spawn(batcher);
|
||||
executor_res = executor.await;
|
||||
read_messages_res = read_messages_task
|
||||
.await
|
||||
.context("read_messages task panicked, check logs for details")?;
|
||||
let _: () = batcher_task
|
||||
.await
|
||||
.context("batcher task panicked, check logs for details")?;
|
||||
PageServiceProtocolPipelinedExecutionStrategy::Tasks => {
|
||||
// These tasks are not tracked anywhere.
|
||||
let read_messages_task = tokio::spawn(read_messages);
|
||||
let (read_messages_task_res, executor_res_) =
|
||||
tokio::join!(read_messages_task, executor,);
|
||||
(
|
||||
read_messages_task_res.expect("propagated panic from read_messages"),
|
||||
executor_res_,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
match (read_messages_res, executor_res) {
|
||||
(Err(e), _) | (_, Err(e)) => {
|
||||
let e: QueryError = e;
|
||||
Err(e)
|
||||
}
|
||||
(Ok((pgb_reader, timeline_handles)), Ok(())) => Ok((pgb_reader, timeline_handles)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to handle the LSN from client request.
|
||||
@@ -1349,21 +1360,26 @@ impl PageServerHandler {
|
||||
));
|
||||
}
|
||||
|
||||
if request_lsn < **latest_gc_cutoff_lsn {
|
||||
// Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus
|
||||
if request_lsn == Lsn::INVALID {
|
||||
return Err(PageStreamError::BadRequest(
|
||||
"invalid LSN(0) in request".into(),
|
||||
));
|
||||
}
|
||||
|
||||
// Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease.
|
||||
//
|
||||
// We may have older data available, but we make a best effort to detect this case and return an error,
|
||||
// to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
|
||||
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
|
||||
let gc_info = &timeline.gc_info.read().unwrap();
|
||||
if !gc_info.leases.contains_key(&request_lsn) {
|
||||
// The requested LSN is below gc cutoff and is not guarded by a lease.
|
||||
|
||||
// Check explicitly for INVALID just to get a less scary error message if the
|
||||
// request is obviously bogus
|
||||
return Err(if request_lsn == Lsn::INVALID {
|
||||
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
||||
} else {
|
||||
return Err(
|
||||
PageStreamError::BadRequest(format!(
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
request_lsn, **latest_gc_cutoff_lsn
|
||||
).into())
|
||||
});
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,6 +10,9 @@ use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::aux_file;
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::{
|
||||
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
|
||||
};
|
||||
use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
|
||||
@@ -389,7 +392,9 @@ impl Timeline {
|
||||
result
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
/// Get size of a database in blocks. This is only accurate on shard 0. It will undercount on
|
||||
/// other shards, by only accounting for relations the shard has pages for, and only accounting
|
||||
/// for pages up to the highest page number it has stored.
|
||||
pub(crate) async fn get_db_size(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
@@ -408,7 +413,10 @@ impl Timeline {
|
||||
Ok(total_blocks)
|
||||
}
|
||||
|
||||
/// Get size of a relation file
|
||||
/// Get size of a relation file. The relation must exist, otherwise an error is returned.
|
||||
///
|
||||
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
|
||||
/// page number stored in the shard.
|
||||
pub(crate) async fn get_rel_size(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
@@ -444,7 +452,10 @@ impl Timeline {
|
||||
Ok(nblocks)
|
||||
}
|
||||
|
||||
/// Does relation exist?
|
||||
/// Does the relation exist?
|
||||
///
|
||||
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
|
||||
/// the shard stores pages for.
|
||||
pub(crate) async fn get_rel_exists(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
@@ -478,6 +489,9 @@ impl Timeline {
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
///
|
||||
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
|
||||
/// the shard stores pages for.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
@@ -1129,9 +1143,12 @@ impl Timeline {
|
||||
let rel_size_cache = self.rel_size_cache.read().unwrap();
|
||||
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
|
||||
if lsn >= *cached_lsn {
|
||||
RELSIZE_CACHE_HITS.inc();
|
||||
return Some(*nblocks);
|
||||
}
|
||||
RELSIZE_CACHE_MISSES_OLD.inc();
|
||||
}
|
||||
RELSIZE_CACHE_MISSES.inc();
|
||||
None
|
||||
}
|
||||
|
||||
@@ -1156,6 +1173,7 @@ impl Timeline {
|
||||
}
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
entry.insert((lsn, nblocks));
|
||||
RELSIZE_CACHE_ENTRIES.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1163,13 +1181,17 @@ impl Timeline {
|
||||
/// Store cached relation size
|
||||
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.map.insert(tag, (lsn, nblocks));
|
||||
if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
|
||||
RELSIZE_CACHE_ENTRIES.inc();
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove cached relation size
|
||||
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.map.remove(tag);
|
||||
if rel_size_cache.map.remove(tag).is_some() {
|
||||
RELSIZE_CACHE_ENTRIES.dec();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1229,10 +1251,9 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
pub(crate) fn has_dirty_data(&self) -> bool {
|
||||
!self
|
||||
.pending_data_batch
|
||||
self.pending_data_batch
|
||||
.as_ref()
|
||||
.map_or(true, |b| b.is_empty())
|
||||
.map_or(false, |b| b.has_data())
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
@@ -1408,7 +1429,7 @@ impl<'a> DatadirModification<'a> {
|
||||
Some(pending_batch) => {
|
||||
pending_batch.extend(batch);
|
||||
}
|
||||
None if !batch.is_empty() => {
|
||||
None if batch.has_data() => {
|
||||
self.pending_data_batch = Some(batch);
|
||||
}
|
||||
None => {
|
||||
@@ -2276,9 +2297,9 @@ impl<'a> Version<'a> {
|
||||
//--- Metadata structs stored in key-value pairs in the repository.
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DbDirectory {
|
||||
pub(crate) struct DbDirectory {
|
||||
// (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist)
|
||||
dbdirs: HashMap<(Oid, Oid), bool>,
|
||||
pub(crate) dbdirs: HashMap<(Oid, Oid), bool>,
|
||||
}
|
||||
|
||||
// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
|
||||
@@ -2287,8 +2308,8 @@ struct DbDirectory {
|
||||
// "pg_twophsae/0000000A000002E4".
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct TwoPhaseDirectory {
|
||||
xids: HashSet<TransactionId>,
|
||||
pub(crate) struct TwoPhaseDirectory {
|
||||
pub(crate) xids: HashSet<TransactionId>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -2297,12 +2318,12 @@ struct TwoPhaseDirectoryV17 {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct RelDirectory {
|
||||
pub(crate) struct RelDirectory {
|
||||
// Set of relations that exist. (relfilenode, forknum)
|
||||
//
|
||||
// TODO: Store it as a btree or radix tree or something else that spans multiple
|
||||
// key-value pairs, if you have a lot of relations
|
||||
rels: HashSet<(Oid, u8)>,
|
||||
pub(crate) rels: HashSet<(Oid, u8)>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -2311,9 +2332,9 @@ struct RelSizeEntry {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct SlruSegmentDirectory {
|
||||
pub(crate) struct SlruSegmentDirectory {
|
||||
// Set of SLRU segments that exist.
|
||||
segments: HashSet<u32>,
|
||||
pub(crate) segments: HashSet<u32>,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
|
||||
|
||||
@@ -381,6 +381,8 @@ pub enum TaskKind {
|
||||
UnitTest,
|
||||
|
||||
DetachAncestor,
|
||||
|
||||
ImportPgdata,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -43,7 +43,9 @@ use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use timeline::import_pgdata;
|
||||
use timeline::offload::offload_timeline;
|
||||
use timeline::ShutdownMode;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::sync::watch;
|
||||
use tokio::task::JoinSet;
|
||||
@@ -373,7 +375,6 @@ pub struct Tenant {
|
||||
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Tenant {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
|
||||
@@ -860,6 +861,7 @@ impl Debug for SetStoppingError {
|
||||
pub(crate) enum CreateTimelineParams {
|
||||
Bootstrap(CreateTimelineParamsBootstrap),
|
||||
Branch(CreateTimelineParamsBranch),
|
||||
ImportPgdata(CreateTimelineParamsImportPgdata),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -877,7 +879,14 @@ pub(crate) struct CreateTimelineParamsBranch {
|
||||
pub(crate) ancestor_start_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`].
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct CreateTimelineParamsImportPgdata {
|
||||
pub(crate) new_timeline_id: TimelineId,
|
||||
pub(crate) location: import_pgdata::index_part_format::Location,
|
||||
pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
|
||||
}
|
||||
|
||||
/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`] in [`Tenant::start_creating_timeline`].
|
||||
///
|
||||
/// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
|
||||
///
|
||||
@@ -907,19 +916,50 @@ pub(crate) enum CreateTimelineIdempotency {
|
||||
ancestor_timeline_id: TimelineId,
|
||||
ancestor_start_lsn: Lsn,
|
||||
},
|
||||
ImportPgdata(CreatingTimelineIdempotencyImportPgdata),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
|
||||
idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
|
||||
}
|
||||
|
||||
/// What is returned by [`Tenant::start_creating_timeline`].
|
||||
#[must_use]
|
||||
enum StartCreatingTimelineResult<'t> {
|
||||
CreateGuard(TimelineCreateGuard<'t>),
|
||||
enum StartCreatingTimelineResult {
|
||||
CreateGuard(TimelineCreateGuard),
|
||||
Idempotent(Arc<Timeline>),
|
||||
}
|
||||
|
||||
enum TimelineInitAndSyncResult {
|
||||
ReadyToActivate(Arc<Timeline>),
|
||||
NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
|
||||
}
|
||||
|
||||
impl TimelineInitAndSyncResult {
|
||||
fn ready_to_activate(self) -> Option<Arc<Timeline>> {
|
||||
match self {
|
||||
Self::ReadyToActivate(timeline) => Some(timeline),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
struct TimelineInitAndSyncNeedsSpawnImportPgdata {
|
||||
timeline: Arc<Timeline>,
|
||||
import_pgdata: import_pgdata::index_part_format::Root,
|
||||
guard: TimelineCreateGuard,
|
||||
}
|
||||
|
||||
/// What is returned by [`Tenant::create_timeline`].
|
||||
enum CreateTimelineResult {
|
||||
Created(Arc<Timeline>),
|
||||
Idempotent(Arc<Timeline>),
|
||||
/// IMPORTANT: This [`Arc<Timeline>`] object is not in [`Tenant::timelines`] when
|
||||
/// we return this result, nor will this concrete object ever be added there.
|
||||
/// Cf method comment on [`Tenant::create_timeline_import_pgdata`].
|
||||
ImportSpawned(Arc<Timeline>),
|
||||
}
|
||||
|
||||
impl CreateTimelineResult {
|
||||
@@ -927,18 +967,19 @@ impl CreateTimelineResult {
|
||||
match self {
|
||||
Self::Created(_) => "Created",
|
||||
Self::Idempotent(_) => "Idempotent",
|
||||
Self::ImportSpawned(_) => "ImportSpawned",
|
||||
}
|
||||
}
|
||||
fn timeline(&self) -> &Arc<Timeline> {
|
||||
match self {
|
||||
Self::Created(t) | Self::Idempotent(t) => t,
|
||||
Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
|
||||
}
|
||||
}
|
||||
/// Unit test timelines aren't activated, test has to do it if it needs to.
|
||||
#[cfg(test)]
|
||||
fn into_timeline_for_test(self) -> Arc<Timeline> {
|
||||
match self {
|
||||
Self::Created(t) | Self::Idempotent(t) => t,
|
||||
Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -962,33 +1003,13 @@ pub enum CreateTimelineError {
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum InitdbError {
|
||||
Other(anyhow::Error),
|
||||
pub enum InitdbError {
|
||||
#[error("Operation was cancelled")]
|
||||
Cancelled,
|
||||
Spawn(std::io::Result<()>),
|
||||
Failed(std::process::ExitStatus, Vec<u8>),
|
||||
}
|
||||
|
||||
impl fmt::Display for InitdbError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
InitdbError::Cancelled => write!(f, "Operation was cancelled"),
|
||||
InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e),
|
||||
InitdbError::Failed(status, stderr) => write!(
|
||||
f,
|
||||
"Command failed with status {:?}: {}",
|
||||
status,
|
||||
String::from_utf8_lossy(stderr)
|
||||
),
|
||||
InitdbError::Other(e) => write!(f, "Error: {:?}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for InitdbError {
|
||||
fn from(error: std::io::Error) -> Self {
|
||||
InitdbError::Spawn(Err(error))
|
||||
}
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
#[error(transparent)]
|
||||
Inner(postgres_initdb::Error),
|
||||
}
|
||||
|
||||
enum CreateTimelineCause {
|
||||
@@ -996,6 +1017,15 @@ enum CreateTimelineCause {
|
||||
Delete,
|
||||
}
|
||||
|
||||
enum LoadTimelineCause {
|
||||
Attach,
|
||||
Unoffload,
|
||||
ImportPgdata {
|
||||
create_guard: TimelineCreateGuard,
|
||||
activate: ActivateTimelineArgs,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GcError {
|
||||
// The tenant is shutting down
|
||||
@@ -1072,24 +1102,35 @@ impl Tenant {
|
||||
/// it is marked as Active.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn timeline_init_and_sync(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
resources: TimelineResources,
|
||||
index_part: IndexPart,
|
||||
mut index_part: IndexPart,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
cause: LoadTimelineCause,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<TimelineInitAndSyncResult> {
|
||||
let tenant_id = self.tenant_shard_id;
|
||||
|
||||
let idempotency = if metadata.ancestor_timeline().is_none() {
|
||||
CreateTimelineIdempotency::Bootstrap {
|
||||
pg_version: metadata.pg_version(),
|
||||
let import_pgdata = index_part.import_pgdata.take();
|
||||
let idempotency = match &import_pgdata {
|
||||
Some(import_pgdata) => {
|
||||
CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
|
||||
idempotency_key: import_pgdata.idempotency_key().clone(),
|
||||
})
|
||||
}
|
||||
} else {
|
||||
CreateTimelineIdempotency::Branch {
|
||||
ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
|
||||
ancestor_start_lsn: metadata.ancestor_lsn(),
|
||||
None => {
|
||||
if metadata.ancestor_timeline().is_none() {
|
||||
CreateTimelineIdempotency::Bootstrap {
|
||||
pg_version: metadata.pg_version(),
|
||||
}
|
||||
} else {
|
||||
CreateTimelineIdempotency::Branch {
|
||||
ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
|
||||
ancestor_start_lsn: metadata.ancestor_lsn(),
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1121,39 +1162,91 @@ impl Tenant {
|
||||
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
{
|
||||
// avoiding holding it across awaits
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
match timelines_accessor.entry(timeline_id) {
|
||||
// We should never try and load the same timeline twice during startup
|
||||
Entry::Occupied(_) => {
|
||||
unreachable!(
|
||||
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
|
||||
);
|
||||
match import_pgdata {
|
||||
Some(import_pgdata) if !import_pgdata.is_done() => {
|
||||
match cause {
|
||||
LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
|
||||
LoadTimelineCause::ImportPgdata { .. } => {
|
||||
unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3")
|
||||
}
|
||||
}
|
||||
Entry::Vacant(v) => {
|
||||
v.insert(Arc::clone(&timeline));
|
||||
timeline.maybe_spawn_flush_loop();
|
||||
let mut guard = self.timelines_creating.lock().unwrap();
|
||||
if !guard.insert(timeline_id) {
|
||||
// We should never try and load the same timeline twice during startup
|
||||
unreachable!("Timeline {tenant_id}/{timeline_id} is already being created")
|
||||
}
|
||||
let timeline_create_guard = TimelineCreateGuard {
|
||||
_tenant_gate_guard: self.gate.enter()?,
|
||||
owning_tenant: self.clone(),
|
||||
timeline_id,
|
||||
idempotency,
|
||||
// The users of this specific return value don't need the timline_path in there.
|
||||
timeline_path: timeline
|
||||
.conf
|
||||
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id),
|
||||
};
|
||||
Ok(TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
|
||||
TimelineInitAndSyncNeedsSpawnImportPgdata {
|
||||
timeline,
|
||||
import_pgdata,
|
||||
guard: timeline_create_guard,
|
||||
},
|
||||
))
|
||||
}
|
||||
};
|
||||
Some(_) | None => {
|
||||
{
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
match timelines_accessor.entry(timeline_id) {
|
||||
// We should never try and load the same timeline twice during startup
|
||||
Entry::Occupied(_) => {
|
||||
unreachable!(
|
||||
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
|
||||
);
|
||||
}
|
||||
Entry::Vacant(v) => {
|
||||
v.insert(Arc::clone(&timeline));
|
||||
timeline.maybe_spawn_flush_loop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sanity check: a timeline should have some content.
|
||||
anyhow::ensure!(
|
||||
ancestor.is_some()
|
||||
|| timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.expect("currently loading, layer manager cannot be shutdown already")
|
||||
.iter_historic_layers()
|
||||
.next()
|
||||
.is_some(),
|
||||
"Timeline has no ancestor and no layer files"
|
||||
);
|
||||
// Sanity check: a timeline should have some content.
|
||||
anyhow::ensure!(
|
||||
ancestor.is_some()
|
||||
|| timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.expect("currently loading, layer manager cannot be shutdown already")
|
||||
.iter_historic_layers()
|
||||
.next()
|
||||
.is_some(),
|
||||
"Timeline has no ancestor and no layer files"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
match cause {
|
||||
LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
|
||||
LoadTimelineCause::ImportPgdata {
|
||||
create_guard,
|
||||
activate,
|
||||
} => {
|
||||
// TODO: see the comment in the task code above how I'm not so certain
|
||||
// it is safe to activate here because of concurrent shutdowns.
|
||||
match activate {
|
||||
ActivateTimelineArgs::Yes { broker_client } => {
|
||||
info!("activating timeline after reload from pgdata import task");
|
||||
timeline.activate(self.clone(), broker_client, None, ctx);
|
||||
}
|
||||
ActivateTimelineArgs::No => (),
|
||||
}
|
||||
drop(create_guard);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Attach a tenant that's available in cloud storage.
|
||||
@@ -1578,24 +1671,46 @@ impl Tenant {
|
||||
}
|
||||
|
||||
// TODO again handle early failure
|
||||
self.load_remote_timeline(
|
||||
timeline_id,
|
||||
index_part,
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
timeline_get_throttle: self.timeline_get_throttle.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to load remote timeline {} for tenant {}",
|
||||
timeline_id, self.tenant_shard_id
|
||||
let effect = self
|
||||
.load_remote_timeline(
|
||||
timeline_id,
|
||||
index_part,
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
timeline_get_throttle: self.timeline_get_throttle.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
},
|
||||
LoadTimelineCause::Attach,
|
||||
ctx,
|
||||
)
|
||||
})?;
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to load remote timeline {} for tenant {}",
|
||||
timeline_id, self.tenant_shard_id
|
||||
)
|
||||
})?;
|
||||
|
||||
match effect {
|
||||
TimelineInitAndSyncResult::ReadyToActivate(_) => {
|
||||
// activation happens later, on Tenant::activate
|
||||
}
|
||||
TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
|
||||
TimelineInitAndSyncNeedsSpawnImportPgdata {
|
||||
timeline,
|
||||
import_pgdata,
|
||||
guard,
|
||||
},
|
||||
) => {
|
||||
tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
|
||||
timeline,
|
||||
import_pgdata,
|
||||
ActivateTimelineArgs::No,
|
||||
guard,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Walk through deleted timelines, resume deletion
|
||||
@@ -1719,13 +1834,14 @@ impl Tenant {
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
|
||||
async fn load_remote_timeline(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
resources: TimelineResources,
|
||||
cause: LoadTimelineCause,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<TimelineInitAndSyncResult> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
info!("downloading index file for timeline {}", timeline_id);
|
||||
@@ -1752,6 +1868,7 @@ impl Tenant {
|
||||
index_part,
|
||||
remote_metadata,
|
||||
ancestor,
|
||||
cause,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1938,6 +2055,7 @@ impl Tenant {
|
||||
TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
|
||||
}
|
||||
TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
|
||||
TimelineExclusionError::ShuttingDown => TimelineArchivalError::Cancelled,
|
||||
})?;
|
||||
|
||||
let timeline_preload = self
|
||||
@@ -1976,6 +2094,7 @@ impl Tenant {
|
||||
index_part,
|
||||
remote_metadata,
|
||||
timeline_resources,
|
||||
LoadTimelineCause::Unoffload,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -2213,7 +2332,7 @@ impl Tenant {
|
||||
///
|
||||
/// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
|
||||
pub(crate) async fn create_empty_timeline(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
@@ -2263,7 +2382,7 @@ impl Tenant {
|
||||
// Our current tests don't need the background loops.
|
||||
#[cfg(test)]
|
||||
pub async fn create_test_timeline(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
@@ -2302,7 +2421,7 @@ impl Tenant {
|
||||
#[cfg(test)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn create_test_timeline_with_layers(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
@@ -2439,6 +2558,16 @@ impl Tenant {
|
||||
self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
|
||||
.await?
|
||||
}
|
||||
CreateTimelineParams::ImportPgdata(params) => {
|
||||
self.create_timeline_import_pgdata(
|
||||
params,
|
||||
ActivateTimelineArgs::Yes {
|
||||
broker_client: broker_client.clone(),
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
// At this point we have dropped our guard on [`Self::timelines_creating`], and
|
||||
@@ -2481,11 +2610,202 @@ impl Tenant {
|
||||
);
|
||||
timeline
|
||||
}
|
||||
CreateTimelineResult::ImportSpawned(timeline) => {
|
||||
info!("import task spawned, timeline will become visible and activated once the import is done");
|
||||
timeline
|
||||
}
|
||||
};
|
||||
|
||||
Ok(activated_timeline)
|
||||
}
|
||||
|
||||
/// The returned [`Arc<Timeline>`] is NOT in the [`Tenant::timelines`] map until the import
|
||||
/// completes in the background. A DIFFERENT [`Arc<Timeline>`] will be inserted into the
|
||||
/// [`Tenant::timelines`] map when the import completes.
|
||||
/// We only return an [`Arc<Timeline>`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`]
|
||||
/// for the response.
|
||||
async fn create_timeline_import_pgdata(
|
||||
self: &Arc<Tenant>,
|
||||
params: CreateTimelineParamsImportPgdata,
|
||||
activate: ActivateTimelineArgs,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CreateTimelineResult, CreateTimelineError> {
|
||||
let CreateTimelineParamsImportPgdata {
|
||||
new_timeline_id,
|
||||
location,
|
||||
idempotency_key,
|
||||
} = params;
|
||||
|
||||
let started_at = chrono::Utc::now().naive_utc();
|
||||
|
||||
//
|
||||
// There's probably a simpler way to upload an index part, but, remote_timeline_client
|
||||
// is the canonical way we do it.
|
||||
// - create an empty timeline in-memory
|
||||
// - use its remote_timeline_client to do the upload
|
||||
// - dispose of the uninit timeline
|
||||
// - keep the creation guard alive
|
||||
|
||||
let timeline_create_guard = match self
|
||||
.start_creating_timeline(
|
||||
new_timeline_id,
|
||||
CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
|
||||
idempotency_key: idempotency_key.clone(),
|
||||
}),
|
||||
)
|
||||
.await?
|
||||
{
|
||||
StartCreatingTimelineResult::CreateGuard(guard) => guard,
|
||||
StartCreatingTimelineResult::Idempotent(timeline) => {
|
||||
return Ok(CreateTimelineResult::Idempotent(timeline))
|
||||
}
|
||||
};
|
||||
|
||||
let mut uninit_timeline = {
|
||||
let this = &self;
|
||||
let initdb_lsn = Lsn(0);
|
||||
let _ctx = ctx;
|
||||
async move {
|
||||
let new_metadata = TimelineMetadata::new(
|
||||
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
||||
// make it valid, before calling finish_creation()
|
||||
Lsn(0),
|
||||
None,
|
||||
None,
|
||||
Lsn(0),
|
||||
initdb_lsn,
|
||||
initdb_lsn,
|
||||
15,
|
||||
);
|
||||
this.prepare_new_timeline(
|
||||
new_timeline_id,
|
||||
&new_metadata,
|
||||
timeline_create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
.await?;
|
||||
|
||||
let in_progress = import_pgdata::index_part_format::InProgress {
|
||||
idempotency_key,
|
||||
location,
|
||||
started_at,
|
||||
};
|
||||
let index_part = import_pgdata::index_part_format::Root::V1(
|
||||
import_pgdata::index_part_format::V1::InProgress(in_progress),
|
||||
);
|
||||
uninit_timeline
|
||||
.raw_timeline()
|
||||
.unwrap()
|
||||
.remote_client
|
||||
.schedule_index_upload_for_import_pgdata_state_update(Some(index_part.clone()))?;
|
||||
|
||||
// wait_completion happens in caller
|
||||
|
||||
let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();
|
||||
|
||||
tokio::spawn(self.clone().create_timeline_import_pgdata_task(
|
||||
timeline.clone(),
|
||||
index_part,
|
||||
activate,
|
||||
timeline_create_guard,
|
||||
));
|
||||
|
||||
// NB: the timeline doesn't exist in self.timelines at this point
|
||||
Ok(CreateTimelineResult::ImportSpawned(timeline))
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
|
||||
async fn create_timeline_import_pgdata_task(
|
||||
self: Arc<Tenant>,
|
||||
timeline: Arc<Timeline>,
|
||||
index_part: import_pgdata::index_part_format::Root,
|
||||
activate: ActivateTimelineArgs,
|
||||
timeline_create_guard: TimelineCreateGuard,
|
||||
) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
info!("starting");
|
||||
scopeguard::defer! {info!("exiting")};
|
||||
|
||||
let res = self
|
||||
.create_timeline_import_pgdata_task_impl(
|
||||
timeline,
|
||||
index_part,
|
||||
activate,
|
||||
timeline_create_guard,
|
||||
)
|
||||
.await;
|
||||
if let Err(err) = &res {
|
||||
error!(?err, "task failed");
|
||||
// TODO sleep & retry, sensitive to tenant shutdown
|
||||
// TODO: allow timeline deletion requests => should cancel the task
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_timeline_import_pgdata_task_impl(
|
||||
self: Arc<Tenant>,
|
||||
timeline: Arc<Timeline>,
|
||||
index_part: import_pgdata::index_part_format::Root,
|
||||
activate: ActivateTimelineArgs,
|
||||
timeline_create_guard: TimelineCreateGuard,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn);
|
||||
|
||||
info!("importing pgdata");
|
||||
import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
|
||||
.await
|
||||
.context("import")?;
|
||||
info!("import done");
|
||||
|
||||
//
|
||||
// Reload timeline from remote.
|
||||
// This proves that the remote state is attachable, and it reuses the code.
|
||||
//
|
||||
// TODO: think about whether this is safe to do with concurrent Tenant::shutdown.
|
||||
// timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit.
|
||||
// But our activate() call might launch new background tasks after Tenant::shutdown
|
||||
// already went past shutting down the Tenant::timelines, which this timeline here is no part of.
|
||||
// I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
|
||||
// down while bootstrapping/branching + activating), but, the race condition is much more likely
|
||||
// to manifest because of the long runtime of this import task.
|
||||
|
||||
// in theory this shouldn't even .await anything except for coop yield
|
||||
info!("shutting down timeline");
|
||||
timeline.shutdown(ShutdownMode::Hard).await;
|
||||
info!("timeline shut down, reloading from remote");
|
||||
// TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc<Timeline>
|
||||
// let Some(timeline) = Arc::into_inner(timeline) else {
|
||||
// anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere");
|
||||
// };
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
// load from object storage like Tenant::attach does
|
||||
let resources = self.build_timeline_resources(timeline_id);
|
||||
let index_part = resources
|
||||
.remote_client
|
||||
.download_index_file(&self.cancel)
|
||||
.await?;
|
||||
let index_part = match index_part {
|
||||
MaybeDeletedIndexPart::Deleted(_) => {
|
||||
// likely concurrent delete call, cplane should prevent this
|
||||
anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but")
|
||||
}
|
||||
MaybeDeletedIndexPart::IndexPart(p) => p,
|
||||
};
|
||||
let metadata = index_part.metadata.clone();
|
||||
self
|
||||
.load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
|
||||
create_guard: timeline_create_guard, activate, }, &ctx)
|
||||
.await?
|
||||
.ready_to_activate()
|
||||
.context("implementation error: reloaded timeline still needs import after import reported success")?;
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_timeline(
|
||||
self: Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
@@ -2895,6 +3215,18 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
if let ShutdownMode::Reload = shutdown_mode {
|
||||
tracing::info!("Flushing deletion queue");
|
||||
if let Err(e) = self.deletion_queue_client.flush().await {
|
||||
match e {
|
||||
DeletionQueueError::ShuttingDown => {
|
||||
// This is the only error we expect for now. In the future, if more error
|
||||
// variants are added, we should handle them here.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
|
||||
// them to continue to do work during their shutdown methods, e.g. flushing data.
|
||||
tracing::debug!("Cancelling CancellationToken");
|
||||
@@ -3337,6 +3669,13 @@ where
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
enum ActivateTimelineArgs {
|
||||
Yes {
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
},
|
||||
No,
|
||||
}
|
||||
|
||||
impl Tenant {
|
||||
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
|
||||
self.tenant_conf.load().tenant_conf.clone()
|
||||
@@ -3520,6 +3859,7 @@ impl Tenant {
|
||||
/// `validate_ancestor == false` is used when a timeline is created for deletion
|
||||
/// and we might not have the ancestor present anymore which is fine for to be
|
||||
/// deleted timelines.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn create_timeline_struct(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
@@ -4283,16 +4623,17 @@ impl Tenant {
|
||||
/// If the timeline was already created in the meantime, we check whether this
|
||||
/// request conflicts or is idempotent , based on `state`.
|
||||
async fn start_creating_timeline(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
new_timeline_id: TimelineId,
|
||||
idempotency: CreateTimelineIdempotency,
|
||||
) -> Result<StartCreatingTimelineResult<'_>, CreateTimelineError> {
|
||||
) -> Result<StartCreatingTimelineResult, CreateTimelineError> {
|
||||
let allow_offloaded = false;
|
||||
match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
|
||||
Ok(create_guard) => {
|
||||
pausable_failpoint!("timeline-creation-after-uninit");
|
||||
Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
|
||||
}
|
||||
Err(TimelineExclusionError::ShuttingDown) => Err(CreateTimelineError::ShuttingDown),
|
||||
Err(TimelineExclusionError::AlreadyCreating) => {
|
||||
// Creation is in progress, we cannot create it again, and we cannot
|
||||
// check if this request matches the existing one, so caller must try
|
||||
@@ -4582,7 +4923,7 @@ impl Tenant {
|
||||
&'a self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
create_guard: TimelineCreateGuard<'a>,
|
||||
create_guard: TimelineCreateGuard,
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
) -> anyhow::Result<UninitializedTimeline<'a>> {
|
||||
@@ -4642,7 +4983,7 @@ impl Tenant {
|
||||
/// The `allow_offloaded` parameter controls whether to tolerate the existence of
|
||||
/// offloaded timelines or not.
|
||||
fn create_timeline_create_guard(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
idempotency: CreateTimelineIdempotency,
|
||||
allow_offloaded: bool,
|
||||
@@ -4902,48 +5243,16 @@ async fn run_initdb(
|
||||
|
||||
let _permit = INIT_DB_SEMAPHORE.acquire().await;
|
||||
|
||||
let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
|
||||
initdb_command
|
||||
.args(["--pgdata", initdb_target_dir.as_ref()])
|
||||
.args(["--username", &conf.superuser])
|
||||
.args(["--encoding", "utf8"])
|
||||
.args(["--locale", &conf.locale])
|
||||
.arg("--no-instructions")
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.stdin(std::process::Stdio::null())
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
// we would be interested in the stderr output, if there was any
|
||||
.stderr(std::process::Stdio::piped());
|
||||
|
||||
// Before version 14, only the libc provide was available.
|
||||
if pg_version > 14 {
|
||||
// Version 17 brought with it a builtin locale provider which only provides
|
||||
// C and C.UTF-8. While being safer for collation purposes since it is
|
||||
// guaranteed to be consistent throughout a major release, it is also more
|
||||
// performant.
|
||||
let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
|
||||
|
||||
initdb_command.args(["--locale-provider", locale_provider]);
|
||||
}
|
||||
|
||||
let initdb_proc = initdb_command.spawn()?;
|
||||
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_proc.wait_with_output().await?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
initdb_output.status,
|
||||
initdb_output.stderr,
|
||||
));
|
||||
}
|
||||
let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
|
||||
superuser: &conf.superuser,
|
||||
locale: &conf.locale,
|
||||
initdb_bin: &initdb_bin_path,
|
||||
pg_version,
|
||||
library_search_path: &initdb_lib_dir,
|
||||
pgdata: initdb_target_dir,
|
||||
})
|
||||
.await
|
||||
.map_err(InitdbError::Inner);
|
||||
|
||||
// This isn't true cancellation support, see above. Still return an error to
|
||||
// excercise the cancellation code path.
|
||||
@@ -4951,7 +5260,7 @@ async fn run_initdb(
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
res
|
||||
}
|
||||
|
||||
/// Dump contents of a layer file to stdout.
|
||||
@@ -5047,6 +5356,7 @@ pub(crate) mod harness {
|
||||
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
|
||||
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
|
||||
timeline_offloading: Some(tenant_conf.timeline_offloading),
|
||||
wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ use serde_json::Value;
|
||||
use std::num::NonZeroU64;
|
||||
use std::time::Duration;
|
||||
use utils::generation::Generation;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub(crate) enum AttachmentMode {
|
||||
@@ -353,6 +354,9 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub timeline_offloading: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -418,6 +422,9 @@ impl TenantConfOpt {
|
||||
timeline_offloading: self
|
||||
.lazy_slru_download
|
||||
.unwrap_or(global_conf.timeline_offloading),
|
||||
wal_receiver_protocol_override: self
|
||||
.wal_receiver_protocol_override
|
||||
.or(global_conf.wal_receiver_protocol_override),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -472,6 +479,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
lsn_lease_length: value.lsn_lease_length.map(humantime),
|
||||
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
|
||||
timeline_offloading: value.timeline_offloading,
|
||||
wal_receiver_protocol_override: value.wal_receiver_protocol_override,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1960,7 +1960,7 @@ impl TenantManager {
|
||||
attempt.before_reset_tenant();
|
||||
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, ShutdownMode::Flush).await {
|
||||
match tenant.shutdown(progress, ShutdownMode::Reload).await {
|
||||
Ok(()) => {
|
||||
slot_guard.drop_old_value().expect("it was just shutdown");
|
||||
}
|
||||
|
||||
@@ -199,7 +199,7 @@ use utils::backoff::{
|
||||
use utils::pausable_failpoint;
|
||||
use utils::shard::ShardNumber;
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex, OnceLock};
|
||||
use std::time::Duration;
|
||||
@@ -223,7 +223,7 @@ use crate::task_mgr::shutdown_token;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::download::download_retry;
|
||||
use crate::tenant::storage_layer::AsLayerDesc;
|
||||
use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
|
||||
use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable};
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
@@ -244,6 +244,7 @@ use self::index::IndexPart;
|
||||
use super::config::AttachedLocationConfig;
|
||||
use super::metadata::MetadataUpdate;
|
||||
use super::storage_layer::{Layer, LayerName, ResidentLayer};
|
||||
use super::timeline::import_pgdata;
|
||||
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
|
||||
use super::{DeleteTimelineError, Generation};
|
||||
|
||||
@@ -813,6 +814,18 @@ impl RemoteTimelineClient {
|
||||
Ok(need_wait)
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
|
||||
pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
|
||||
self: &Arc<Self>,
|
||||
state: Option<import_pgdata::index_part_format::Root>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.dirty.import_pgdata = state;
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
@@ -1090,7 +1103,7 @@ impl RemoteTimelineClient {
|
||||
"scheduled layer file upload {layer}",
|
||||
);
|
||||
|
||||
let op = UploadOp::UploadLayer(layer, metadata);
|
||||
let op = UploadOp::UploadLayer(layer, metadata, None);
|
||||
self.metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
@@ -1805,7 +1818,7 @@ impl RemoteTimelineClient {
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
UploadOp::Delete(..) => {
|
||||
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
|
||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||
}
|
||||
@@ -1833,19 +1846,32 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
// We can launch this task. Remove it from the queue first.
|
||||
let next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||
let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||
|
||||
debug!("starting op: {}", next_op);
|
||||
|
||||
// Update the counters
|
||||
match next_op {
|
||||
UploadOp::UploadLayer(_, _) => {
|
||||
// Update the counters and prepare
|
||||
match &mut next_op {
|
||||
UploadOp::UploadLayer(layer, meta, mode) => {
|
||||
if upload_queue
|
||||
.recently_deleted
|
||||
.remove(&(layer.layer_desc().layer_name().clone(), meta.generation))
|
||||
{
|
||||
*mode = Some(OpType::FlushDeletion);
|
||||
} else {
|
||||
*mode = Some(OpType::MayReorder)
|
||||
}
|
||||
upload_queue.num_inprogress_layer_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
UploadOp::Delete(Delete { layers }) => {
|
||||
for (name, meta) in layers {
|
||||
upload_queue
|
||||
.recently_deleted
|
||||
.insert((name.clone(), meta.generation));
|
||||
}
|
||||
upload_queue.num_inprogress_deletions += 1;
|
||||
}
|
||||
UploadOp::Barrier(sender) => {
|
||||
@@ -1921,7 +1947,66 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
|
||||
UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
|
||||
if let Some(OpType::FlushDeletion) = mode {
|
||||
if self.config.read().unwrap().block_deletions {
|
||||
// Of course, this is not efficient... but usually the queue should be empty.
|
||||
let mut queue_locked = self.upload_queue.lock().unwrap();
|
||||
let mut detected = false;
|
||||
if let Ok(queue) = queue_locked.initialized_mut() {
|
||||
for list in queue.blocked_deletions.iter_mut() {
|
||||
list.layers.retain(|(name, meta)| {
|
||||
if name == &layer.layer_desc().layer_name()
|
||||
&& meta.generation == layer_metadata.generation
|
||||
{
|
||||
detected = true;
|
||||
// remove the layer from deletion queue
|
||||
false
|
||||
} else {
|
||||
// keep the layer
|
||||
true
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
if detected {
|
||||
info!(
|
||||
"cancelled blocked deletion of layer {} at gen {:?}",
|
||||
layer.layer_desc().layer_name(),
|
||||
layer_metadata.generation
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// TODO: we did not guarantee that upload task starts after deletion task, so there could be possibly race conditions
|
||||
// that we still get the layer deleted. But this only happens if someone creates a layer immediately after it's deleted,
|
||||
// which is not possible in the current system.
|
||||
info!(
|
||||
"waiting for deletion queue flush to complete before uploading layer {} at gen {:?}",
|
||||
layer.layer_desc().layer_name(),
|
||||
layer_metadata.generation
|
||||
);
|
||||
{
|
||||
// We are going to flush, we can clean up the recently deleted list.
|
||||
let mut queue_locked = self.upload_queue.lock().unwrap();
|
||||
if let Ok(queue) = queue_locked.initialized_mut() {
|
||||
queue.recently_deleted.clear();
|
||||
}
|
||||
}
|
||||
if let Err(e) = self.deletion_queue_client.flush_execute().await {
|
||||
warn!(
|
||||
"failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ",
|
||||
layer.layer_desc().layer_name(),
|
||||
layer_metadata.generation
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
"done flushing deletion queue before uploading layer {} at gen {:?}",
|
||||
layer.layer_desc().layer_name(),
|
||||
layer_metadata.generation
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
let local_path = layer.local_path();
|
||||
|
||||
// We should only be uploading layers created by this `Tenant`'s lifetime, so
|
||||
@@ -2085,7 +2170,7 @@ impl RemoteTimelineClient {
|
||||
upload_queue.inprogress_tasks.remove(&task.task_id);
|
||||
|
||||
let lsn_update = match task.op {
|
||||
UploadOp::UploadLayer(_, _) => {
|
||||
UploadOp::UploadLayer(_, _, _) => {
|
||||
upload_queue.num_inprogress_layer_uploads -= 1;
|
||||
None
|
||||
}
|
||||
@@ -2162,7 +2247,7 @@ impl RemoteTimelineClient {
|
||||
)> {
|
||||
use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
|
||||
let res = match op {
|
||||
UploadOp::UploadLayer(_, m) => (
|
||||
UploadOp::UploadLayer(_, m, _) => (
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
|
||||
@@ -2259,6 +2344,7 @@ impl RemoteTimelineClient {
|
||||
blocked_deletions: Vec::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
recently_deleted: HashSet::new(),
|
||||
};
|
||||
|
||||
let upload_queue = std::mem::replace(
|
||||
|
||||
@@ -706,7 +706,7 @@ where
|
||||
.and_then(|x| x)
|
||||
}
|
||||
|
||||
async fn download_retry_forever<T, O, F>(
|
||||
pub(crate) async fn download_retry_forever<T, O, F>(
|
||||
op: O,
|
||||
description: &str,
|
||||
cancel: &CancellationToken,
|
||||
|
||||
@@ -12,6 +12,7 @@ use utils::id::TimelineId;
|
||||
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::import_pgdata;
|
||||
use crate::tenant::Generation;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
|
||||
@@ -37,6 +38,13 @@ pub struct IndexPart {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub archived_at: Option<NaiveDateTime>,
|
||||
|
||||
/// This field supports import-from-pgdata ("fast imports" platform feature).
|
||||
/// We don't currently use fast imports, so, this field is None for all production timelines.
|
||||
/// See <https://github.com/neondatabase/neon/pull/9218> for more information.
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub import_pgdata: Option<import_pgdata::index_part_format::Root>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
@@ -90,10 +98,11 @@ impl IndexPart {
|
||||
/// - 7: metadata_bytes is no longer written, but still read
|
||||
/// - 8: added `archived_at`
|
||||
/// - 9: +gc_blocking
|
||||
const LATEST_VERSION: usize = 9;
|
||||
/// - 10: +import_pgdata
|
||||
const LATEST_VERSION: usize = 10;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -108,6 +117,7 @@ impl IndexPart {
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -381,6 +391,7 @@ mod tests {
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -425,6 +436,7 @@ mod tests {
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -470,6 +482,7 @@ mod tests {
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -518,6 +531,7 @@ mod tests {
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||
@@ -561,6 +575,7 @@ mod tests {
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -607,6 +622,7 @@ mod tests {
|
||||
},
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -658,6 +674,7 @@ mod tests {
|
||||
},
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -714,6 +731,7 @@ mod tests {
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Default::default(),
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -771,6 +789,7 @@ mod tests {
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Default::default(),
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -833,6 +852,83 @@ mod tests {
|
||||
}),
|
||||
last_aux_file_policy: Default::default(),
|
||||
archived_at: None,
|
||||
import_pgdata: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v10_importpgdata_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 10,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"gc_blocking": {
|
||||
"started_at": "2024-07-19T09:00:00.123",
|
||||
"reasons": ["DetachAncestor"]
|
||||
},
|
||||
"import_pgdata": {
|
||||
"V1": {
|
||||
"Done": {
|
||||
"idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
|
||||
"started_at": "2024-11-13T09:23:42.123",
|
||||
"finished_at": "2024-11-13T09:42:23.123"
|
||||
}
|
||||
}
|
||||
}
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 10,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: Some(GcBlocking {
|
||||
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
|
||||
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
|
||||
}),
|
||||
last_aux_file_policy: Default::default(),
|
||||
archived_at: None,
|
||||
import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
|
||||
started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
|
||||
finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
|
||||
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
|
||||
})))
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
|
||||
@@ -4,6 +4,7 @@ pub mod delete;
|
||||
pub(crate) mod detach_ancestor;
|
||||
mod eviction_task;
|
||||
pub(crate) mod handle;
|
||||
pub(crate) mod import_pgdata;
|
||||
mod init;
|
||||
pub mod layer_manager;
|
||||
pub(crate) mod logical_size;
|
||||
@@ -49,6 +50,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
fs_ext, pausable_failpoint,
|
||||
postgres_client::PostgresClientProtocol,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
};
|
||||
use wal_decoder::serialized_batch::SerializedValueBatch;
|
||||
@@ -892,10 +894,11 @@ pub(crate) enum ShutdownMode {
|
||||
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
|
||||
/// the call to [`Timeline::shutdown`].
|
||||
FreezeAndFlush,
|
||||
/// Only flush the layers to the remote storage without freezing any open layers. This is the
|
||||
/// mode used by ancestor detach and any other operations that reloads a tenant but not increasing
|
||||
/// the generation number.
|
||||
Flush,
|
||||
/// Only flush the layers to the remote storage without freezing any open layers. Flush the deletion
|
||||
/// queue. This is the mode used by ancestor detach and any other operations that reloads a tenant
|
||||
/// but not increasing the generation number. Note that this mode cannot be used at tenant shutdown,
|
||||
/// as flushing the deletion queue at that time will cause shutdown-in-progress errors.
|
||||
Reload,
|
||||
/// Shut down immediately, without waiting for any open layers to flush.
|
||||
Hard,
|
||||
}
|
||||
@@ -1816,7 +1819,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
if let ShutdownMode::Flush = mode {
|
||||
if let ShutdownMode::Reload = mode {
|
||||
// drain the upload queue
|
||||
self.remote_client.shutdown().await;
|
||||
if !self.remote_client.no_pending_work() {
|
||||
@@ -2085,6 +2088,11 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts)
|
||||
}
|
||||
|
||||
pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
}
|
||||
|
||||
pub(crate) fn get_lazy_slru_download(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2172,6 +2180,21 @@ impl Timeline {
|
||||
)
|
||||
}
|
||||
|
||||
/// Resolve the effective WAL receiver protocol to use for this tenant.
|
||||
///
|
||||
/// Priority order is:
|
||||
/// 1. Tenant config override
|
||||
/// 2. Default value for tenant config override
|
||||
/// 3. Pageserver config override
|
||||
/// 4. Pageserver config default
|
||||
pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.wal_receiver_protocol_override
|
||||
.or(self.conf.default_tenant_conf.wal_receiver_protocol_override)
|
||||
.unwrap_or(self.conf.wal_receiver_protocol)
|
||||
}
|
||||
|
||||
pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
|
||||
// NB: Most tenant conf options are read by background loops, so,
|
||||
// changes will automatically be picked up.
|
||||
@@ -2464,6 +2487,7 @@ impl Timeline {
|
||||
*guard = Some(WalReceiver::start(
|
||||
Arc::clone(self),
|
||||
WalReceiverConf {
|
||||
protocol: self.resolve_wal_receiver_protocol(),
|
||||
wal_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
@@ -2647,6 +2671,7 @@ impl Timeline {
|
||||
//
|
||||
// NB: generation numbers naturally protect against this because they disambiguate
|
||||
// (1) and (4)
|
||||
// TODO: this is basically a no-op now, should we remove it?
|
||||
self.remote_client.schedule_barrier()?;
|
||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||
// on retry.
|
||||
@@ -2702,20 +2727,23 @@ impl Timeline {
|
||||
{
|
||||
Some(cancel) => cancel.cancel(),
|
||||
None => {
|
||||
let state = self.current_state();
|
||||
if matches!(
|
||||
state,
|
||||
TimelineState::Broken { .. } | TimelineState::Stopping
|
||||
) {
|
||||
|
||||
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
|
||||
// Don't make noise.
|
||||
} else {
|
||||
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
|
||||
debug_assert!(false);
|
||||
match self.current_state() {
|
||||
TimelineState::Broken { .. } | TimelineState::Stopping => {
|
||||
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
|
||||
// Don't make noise.
|
||||
}
|
||||
TimelineState::Loading => {
|
||||
// Import does not return an activated timeline.
|
||||
info!("discarding priority boost for logical size calculation because timeline is not yet active");
|
||||
}
|
||||
TimelineState::Active => {
|
||||
// activation should be setting the once cell
|
||||
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
|
||||
debug_assert!(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3819,7 +3847,8 @@ impl Timeline {
|
||||
};
|
||||
|
||||
// Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
|
||||
// This makes us refuse ingest until the new layers have been persisted to the remote.
|
||||
// This makes us refuse ingest until the new layers have been persisted to the remote
|
||||
let start = Instant::now();
|
||||
self.remote_client
|
||||
.wait_completion()
|
||||
.await
|
||||
@@ -3832,6 +3861,8 @@ impl Timeline {
|
||||
FlushLayerError::Other(anyhow!(e).into())
|
||||
}
|
||||
})?;
|
||||
let duration = start.elapsed().as_secs_f64();
|
||||
self.metrics.flush_wait_upload_time_gauge_add(duration);
|
||||
|
||||
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||
// a compaction can delete the file and then it won't be available for uploads any more.
|
||||
@@ -5886,7 +5917,7 @@ impl<'a> TimelineWriter<'a> {
|
||||
batch: SerializedValueBatch,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
if batch.is_empty() {
|
||||
if !batch.has_data() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
218
pageserver/src/tenant/timeline/import_pgdata.rs
Normal file
218
pageserver/src/tenant/timeline/import_pgdata.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use remote_storage::RemotePath;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info, info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{context::RequestContext, tenant::metadata::TimelineMetadata};
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
mod flow;
|
||||
mod importbucket_client;
|
||||
mod importbucket_format;
|
||||
pub(crate) mod index_part_format;
|
||||
pub(crate) mod upcall_api;
|
||||
|
||||
pub async fn doit(
|
||||
timeline: &Arc<Timeline>,
|
||||
index_part: index_part_format::Root,
|
||||
ctx: &RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let index_part_format::Root::V1(v1) = index_part;
|
||||
let index_part_format::InProgress {
|
||||
location,
|
||||
idempotency_key,
|
||||
started_at,
|
||||
} = match v1 {
|
||||
index_part_format::V1::Done(_) => return Ok(()),
|
||||
index_part_format::V1::InProgress(in_progress) => in_progress,
|
||||
};
|
||||
|
||||
let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
|
||||
|
||||
info!("get spec early so we know we'll be able to upcall when done");
|
||||
let Some(spec) = storage.get_spec().await? else {
|
||||
bail!("spec not found")
|
||||
};
|
||||
|
||||
let upcall_client =
|
||||
upcall_api::Client::new(timeline.conf, cancel.clone()).context("create upcall client")?;
|
||||
|
||||
//
|
||||
// send an early progress update to clean up k8s job early and generate potentially useful logs
|
||||
//
|
||||
info!("send early progress update");
|
||||
upcall_client
|
||||
.send_progress_until_success(&spec)
|
||||
.instrument(info_span!("early_progress_update"))
|
||||
.await?;
|
||||
|
||||
let status_prefix = RemotePath::from_string("status").unwrap();
|
||||
|
||||
//
|
||||
// See if shard is done.
|
||||
// TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing.
|
||||
//
|
||||
let shard_status_key =
|
||||
status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug()));
|
||||
let shard_status: Option<importbucket_format::ShardStatus> =
|
||||
storage.get_json(&shard_status_key).await?;
|
||||
info!(?shard_status, "peeking shard status");
|
||||
if shard_status.map(|st| st.done).unwrap_or(false) {
|
||||
info!("shard status indicates that the shard is done, skipping import");
|
||||
} else {
|
||||
// TODO: checkpoint the progress into the IndexPart instead of restarting
|
||||
// from the beginning.
|
||||
|
||||
//
|
||||
// Wipe the slate clean - the flow does not allow resuming.
|
||||
// We can implement resuming in the future by checkpointing the progress into the IndexPart.
|
||||
//
|
||||
info!("wipe the slate clean");
|
||||
{
|
||||
// TODO: do we need to hold GC lock for this?
|
||||
let mut guard = timeline.layers.write().await;
|
||||
assert!(
|
||||
guard.layer_map()?.open_layer.is_none(),
|
||||
"while importing, there should be no in-memory layer" // this just seems like a good place to assert it
|
||||
);
|
||||
let all_layers_keys = guard.all_persistent_layers();
|
||||
let all_layers: Vec<_> = all_layers_keys
|
||||
.iter()
|
||||
.map(|key| guard.get_from_key(key))
|
||||
.collect();
|
||||
let open = guard.open_mut().context("open_mut")?;
|
||||
|
||||
timeline.remote_client.schedule_gc_update(&all_layers)?;
|
||||
open.finish_gc_timeline(&all_layers);
|
||||
}
|
||||
|
||||
//
|
||||
// Wait for pgdata to finish uploading
|
||||
//
|
||||
info!("wait for pgdata to reach status 'done'");
|
||||
let pgdata_status_key = status_prefix.join("pgdata");
|
||||
loop {
|
||||
let res = async {
|
||||
let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
|
||||
.get_json(&pgdata_status_key)
|
||||
.await
|
||||
.context("get pgdata status")?;
|
||||
info!(?pgdata_status, "peeking pgdata status");
|
||||
if pgdata_status.map(|st| st.done).unwrap_or(false) {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow::anyhow!("pgdata not done yet"))
|
||||
}
|
||||
}
|
||||
.await;
|
||||
match res {
|
||||
Ok(_) => break,
|
||||
Err(err) => {
|
||||
info!(?err, "indefintely waiting for pgdata to finish");
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
bail!("cancelled while waiting for pgdata");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Do the import
|
||||
//
|
||||
info!("do the import");
|
||||
let control_file = storage.get_control_file().await?;
|
||||
let base_lsn = control_file.base_lsn();
|
||||
|
||||
info!("update TimelineMetadata based on LSNs from control file");
|
||||
{
|
||||
let pg_version = control_file.pg_version();
|
||||
let _ctx: &RequestContext = ctx;
|
||||
async move {
|
||||
// FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
|
||||
// checkpoint record, and prev_record_lsn should point to its beginning.
|
||||
// We should read the real end of the record from the WAL, but here we
|
||||
// just fake it.
|
||||
let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
|
||||
let prev_record_lsn = base_lsn;
|
||||
let metadata = TimelineMetadata::new(
|
||||
disk_consistent_lsn,
|
||||
Some(prev_record_lsn),
|
||||
None, // no ancestor
|
||||
Lsn(0), // no ancestor lsn
|
||||
base_lsn, // latest_gc_cutoff_lsn
|
||||
base_lsn, // initdb_lsn
|
||||
pg_version,
|
||||
);
|
||||
|
||||
let _start_lsn = disk_consistent_lsn + 1;
|
||||
|
||||
timeline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_full_metadata_update(&metadata)?;
|
||||
|
||||
timeline.remote_client.wait_completion().await?;
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
}
|
||||
.await?;
|
||||
|
||||
flow::run(
|
||||
timeline.clone(),
|
||||
base_lsn,
|
||||
control_file,
|
||||
storage.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
//
|
||||
// Communicate that shard is done.
|
||||
//
|
||||
storage
|
||||
.put_json(
|
||||
&shard_status_key,
|
||||
&importbucket_format::ShardStatus { done: true },
|
||||
)
|
||||
.await
|
||||
.context("put shard status")?;
|
||||
}
|
||||
|
||||
//
|
||||
// Ensure at-least-once deliver of the upcall to cplane
|
||||
// before we mark the task as done and never come here again.
|
||||
//
|
||||
info!("send final progress update");
|
||||
upcall_client
|
||||
.send_progress_until_success(&spec)
|
||||
.instrument(info_span!("final_progress_update"))
|
||||
.await?;
|
||||
|
||||
//
|
||||
// Mark as done in index_part.
|
||||
// This makes subsequent timeline loads enter the normal load code path
|
||||
// instead of spawning the import task and calling this here function.
|
||||
//
|
||||
info!("mark import as complete in index part");
|
||||
timeline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1(
|
||||
index_part_format::V1::Done(index_part_format::Done {
|
||||
idempotency_key,
|
||||
started_at,
|
||||
finished_at: chrono::Utc::now().naive_utc(),
|
||||
}),
|
||||
)))?;
|
||||
|
||||
timeline.remote_client.wait_completion().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
798
pageserver/src/tenant/timeline/import_pgdata/flow.rs
Normal file
798
pageserver/src/tenant/timeline/import_pgdata/flow.rs
Normal file
@@ -0,0 +1,798 @@
|
||||
//! Import a PGDATA directory into an empty root timeline.
|
||||
//!
|
||||
//! This module is adapted hackathon code by Heikki and Stas.
|
||||
//! Other code in the parent module was written by Christian as part of a customer PoC.
|
||||
//!
|
||||
//! The hackathon code was producing image layer files as a free-standing program.
|
||||
//!
|
||||
//! It has been modified to
|
||||
//! - run inside a running Pageserver, within the proper lifecycles of Timeline -> Tenant(Shard)
|
||||
//! - => sharding-awareness: produce image layers with only the data relevant for this shard
|
||||
//! - => S3 as the source for the PGDATA instead of local filesystem
|
||||
//!
|
||||
//! TODOs before productionization:
|
||||
//! - ChunkProcessingJob size / ImportJob::total_size does not account for sharding.
|
||||
//! => produced image layers likely too small.
|
||||
//! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size.
|
||||
//! - asserts / unwraps need to be replaced with errors
|
||||
//! - don't trust remote objects will be small (=prevent OOMs in those cases)
|
||||
//! - limit all in-memory buffers in size, or download to disk and read from there
|
||||
//! - limit task concurrency
|
||||
//! - generally play nice with other tenants in the system
|
||||
//! - importbucket is different bucket than main pageserver storage, so, should be fine wrt S3 rate limits
|
||||
//! - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc
|
||||
//! - integrate with layer eviction system
|
||||
//! - audit for Tenant::cancel nor Timeline::cancel responsivity
|
||||
//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!)
|
||||
//!
|
||||
//! An incomplete set of TODOs from the Hackathon:
|
||||
//! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{bail, ensure};
|
||||
use bytes::Bytes;
|
||||
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY},
|
||||
reltag::RelTag,
|
||||
shard::ShardIdentity,
|
||||
};
|
||||
use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, info_span, instrument, Instrument};
|
||||
|
||||
use crate::{
|
||||
assert_u64_eq_usize::UsizeIsU64,
|
||||
pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory},
|
||||
};
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
pgdatadir_mapping::{DbDirectory, RelDirectory},
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::{ImageLayerWriter, Layer},
|
||||
};
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::key::{
|
||||
slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY,
|
||||
TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::singleton_range;
|
||||
use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range};
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Range;
|
||||
|
||||
use super::{
|
||||
importbucket_client::{ControlFile, RemoteStorageWrapper},
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use remote_storage::RemotePath;
|
||||
|
||||
pub async fn run(
|
||||
timeline: Arc<Timeline>,
|
||||
pgdata_lsn: Lsn,
|
||||
control_file: ControlFile,
|
||||
storage: RemoteStorageWrapper,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
Flow {
|
||||
timeline,
|
||||
pgdata_lsn,
|
||||
control_file,
|
||||
tasks: Vec::new(),
|
||||
storage,
|
||||
}
|
||||
.run(ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
struct Flow {
|
||||
timeline: Arc<Timeline>,
|
||||
pgdata_lsn: Lsn,
|
||||
control_file: ControlFile,
|
||||
tasks: Vec<AnyImportTask>,
|
||||
storage: RemoteStorageWrapper,
|
||||
}
|
||||
|
||||
impl Flow {
|
||||
/// Perform the ingestion into [`Self::timeline`].
|
||||
/// Assumes the timeline is empty (= no layers).
|
||||
pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align();
|
||||
|
||||
self.pgdata_lsn = pgdata_lsn;
|
||||
|
||||
let datadir = PgDataDir::new(&self.storage).await?;
|
||||
|
||||
// Import dbdir (00:00:00 keyspace)
|
||||
// This is just constructed here, but will be written to the image layer in the first call to import_db()
|
||||
let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory {
|
||||
dbdirs: datadir
|
||||
.dbs
|
||||
.iter()
|
||||
.map(|db| ((db.spcnode, db.dboid), true))
|
||||
.collect(),
|
||||
})?);
|
||||
self.tasks
|
||||
.push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into());
|
||||
|
||||
// Import databases (00:spcnode:dbnode keyspace for each db)
|
||||
for db in datadir.dbs {
|
||||
self.import_db(&db).await?;
|
||||
}
|
||||
|
||||
// Import SLRUs
|
||||
|
||||
// pg_xact (01:00 keyspace)
|
||||
self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
|
||||
.await?;
|
||||
// pg_multixact/members (01:01 keyspace)
|
||||
self.import_slru(
|
||||
SlruKind::MultiXactMembers,
|
||||
&self.storage.pgdata().join("pg_multixact/members"),
|
||||
)
|
||||
.await?;
|
||||
// pg_multixact/offsets (01:02 keyspace)
|
||||
self.import_slru(
|
||||
SlruKind::MultiXactOffsets,
|
||||
&self.storage.pgdata().join("pg_multixact/offsets"),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Import pg_twophase.
|
||||
// TODO: as empty
|
||||
let twophasedir_buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
|
||||
xids: HashSet::new(),
|
||||
})?;
|
||||
self.tasks
|
||||
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
|
||||
TWOPHASEDIR_KEY,
|
||||
Bytes::from(twophasedir_buf),
|
||||
)));
|
||||
|
||||
// Controlfile, checkpoint
|
||||
self.tasks
|
||||
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
|
||||
CONTROLFILE_KEY,
|
||||
self.control_file.control_file_buf().clone(),
|
||||
)));
|
||||
|
||||
let checkpoint_buf = self
|
||||
.control_file
|
||||
.control_file_data()
|
||||
.checkPointCopy
|
||||
.encode()?;
|
||||
self.tasks
|
||||
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
|
||||
CHECKPOINT_KEY,
|
||||
checkpoint_buf,
|
||||
)));
|
||||
|
||||
// Assigns parts of key space to later parallel jobs
|
||||
let mut last_end_key = Key::MIN;
|
||||
let mut current_chunk = Vec::new();
|
||||
let mut current_chunk_size: usize = 0;
|
||||
let mut parallel_jobs = Vec::new();
|
||||
for task in std::mem::take(&mut self.tasks).into_iter() {
|
||||
if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 {
|
||||
let key_range = last_end_key..task.key_range().start;
|
||||
parallel_jobs.push(ChunkProcessingJob::new(
|
||||
key_range.clone(),
|
||||
std::mem::take(&mut current_chunk),
|
||||
&self,
|
||||
));
|
||||
last_end_key = key_range.end;
|
||||
current_chunk_size = 0;
|
||||
}
|
||||
current_chunk_size += task.total_size();
|
||||
current_chunk.push(task);
|
||||
}
|
||||
parallel_jobs.push(ChunkProcessingJob::new(
|
||||
last_end_key..Key::MAX,
|
||||
current_chunk,
|
||||
&self,
|
||||
));
|
||||
|
||||
// Start all jobs simultaneosly
|
||||
let mut work = JoinSet::new();
|
||||
// TODO: semaphore?
|
||||
for job in parallel_jobs {
|
||||
let ctx: RequestContext =
|
||||
ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);
|
||||
work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job")));
|
||||
}
|
||||
let mut results = Vec::new();
|
||||
while let Some(result) = work.join_next().await {
|
||||
match result {
|
||||
Ok(res) => {
|
||||
results.push(res);
|
||||
}
|
||||
Err(_joinset_err) => {
|
||||
results.push(Err(anyhow::anyhow!(
|
||||
"parallel job panicked or cancelled, check pageserver logs"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if results.iter().all(|r| r.is_ok()) {
|
||||
Ok(())
|
||||
} else {
|
||||
let mut msg = String::new();
|
||||
for result in results {
|
||||
if let Err(err) = result {
|
||||
msg.push_str(&format!("{err:?}\n\n"));
|
||||
}
|
||||
}
|
||||
bail!("Some parallel jobs failed:\n\n{msg}");
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))]
|
||||
async fn import_db(&mut self, db: &PgDataDirDb) -> anyhow::Result<()> {
|
||||
debug!("start");
|
||||
scopeguard::defer! {
|
||||
debug!("return");
|
||||
}
|
||||
|
||||
// Import relmap (00:spcnode:dbnode:00:*:00)
|
||||
let relmap_key = relmap_file_key(db.spcnode, db.dboid);
|
||||
debug!("Constructing relmap entry, key {relmap_key}");
|
||||
let relmap_path = db.path.join("pg_filenode.map");
|
||||
let relmap_buf = self.storage.get(&relmap_path).await?;
|
||||
self.tasks
|
||||
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
|
||||
relmap_key, relmap_buf,
|
||||
)));
|
||||
|
||||
// Import reldir (00:spcnode:dbnode:00:*:01)
|
||||
let reldir_key = rel_dir_to_key(db.spcnode, db.dboid);
|
||||
debug!("Constructing reldirs entry, key {reldir_key}");
|
||||
let reldir_buf = RelDirectory::ser(&RelDirectory {
|
||||
rels: db
|
||||
.files
|
||||
.iter()
|
||||
.map(|f| (f.rel_tag.relnode, f.rel_tag.forknum))
|
||||
.collect(),
|
||||
})?;
|
||||
self.tasks
|
||||
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
|
||||
reldir_key,
|
||||
Bytes::from(reldir_buf),
|
||||
)));
|
||||
|
||||
// Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last
|
||||
// segment in a given relation (00:spcnode:dbnode:reloid:fork:ff)
|
||||
for file in &db.files {
|
||||
debug!(%file.path, %file.filesize, "importing file");
|
||||
let len = file.filesize;
|
||||
ensure!(len % 8192 == 0);
|
||||
let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192);
|
||||
let start_key = rel_block_to_key(file.rel_tag, start_blk);
|
||||
let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32);
|
||||
self.tasks
|
||||
.push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new(
|
||||
*self.timeline.get_shard_identity(),
|
||||
start_key..end_key,
|
||||
&file.path,
|
||||
self.storage.clone(),
|
||||
)));
|
||||
|
||||
// Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff)
|
||||
if let Some(nblocks) = file.nblocks {
|
||||
let size_key = rel_size_to_key(file.rel_tag);
|
||||
//debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}");
|
||||
let buf = nblocks.to_le_bytes();
|
||||
self.tasks
|
||||
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
|
||||
size_key,
|
||||
Bytes::from(buf.to_vec()),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let segments = self.storage.listfilesindir(path).await?;
|
||||
let segments: Vec<(String, u32, usize)> = segments
|
||||
.into_iter()
|
||||
.filter_map(|(path, size)| {
|
||||
let filename = path.object_name()?;
|
||||
let segno = u32::from_str_radix(filename, 16).ok()?;
|
||||
Some((filename.to_string(), segno, size))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Write SlruDir
|
||||
let slrudir_key = slru_dir_to_key(kind);
|
||||
let segnos: HashSet<u32> = segments
|
||||
.iter()
|
||||
.map(|(_path, segno, _size)| *segno)
|
||||
.collect();
|
||||
let slrudir = SlruSegmentDirectory { segments: segnos };
|
||||
let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?;
|
||||
self.tasks
|
||||
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
|
||||
slrudir_key,
|
||||
Bytes::from(slrudir_buf),
|
||||
)));
|
||||
|
||||
for (segpath, segno, size) in segments {
|
||||
// SlruSegBlocks for each segment
|
||||
let p = path.join(&segpath);
|
||||
let file_size = size;
|
||||
ensure!(file_size % 8192 == 0);
|
||||
let nblocks = u32::try_from(file_size / 8192)?;
|
||||
let start_key = slru_block_to_key(kind, segno, 0);
|
||||
let end_key = slru_block_to_key(kind, segno, nblocks);
|
||||
debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment");
|
||||
self.tasks
|
||||
.push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new(
|
||||
*self.timeline.get_shard_identity(),
|
||||
start_key..end_key,
|
||||
&p,
|
||||
self.storage.clone(),
|
||||
)));
|
||||
|
||||
// Followed by SlruSegSize
|
||||
let segsize_key = slru_segment_size_to_key(kind, segno);
|
||||
let segsize_buf = nblocks.to_le_bytes();
|
||||
self.tasks
|
||||
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
|
||||
segsize_key,
|
||||
Bytes::copy_from_slice(&segsize_buf),
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// dbdir iteration tools
|
||||
//
|
||||
|
||||
struct PgDataDir {
|
||||
pub dbs: Vec<PgDataDirDb>, // spcnode, dboid, path
|
||||
}
|
||||
|
||||
struct PgDataDirDb {
|
||||
pub spcnode: u32,
|
||||
pub dboid: u32,
|
||||
pub path: RemotePath,
|
||||
pub files: Vec<PgDataDirDbFile>,
|
||||
}
|
||||
|
||||
struct PgDataDirDbFile {
|
||||
pub path: RemotePath,
|
||||
pub rel_tag: RelTag,
|
||||
pub segno: u32,
|
||||
pub filesize: usize,
|
||||
// Cummulative size of the given fork, set only for the last segment of that fork
|
||||
pub nblocks: Option<usize>,
|
||||
}
|
||||
|
||||
impl PgDataDir {
|
||||
async fn new(storage: &RemoteStorageWrapper) -> anyhow::Result<Self> {
|
||||
let datadir_path = storage.pgdata();
|
||||
// Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first
|
||||
// Traverse database in increasing oid order
|
||||
|
||||
let basedir = &datadir_path.join("base");
|
||||
let db_oids: Vec<_> = storage
|
||||
.listdir(basedir)
|
||||
.await?
|
||||
.into_iter()
|
||||
.filter_map(|path| path.object_name().and_then(|name| name.parse::<u32>().ok()))
|
||||
.sorted()
|
||||
.collect();
|
||||
debug!(?db_oids, "found databases");
|
||||
let mut databases = Vec::new();
|
||||
for dboid in db_oids {
|
||||
databases.push(
|
||||
PgDataDirDb::new(
|
||||
storage,
|
||||
&basedir.join(dboid.to_string()),
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&datadir_path,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
|
||||
// special case for global catalogs
|
||||
databases.push(
|
||||
PgDataDirDb::new(
|
||||
storage,
|
||||
&datadir_path.join("global"),
|
||||
postgres_ffi::pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&datadir_path,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
databases.sort_by_key(|db| (db.spcnode, db.dboid));
|
||||
|
||||
Ok(Self { dbs: databases })
|
||||
}
|
||||
}
|
||||
|
||||
impl PgDataDirDb {
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%dboid, %db_path))]
|
||||
async fn new(
|
||||
storage: &RemoteStorageWrapper,
|
||||
db_path: &RemotePath,
|
||||
spcnode: u32,
|
||||
dboid: u32,
|
||||
datadir_path: &RemotePath,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut files: Vec<PgDataDirDbFile> = storage
|
||||
.listfilesindir(db_path)
|
||||
.await?
|
||||
.into_iter()
|
||||
.filter_map(|(path, size)| {
|
||||
debug!(%path, %size, "found file in dbdir");
|
||||
path.object_name().and_then(|name| {
|
||||
// returns (relnode, forknum, segno)
|
||||
parse_relfilename(name).ok().map(|x| (size, x))
|
||||
})
|
||||
})
|
||||
.sorted_by_key(|(_, relfilename)| *relfilename)
|
||||
.map(|(filesize, (relnode, forknum, segno))| {
|
||||
let rel_tag = RelTag {
|
||||
spcnode,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
};
|
||||
|
||||
let path = datadir_path.join(rel_tag.to_segfile_name(segno));
|
||||
assert!(filesize % BLCKSZ as usize == 0); // TODO: this should result in an error
|
||||
let nblocks = filesize / BLCKSZ as usize;
|
||||
|
||||
PgDataDirDbFile {
|
||||
path,
|
||||
filesize,
|
||||
rel_tag,
|
||||
segno,
|
||||
nblocks: Some(nblocks), // first non-cummulative sizes
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Set cummulative sizes. Do all of that math here, so that later we could easier
|
||||
// parallelize over segments and know with which segments we need to write relsize
|
||||
// entry.
|
||||
let mut cumulative_nblocks: usize = 0;
|
||||
let mut prev_rel_tag: Option<RelTag> = None;
|
||||
for i in 0..files.len() {
|
||||
if prev_rel_tag == Some(files[i].rel_tag) {
|
||||
cumulative_nblocks += files[i].nblocks.unwrap();
|
||||
} else {
|
||||
cumulative_nblocks = files[i].nblocks.unwrap();
|
||||
}
|
||||
|
||||
files[i].nblocks = if i == files.len() - 1 || files[i + 1].rel_tag != files[i].rel_tag {
|
||||
Some(cumulative_nblocks)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
prev_rel_tag = Some(files[i].rel_tag);
|
||||
}
|
||||
|
||||
Ok(PgDataDirDb {
|
||||
files,
|
||||
path: db_path.clone(),
|
||||
spcnode,
|
||||
dboid,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
trait ImportTask {
|
||||
fn key_range(&self) -> Range<Key>;
|
||||
|
||||
fn total_size(&self) -> usize {
|
||||
// TODO: revisit this
|
||||
if is_contiguous_range(&self.key_range()) {
|
||||
contiguous_range_len(&self.key_range()) as usize * 8192
|
||||
} else {
|
||||
u32::MAX as usize
|
||||
}
|
||||
}
|
||||
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize>;
|
||||
}
|
||||
|
||||
struct ImportSingleKeyTask {
|
||||
key: Key,
|
||||
buf: Bytes,
|
||||
}
|
||||
|
||||
impl ImportSingleKeyTask {
|
||||
fn new(key: Key, buf: Bytes) -> Self {
|
||||
ImportSingleKeyTask { key, buf }
|
||||
}
|
||||
}
|
||||
|
||||
impl ImportTask for ImportSingleKeyTask {
|
||||
fn key_range(&self) -> Range<Key> {
|
||||
singleton_range(self.key)
|
||||
}
|
||||
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
layer_writer.put_image(self.key, self.buf, ctx).await?;
|
||||
Ok(1)
|
||||
}
|
||||
}
|
||||
|
||||
struct ImportRelBlocksTask {
|
||||
shard_identity: ShardIdentity,
|
||||
key_range: Range<Key>,
|
||||
path: RemotePath,
|
||||
storage: RemoteStorageWrapper,
|
||||
}
|
||||
|
||||
impl ImportRelBlocksTask {
|
||||
fn new(
|
||||
shard_identity: ShardIdentity,
|
||||
key_range: Range<Key>,
|
||||
path: &RemotePath,
|
||||
storage: RemoteStorageWrapper,
|
||||
) -> Self {
|
||||
ImportRelBlocksTask {
|
||||
shard_identity,
|
||||
key_range,
|
||||
path: path.clone(),
|
||||
storage,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ImportTask for ImportRelBlocksTask {
|
||||
fn key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%self.path))]
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
debug!("Importing relation file");
|
||||
|
||||
let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
|
||||
let (rel_tag_end, end_blk) = self.key_range.end.to_rel_block()?;
|
||||
assert_eq!(rel_tag, rel_tag_end);
|
||||
|
||||
let ranges = (start_blk..end_blk)
|
||||
.enumerate()
|
||||
.filter_map(|(i, blknum)| {
|
||||
let key = rel_block_to_key(rel_tag, blknum);
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
return None;
|
||||
}
|
||||
let file_offset = i.checked_mul(8192).unwrap();
|
||||
Some((
|
||||
vec![key],
|
||||
file_offset,
|
||||
file_offset.checked_add(8192).unwrap(),
|
||||
))
|
||||
})
|
||||
.coalesce(|(mut acc, acc_start, acc_end), (mut key, start, end)| {
|
||||
assert_eq!(key.len(), 1);
|
||||
assert!(!acc.is_empty());
|
||||
assert!(acc_end > acc_start);
|
||||
if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ {
|
||||
acc.push(key.pop().unwrap());
|
||||
Ok((acc, acc_start, end))
|
||||
} else {
|
||||
Err(((acc, acc_start, acc_end), (key, start, end)))
|
||||
}
|
||||
});
|
||||
|
||||
let mut nimages = 0;
|
||||
for (keys, range_start, range_end) in ranges {
|
||||
let range_buf = self
|
||||
.storage
|
||||
.get_range(&self.path, range_start.into_u64(), range_end.into_u64())
|
||||
.await?;
|
||||
let mut buf = Bytes::from(range_buf);
|
||||
// TODO: batched writes
|
||||
for key in keys {
|
||||
let image = buf.split_to(8192);
|
||||
layer_writer.put_image(key, image, ctx).await?;
|
||||
nimages += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(nimages)
|
||||
}
|
||||
}
|
||||
|
||||
struct ImportSlruBlocksTask {
|
||||
shard_identity: ShardIdentity,
|
||||
key_range: Range<Key>,
|
||||
path: RemotePath,
|
||||
storage: RemoteStorageWrapper,
|
||||
}
|
||||
|
||||
impl ImportSlruBlocksTask {
|
||||
fn new(
|
||||
shard_identity: ShardIdentity,
|
||||
key_range: Range<Key>,
|
||||
path: &RemotePath,
|
||||
storage: RemoteStorageWrapper,
|
||||
) -> Self {
|
||||
ImportSlruBlocksTask {
|
||||
shard_identity,
|
||||
key_range,
|
||||
path: path.clone(),
|
||||
storage,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ImportTask for ImportSlruBlocksTask {
|
||||
fn key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
debug!("Importing SLRU segment file {}", self.path);
|
||||
let buf = self.storage.get(&self.path).await?;
|
||||
|
||||
let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?;
|
||||
let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?;
|
||||
let mut blknum = start_blk;
|
||||
let mut nimages = 0;
|
||||
let mut file_offset = 0;
|
||||
while blknum < end_blk {
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
assert!(
|
||||
!self.shard_identity.is_key_disposable(&key),
|
||||
"SLRU keys need to go into every shard"
|
||||
);
|
||||
let buf = &buf[file_offset..(file_offset + 8192)];
|
||||
file_offset += 8192;
|
||||
layer_writer
|
||||
.put_image(key, Bytes::copy_from_slice(buf), ctx)
|
||||
.await?;
|
||||
blknum += 1;
|
||||
nimages += 1;
|
||||
}
|
||||
Ok(nimages)
|
||||
}
|
||||
}
|
||||
|
||||
enum AnyImportTask {
|
||||
SingleKey(ImportSingleKeyTask),
|
||||
RelBlocks(ImportRelBlocksTask),
|
||||
SlruBlocks(ImportSlruBlocksTask),
|
||||
}
|
||||
|
||||
impl ImportTask for AnyImportTask {
|
||||
fn key_range(&self) -> Range<Key> {
|
||||
match self {
|
||||
Self::SingleKey(t) => t.key_range(),
|
||||
Self::RelBlocks(t) => t.key_range(),
|
||||
Self::SlruBlocks(t) => t.key_range(),
|
||||
}
|
||||
}
|
||||
/// returns the number of images put into the `layer_writer`
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
match self {
|
||||
Self::SingleKey(t) => t.doit(layer_writer, ctx).await,
|
||||
Self::RelBlocks(t) => t.doit(layer_writer, ctx).await,
|
||||
Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImportSingleKeyTask> for AnyImportTask {
|
||||
fn from(t: ImportSingleKeyTask) -> Self {
|
||||
Self::SingleKey(t)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImportRelBlocksTask> for AnyImportTask {
|
||||
fn from(t: ImportRelBlocksTask) -> Self {
|
||||
Self::RelBlocks(t)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImportSlruBlocksTask> for AnyImportTask {
|
||||
fn from(t: ImportSlruBlocksTask) -> Self {
|
||||
Self::SlruBlocks(t)
|
||||
}
|
||||
}
|
||||
|
||||
struct ChunkProcessingJob {
|
||||
timeline: Arc<Timeline>,
|
||||
range: Range<Key>,
|
||||
tasks: Vec<AnyImportTask>,
|
||||
|
||||
pgdata_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl ChunkProcessingJob {
|
||||
fn new(range: Range<Key>, tasks: Vec<AnyImportTask>, env: &Flow) -> Self {
|
||||
assert!(env.pgdata_lsn.is_valid());
|
||||
Self {
|
||||
timeline: env.timeline.clone(),
|
||||
range,
|
||||
tasks,
|
||||
pgdata_lsn: env.pgdata_lsn,
|
||||
}
|
||||
}
|
||||
|
||||
async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let mut writer = ImageLayerWriter::new(
|
||||
self.timeline.conf,
|
||||
self.timeline.timeline_id,
|
||||
self.timeline.tenant_shard_id,
|
||||
&self.range,
|
||||
self.pgdata_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut nimages = 0;
|
||||
for task in self.tasks {
|
||||
nimages += task.doit(&mut writer, ctx).await?;
|
||||
}
|
||||
|
||||
let resident_layer = if nimages > 0 {
|
||||
let (desc, path) = writer.finish(ctx).await?;
|
||||
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?
|
||||
} else {
|
||||
// dropping the writer cleans up
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
// this is sharing the same code as create_image_layers
|
||||
let mut guard = self.timeline.layers.write().await;
|
||||
guard
|
||||
.open_mut()?
|
||||
.track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics);
|
||||
crate::tenant::timeline::drop_wlock(guard);
|
||||
|
||||
// Schedule the layer for upload but don't add barriers such as
|
||||
// wait for completion or index upload, so we don't inhibit upload parallelism.
|
||||
// TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?)
|
||||
// TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level.
|
||||
self.timeline
|
||||
.remote_client
|
||||
.schedule_layer_file_upload(resident_layer)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,315 @@
|
||||
use std::{ops::Bound, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::ControlFileData;
|
||||
use remote_storage::{
|
||||
Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath,
|
||||
};
|
||||
use serde::de::DeserializeOwned;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf};
|
||||
|
||||
use super::{importbucket_format, index_part_format};
|
||||
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
location: &index_part_format::Location,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<RemoteStorageWrapper, anyhow::Error> {
|
||||
// FIXME: we probably want some timeout, and we might be able to assume the max file
|
||||
// size on S3 is 1GiB (postgres segment size). But the problem is that the individual
|
||||
// downloaders don't know enough about concurrent downloads to make a guess on the
|
||||
// expected bandwidth and resulting best timeout.
|
||||
let timeout = std::time::Duration::from_secs(24 * 60 * 60);
|
||||
let location_storage = match location {
|
||||
#[cfg(feature = "testing")]
|
||||
index_part_format::Location::LocalFs { path } => {
|
||||
GenericRemoteStorage::LocalFs(remote_storage::LocalFs::new(path.clone(), timeout)?)
|
||||
}
|
||||
index_part_format::Location::AwsS3 {
|
||||
region,
|
||||
bucket,
|
||||
key,
|
||||
} => {
|
||||
// TODO: think about security implications of letting the client specify the bucket & prefix.
|
||||
// It's the most flexible right now, but, possibly we want to move bucket name into PS conf
|
||||
// and force the timeline_id into the prefix?
|
||||
GenericRemoteStorage::AwsS3(Arc::new(
|
||||
remote_storage::S3Bucket::new(
|
||||
&remote_storage::S3Config {
|
||||
bucket_name: bucket.clone(),
|
||||
prefix_in_bucket: Some(key.clone()),
|
||||
bucket_region: region.clone(),
|
||||
endpoint: conf
|
||||
.import_pgdata_aws_endpoint_url
|
||||
.clone()
|
||||
.map(|url| url.to_string()), // by specifying None here, remote_storage/aws-sdk-rust will infer from env
|
||||
concurrency_limit: 100.try_into().unwrap(), // TODO: think about this
|
||||
max_keys_per_list_response: Some(1000), // TODO: think about this
|
||||
upload_storage_class: None, // irrelevant
|
||||
},
|
||||
timeout,
|
||||
)
|
||||
.await
|
||||
.context("setup s3 bucket")?,
|
||||
))
|
||||
}
|
||||
};
|
||||
let storage_wrapper = RemoteStorageWrapper::new(location_storage, cancel);
|
||||
Ok(storage_wrapper)
|
||||
}
|
||||
|
||||
/// Wrap [`remote_storage`] APIs to make it look a bit more like a filesystem API
|
||||
/// such as [`tokio::fs`], which was used in the original implementation of the import code.
|
||||
#[derive(Clone)]
|
||||
pub struct RemoteStorageWrapper {
|
||||
storage: GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl RemoteStorageWrapper {
|
||||
pub fn new(storage: GenericRemoteStorage, cancel: CancellationToken) -> Self {
|
||||
Self { storage, cancel }
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
|
||||
pub async fn listfilesindir(
|
||||
&self,
|
||||
path: &RemotePath,
|
||||
) -> Result<Vec<(RemotePath, usize)>, DownloadError> {
|
||||
assert!(
|
||||
path.object_name().is_some(),
|
||||
"must specify dirname, without trailing slash"
|
||||
);
|
||||
let path = path.add_trailing_slash();
|
||||
|
||||
let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
|
||||
|| async {
|
||||
let Listing { keys, prefixes: _ } = self
|
||||
.storage
|
||||
.list(
|
||||
Some(&path),
|
||||
remote_storage::ListingMode::WithDelimiter,
|
||||
None,
|
||||
&self.cancel,
|
||||
)
|
||||
.await?;
|
||||
let res = keys
|
||||
.into_iter()
|
||||
.map(|ListingObject { key, size, .. }| (key, size.into_usize()))
|
||||
.collect();
|
||||
Ok(res)
|
||||
},
|
||||
&format!("listfilesindir {path:?}"),
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
debug!(?res, "returning");
|
||||
res
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
|
||||
pub async fn listdir(&self, path: &RemotePath) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
assert!(
|
||||
path.object_name().is_some(),
|
||||
"must specify dirname, without trailing slash"
|
||||
);
|
||||
let path = path.add_trailing_slash();
|
||||
|
||||
let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
|
||||
|| async {
|
||||
let Listing { keys, prefixes } = self
|
||||
.storage
|
||||
.list(
|
||||
Some(&path),
|
||||
remote_storage::ListingMode::WithDelimiter,
|
||||
None,
|
||||
&self.cancel,
|
||||
)
|
||||
.await?;
|
||||
let res = keys
|
||||
.into_iter()
|
||||
.map(|ListingObject { key, .. }| key)
|
||||
.chain(prefixes.into_iter())
|
||||
.collect();
|
||||
Ok(res)
|
||||
},
|
||||
&format!("listdir {path:?}"),
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
debug!(?res, "returning");
|
||||
res
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
|
||||
pub async fn get(&self, path: &RemotePath) -> Result<Bytes, DownloadError> {
|
||||
let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
|
||||
|| async {
|
||||
let Download {
|
||||
download_stream, ..
|
||||
} = self
|
||||
.storage
|
||||
.download(path, &DownloadOpts::default(), &self.cancel)
|
||||
.await?;
|
||||
let mut reader = tokio_util::io::StreamReader::new(download_stream);
|
||||
|
||||
// XXX optimize this, can we get the capacity hint from somewhere?
|
||||
let mut buf = Vec::new();
|
||||
tokio::io::copy_buf(&mut reader, &mut buf).await?;
|
||||
Ok(Bytes::from(buf))
|
||||
},
|
||||
&format!("download {path:?}"),
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done");
|
||||
res
|
||||
}
|
||||
|
||||
pub async fn get_spec(&self) -> Result<Option<importbucket_format::Spec>, anyhow::Error> {
|
||||
self.get_json(&RemotePath::from_string("spec.json").unwrap())
|
||||
.await
|
||||
.context("get spec")
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
|
||||
pub async fn get_json<T: DeserializeOwned>(
|
||||
&self,
|
||||
path: &RemotePath,
|
||||
) -> Result<Option<T>, DownloadError> {
|
||||
let buf = match self.get(path).await {
|
||||
Ok(buf) => buf,
|
||||
Err(DownloadError::NotFound) => return Ok(None),
|
||||
Err(err) => return Err(err),
|
||||
};
|
||||
let res = serde_json::from_slice(&buf)
|
||||
.context("serialize")
|
||||
// TODO: own error type
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(Some(res))
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
|
||||
pub async fn put_json<T>(&self, path: &RemotePath, value: &T) -> anyhow::Result<()>
|
||||
where
|
||||
T: serde::Serialize,
|
||||
{
|
||||
let buf = serde_json::to_vec(value)?;
|
||||
let bytes = Bytes::from(buf);
|
||||
utils::backoff::retry(
|
||||
|| async {
|
||||
let size = bytes.len();
|
||||
let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
|
||||
self.storage
|
||||
.upload_storage_object(bytes, size, path, &self.cancel)
|
||||
.await
|
||||
},
|
||||
remote_storage::TimeoutOrCancel::caused_by_cancel,
|
||||
1,
|
||||
u32::MAX,
|
||||
&format!("put json {path}"),
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.expect("practically infinite retries")
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
|
||||
pub async fn get_range(
|
||||
&self,
|
||||
path: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: u64,
|
||||
) -> Result<Vec<u8>, DownloadError> {
|
||||
let len = end_exclusive
|
||||
.checked_sub(start_inclusive)
|
||||
.unwrap()
|
||||
.into_usize();
|
||||
let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
|
||||
|| async {
|
||||
let Download {
|
||||
download_stream, ..
|
||||
} = self
|
||||
.storage
|
||||
.download(
|
||||
path,
|
||||
&DownloadOpts {
|
||||
etag: None,
|
||||
byte_start: Bound::Included(start_inclusive),
|
||||
byte_end: Bound::Excluded(end_exclusive)
|
||||
},
|
||||
&self.cancel)
|
||||
.await?;
|
||||
let mut reader = tokio_util::io::StreamReader::new(download_stream);
|
||||
|
||||
let mut buf = Vec::with_capacity(len);
|
||||
tokio::io::copy_buf(&mut reader, &mut buf).await?;
|
||||
Ok(buf)
|
||||
},
|
||||
&format!("download range len=0x{len:x} [0x{start_inclusive:x},0x{end_exclusive:x}) from {path:?}"),
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done");
|
||||
res
|
||||
}
|
||||
|
||||
pub fn pgdata(&self) -> RemotePath {
|
||||
RemotePath::from_string("pgdata").unwrap()
|
||||
}
|
||||
|
||||
pub async fn get_control_file(&self) -> Result<ControlFile, anyhow::Error> {
|
||||
let control_file_path = self.pgdata().join("global/pg_control");
|
||||
info!("get control file from {control_file_path}");
|
||||
let control_file_buf = self.get(&control_file_path).await?;
|
||||
ControlFile::new(control_file_buf)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ControlFile {
|
||||
control_file_data: ControlFileData,
|
||||
control_file_buf: Bytes,
|
||||
}
|
||||
|
||||
impl ControlFile {
|
||||
pub(crate) fn new(control_file_buf: Bytes) -> Result<Self, anyhow::Error> {
|
||||
// XXX ControlFileData is version-specific, we're always using v14 here. v17 had changes.
|
||||
let control_file_data = ControlFileData::decode(&control_file_buf)?;
|
||||
let control_file = ControlFile {
|
||||
control_file_data,
|
||||
control_file_buf,
|
||||
};
|
||||
control_file.try_pg_version()?; // so that we can offer infallible pg_version()
|
||||
Ok(control_file)
|
||||
}
|
||||
pub(crate) fn base_lsn(&self) -> Lsn {
|
||||
Lsn(self.control_file_data.checkPoint).align()
|
||||
}
|
||||
pub(crate) fn pg_version(&self) -> u32 {
|
||||
self.try_pg_version()
|
||||
.expect("prepare() checks that try_pg_version doesn't error")
|
||||
}
|
||||
pub(crate) fn control_file_data(&self) -> &ControlFileData {
|
||||
&self.control_file_data
|
||||
}
|
||||
pub(crate) fn control_file_buf(&self) -> &Bytes {
|
||||
&self.control_file_buf
|
||||
}
|
||||
fn try_pg_version(&self) -> anyhow::Result<u32> {
|
||||
Ok(match self.control_file_data.catalog_version_no {
|
||||
// thesea are from catversion.h
|
||||
202107181 => 14,
|
||||
202209061 => 15,
|
||||
202307071 => 16,
|
||||
/* XXX pg17 */
|
||||
catversion => {
|
||||
anyhow::bail!("unrecognized catalog version {catversion}")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub struct PgdataStatus {
|
||||
pub done: bool,
|
||||
// TODO: remaining fields
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ShardStatus {
|
||||
pub done: bool,
|
||||
// TODO: remaining fields
|
||||
}
|
||||
|
||||
// TODO: dedupe with fast_import code
|
||||
#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Spec {
|
||||
pub project_id: String,
|
||||
pub branch_id: String,
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub enum Root {
|
||||
V1(V1),
|
||||
}
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub enum V1 {
|
||||
InProgress(InProgress),
|
||||
Done(Done),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
#[serde(transparent)]
|
||||
pub struct IdempotencyKey(String);
|
||||
|
||||
impl IdempotencyKey {
|
||||
pub fn new(s: String) -> Self {
|
||||
Self(s)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub struct InProgress {
|
||||
pub idempotency_key: IdempotencyKey,
|
||||
pub location: Location,
|
||||
pub started_at: chrono::NaiveDateTime,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Done {
|
||||
pub idempotency_key: IdempotencyKey,
|
||||
pub started_at: chrono::NaiveDateTime,
|
||||
pub finished_at: chrono::NaiveDateTime,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub enum Location {
|
||||
#[cfg(feature = "testing")]
|
||||
LocalFs { path: Utf8PathBuf },
|
||||
AwsS3 {
|
||||
region: String,
|
||||
bucket: String,
|
||||
key: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl Root {
|
||||
pub fn is_done(&self) -> bool {
|
||||
match self {
|
||||
Root::V1(v1) => match v1 {
|
||||
V1::Done(_) => true,
|
||||
V1::InProgress(_) => false,
|
||||
},
|
||||
}
|
||||
}
|
||||
pub fn idempotency_key(&self) -> &IdempotencyKey {
|
||||
match self {
|
||||
Root::V1(v1) => match v1 {
|
||||
V1::InProgress(in_progress) => &in_progress.idempotency_key,
|
||||
V1::Done(done) => &done.idempotency_key,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
119
pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
Normal file
119
pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
Normal file
@@ -0,0 +1,119 @@
|
||||
//! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate.
|
||||
use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::error;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use reqwest::Method;
|
||||
|
||||
use super::importbucket_format::Spec;
|
||||
|
||||
pub struct Client {
|
||||
base_url: String,
|
||||
authorization_header: Option<String>,
|
||||
client: reqwest::Client,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct ImportProgressRequest {
|
||||
// no fields yet, not sure if there every will be any
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct ImportProgressResponse {
|
||||
// we don't care
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn new(conf: &PageServerConf, cancel: CancellationToken) -> anyhow::Result<Self> {
|
||||
let Some(ref base_url) = conf.import_pgdata_upcall_api else {
|
||||
anyhow::bail!("import_pgdata_upcall_api is not configured")
|
||||
};
|
||||
Ok(Self {
|
||||
base_url: base_url.to_string(),
|
||||
client: reqwest::Client::new(),
|
||||
cancel,
|
||||
authorization_header: conf
|
||||
.import_pgdata_upcall_api_token
|
||||
.as_ref()
|
||||
.map(|secret_string| secret_string.get_contents())
|
||||
.map(|jwt| format!("Bearer {jwt}")),
|
||||
})
|
||||
}
|
||||
|
||||
fn start_request<U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
) -> reqwest::RequestBuilder {
|
||||
let req = self.client.request(method, uri);
|
||||
if let Some(value) = &self.authorization_header {
|
||||
req.header(reqwest::header::AUTHORIZATION, value)
|
||||
} else {
|
||||
req
|
||||
}
|
||||
}
|
||||
|
||||
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
body: B,
|
||||
) -> Result<reqwest::Response> {
|
||||
self.start_request(method, uri)
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
body: B,
|
||||
) -> Result<reqwest::Response> {
|
||||
let res = self.request_noerror(method, uri, body).await?;
|
||||
let response = res.error_from_body().await?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub async fn send_progress_once(&self, spec: &Spec) -> Result<()> {
|
||||
let url = format!(
|
||||
"{}/projects/{}/branches/{}/import_progress",
|
||||
self.base_url, spec.project_id, spec.branch_id
|
||||
);
|
||||
let ImportProgressResponse {} = self
|
||||
.request(Method::POST, url, &ImportProgressRequest {})
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn send_progress_until_success(&self, spec: &Spec) -> anyhow::Result<()> {
|
||||
loop {
|
||||
match self.send_progress_once(spec).await {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(Error::Cancelled) => return Err(anyhow::anyhow!("cancelled")),
|
||||
Err(err) => {
|
||||
error!(?err, "error sending progress, retrying");
|
||||
if tokio::time::timeout(
|
||||
std::time::Duration::from_secs(10),
|
||||
self.cancel.cancelled(),
|
||||
)
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
anyhow::bail!("cancelled while sending early progress update");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline(
|
||||
}
|
||||
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Flush).await;
|
||||
timeline.shutdown(super::ShutdownMode::Reload).await;
|
||||
|
||||
// TODO extend guard mechanism above with method
|
||||
// to make deletions possible while offloading is in progress
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use tracing::{error, info, info_span};
|
||||
use utils::{fs_ext, id::TimelineId, lsn::Lsn};
|
||||
use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard};
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
@@ -23,14 +23,14 @@ use super::Timeline;
|
||||
pub struct UninitializedTimeline<'t> {
|
||||
pub(crate) owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
|
||||
}
|
||||
|
||||
impl<'t> UninitializedTimeline<'t> {
|
||||
pub(crate) fn new(
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
|
||||
) -> Self {
|
||||
Self {
|
||||
owning_tenant,
|
||||
@@ -87,6 +87,10 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn finish_creation_myself(&mut self) -> (Arc<Timeline>, TimelineCreateGuard) {
|
||||
self.raw_timeline.take().expect("already checked")
|
||||
}
|
||||
|
||||
/// Prepares timeline data by loading it from the basebackup archive.
|
||||
pub(crate) async fn import_basebackup_from_tar(
|
||||
self,
|
||||
@@ -167,9 +171,10 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
|
||||
/// A guard for timeline creations in process: as long as this object exists, the timeline ID
|
||||
/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
|
||||
#[must_use]
|
||||
pub(crate) struct TimelineCreateGuard<'t> {
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
pub(crate) struct TimelineCreateGuard {
|
||||
pub(crate) _tenant_gate_guard: GateGuard,
|
||||
pub(crate) owning_tenant: Arc<Tenant>,
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
pub(crate) timeline_path: Utf8PathBuf,
|
||||
pub(crate) idempotency: CreateTimelineIdempotency,
|
||||
}
|
||||
@@ -184,20 +189,27 @@ pub(crate) enum TimelineExclusionError {
|
||||
},
|
||||
#[error("Already creating")]
|
||||
AlreadyCreating,
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
|
||||
// e.g. I/O errors, or some failure deep in postgres initdb
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl<'t> TimelineCreateGuard<'t> {
|
||||
impl TimelineCreateGuard {
|
||||
pub(crate) fn new(
|
||||
owning_tenant: &'t Tenant,
|
||||
owning_tenant: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
timeline_path: Utf8PathBuf,
|
||||
idempotency: CreateTimelineIdempotency,
|
||||
allow_offloaded: bool,
|
||||
) -> Result<Self, TimelineExclusionError> {
|
||||
let _tenant_gate_guard = owning_tenant
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| TimelineExclusionError::ShuttingDown)?;
|
||||
|
||||
// Lock order: this is the only place we take both locks. During drop() we only
|
||||
// lock creating_timelines
|
||||
let timelines = owning_tenant.timelines.lock().unwrap();
|
||||
@@ -225,8 +237,12 @@ impl<'t> TimelineCreateGuard<'t> {
|
||||
return Err(TimelineExclusionError::AlreadyCreating);
|
||||
}
|
||||
creating_timelines.insert(timeline_id);
|
||||
drop(creating_timelines);
|
||||
drop(timelines_offloaded);
|
||||
drop(timelines);
|
||||
Ok(Self {
|
||||
owning_tenant,
|
||||
_tenant_gate_guard,
|
||||
owning_tenant: Arc::clone(owning_tenant),
|
||||
timeline_id,
|
||||
timeline_path,
|
||||
idempotency,
|
||||
@@ -234,7 +250,7 @@ impl<'t> TimelineCreateGuard<'t> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineCreateGuard<'_> {
|
||||
impl Drop for TimelineCreateGuard {
|
||||
fn drop(&mut self) {
|
||||
self.owning_tenant
|
||||
.timelines_creating
|
||||
|
||||
@@ -38,6 +38,7 @@ use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use self::connection_manager::ConnectionManagerStatus;
|
||||
|
||||
@@ -45,6 +46,7 @@ use super::Timeline;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct WalReceiverConf {
|
||||
pub protocol: PostgresClientProtocol,
|
||||
/// The timeout on the connection to safekeeper for WAL streaming.
|
||||
pub wal_connect_timeout: Duration,
|
||||
/// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.
|
||||
|
||||
@@ -36,7 +36,9 @@ use postgres_connection::PgConnectionConfig;
|
||||
use utils::backoff::{
|
||||
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::postgres_client::wal_stream_connection_config;
|
||||
use utils::postgres_client::{
|
||||
wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol,
|
||||
};
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -533,6 +535,7 @@ impl ConnectionManagerState {
|
||||
let node_id = new_sk.safekeeper_id;
|
||||
let connect_timeout = self.conf.wal_connect_timeout;
|
||||
let ingest_batch_size = self.conf.ingest_batch_size;
|
||||
let protocol = self.conf.protocol;
|
||||
let timeline = Arc::clone(&self.timeline);
|
||||
let ctx = ctx.detached_child(
|
||||
TaskKind::WalReceiverConnectionHandler,
|
||||
@@ -546,6 +549,7 @@ impl ConnectionManagerState {
|
||||
|
||||
let res = super::walreceiver_connection::handle_walreceiver_connection(
|
||||
timeline,
|
||||
protocol,
|
||||
new_sk.wal_source_connconf,
|
||||
events_sender,
|
||||
cancellation.clone(),
|
||||
@@ -984,15 +988,33 @@ impl ConnectionManagerState {
|
||||
if info.safekeeper_connstr.is_empty() {
|
||||
return None; // no connection string, ignore sk
|
||||
}
|
||||
match wal_stream_connection_config(
|
||||
self.id,
|
||||
info.safekeeper_connstr.as_ref(),
|
||||
match &self.conf.auth_token {
|
||||
None => None,
|
||||
Some(x) => Some(x),
|
||||
|
||||
let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
|
||||
PostgresClientProtocol::Vanilla => {
|
||||
(None, None, None)
|
||||
},
|
||||
self.conf.availability_zone.as_deref(),
|
||||
) {
|
||||
PostgresClientProtocol::Interpreted { .. } => {
|
||||
let shard_identity = self.timeline.get_shard_identity();
|
||||
(
|
||||
Some(shard_identity.number.0),
|
||||
Some(shard_identity.count.0),
|
||||
Some(shard_identity.stripe_size.0),
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
let connection_conf_args = ConnectionConfigArgs {
|
||||
protocol: self.conf.protocol,
|
||||
ttid: self.id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
shard_stripe_size,
|
||||
listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
|
||||
auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
|
||||
availability_zone: self.conf.availability_zone.as_deref()
|
||||
};
|
||||
|
||||
match wal_stream_connection_config(connection_conf_args) {
|
||||
Ok(connstr) => Some((*sk_id, info, connstr)),
|
||||
Err(e) => {
|
||||
error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id);
|
||||
@@ -1096,6 +1118,7 @@ impl ReconnectReason {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
|
||||
use url::Host;
|
||||
|
||||
fn dummy_broker_sk_timeline(
|
||||
@@ -1532,6 +1555,7 @@ mod tests {
|
||||
timeline,
|
||||
cancel: CancellationToken::new(),
|
||||
conf: WalReceiverConf {
|
||||
protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
|
||||
wal_connect_timeout: Duration::from_secs(1),
|
||||
lagging_wal_timeout: Duration::from_secs(1),
|
||||
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
|
||||
|
||||
@@ -22,7 +22,10 @@ use tokio::{select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, trace, warn, Instrument};
|
||||
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord};
|
||||
use wal_decoder::{
|
||||
models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords},
|
||||
wire_format::FromWireFormat,
|
||||
};
|
||||
|
||||
use super::TaskStateUpdate;
|
||||
use crate::{
|
||||
@@ -36,7 +39,7 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use utils::{id::NodeId, lsn::Lsn};
|
||||
use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
|
||||
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
|
||||
|
||||
/// Status of the connection.
|
||||
@@ -109,6 +112,7 @@ impl From<WalDecodeError> for WalReceiverError {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) async fn handle_walreceiver_connection(
|
||||
timeline: Arc<Timeline>,
|
||||
protocol: PostgresClientProtocol,
|
||||
wal_source_connconf: PgConnectionConfig,
|
||||
events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
|
||||
cancellation: CancellationToken,
|
||||
@@ -260,6 +264,14 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
|
||||
|
||||
let interpreted_proto_config = match protocol {
|
||||
PostgresClientProtocol::Vanilla => None,
|
||||
PostgresClientProtocol::Interpreted {
|
||||
format,
|
||||
compression,
|
||||
} => Some((format, compression)),
|
||||
};
|
||||
|
||||
while let Some(replication_message) = {
|
||||
select! {
|
||||
_ = cancellation.cancelled() => {
|
||||
@@ -291,6 +303,15 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
connection_status.latest_connection_update = now;
|
||||
connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
|
||||
}
|
||||
ReplicationMessage::RawInterpretedWalRecords(raw) => {
|
||||
connection_status.latest_connection_update = now;
|
||||
if !raw.data().is_empty() {
|
||||
connection_status.latest_wal_update = now;
|
||||
}
|
||||
|
||||
connection_status.commit_lsn = Some(Lsn::from(raw.commit_lsn()));
|
||||
connection_status.streaming_lsn = Some(Lsn::from(raw.streaming_lsn()));
|
||||
}
|
||||
&_ => {}
|
||||
};
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
|
||||
@@ -298,7 +319,148 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
async fn commit(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
uncommitted: &mut u64,
|
||||
filtered: &mut u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(*uncommitted - *filtered);
|
||||
modification.commit(ctx).await?;
|
||||
*uncommitted = 0;
|
||||
*filtered = 0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
let status_update = match replication_message {
|
||||
ReplicationMessage::RawInterpretedWalRecords(raw) => {
|
||||
WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
|
||||
|
||||
let mut uncommitted_records = 0;
|
||||
let mut filtered_records = 0;
|
||||
|
||||
// This is the end LSN of the raw WAL from which the records
|
||||
// were interpreted.
|
||||
let streaming_lsn = Lsn::from(raw.streaming_lsn());
|
||||
|
||||
let (format, compression) = interpreted_proto_config.unwrap();
|
||||
let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression)
|
||||
.await
|
||||
.with_context(|| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to deserialize interpreted records ending at LSN {streaming_lsn}"
|
||||
)
|
||||
})?;
|
||||
|
||||
let InterpretedWalRecords {
|
||||
records,
|
||||
next_record_lsn,
|
||||
} = batch;
|
||||
|
||||
tracing::debug!(
|
||||
"Received WAL up to {} with next_record_lsn={:?}",
|
||||
streaming_lsn,
|
||||
next_record_lsn
|
||||
);
|
||||
|
||||
// We start the modification at 0 because each interpreted record
|
||||
// advances it to its end LSN. 0 is just an initialization placeholder.
|
||||
let mut modification = timeline.begin_modification(Lsn(0));
|
||||
|
||||
for interpreted in records {
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
{
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
let local_next_record_lsn = interpreted.next_record_lsn;
|
||||
let ingested = walingest
|
||||
.ingest_record(interpreted, &mut modification, &ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("could not ingest record at {local_next_record_lsn}")
|
||||
})?;
|
||||
|
||||
if !ingested {
|
||||
tracing::debug!(
|
||||
"ingest: filtered out record @ LSN {local_next_record_lsn}"
|
||||
);
|
||||
WAL_INGEST.records_filtered.inc();
|
||||
filtered_records += 1;
|
||||
}
|
||||
|
||||
uncommitted_records += 1;
|
||||
|
||||
// FIXME: this cannot be made pausable_failpoint without fixing the
|
||||
// failpoint library; in tests, the added amount of debugging will cause us
|
||||
// to timeout the tests.
|
||||
fail_point!("walreceiver-after-ingest");
|
||||
|
||||
// Commit every ingest_batch_size records. Even if we filtered out
|
||||
// all records, we still need to call commit to advance the LSN.
|
||||
if uncommitted_records >= ingest_batch_size
|
||||
|| modification.approx_pending_bytes()
|
||||
> DatadirModification::MAX_PENDING_BYTES
|
||||
{
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Records might have been filtered out on the safekeeper side, but we still
|
||||
// need to advance last record LSN on all shards. If we've not ingested the latest
|
||||
// record, then set the LSN of the modification past it. This way all shards
|
||||
// advance their last record LSN at the same time.
|
||||
let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
|
||||
Some(lsn) if lsn > modification.get_lsn() => {
|
||||
modification.set_lsn(lsn).unwrap();
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
|
||||
if uncommitted_records > 0 || needs_last_record_lsn_advance {
|
||||
// Commit any uncommitted records
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
if !caught_up && streaming_lsn >= end_of_wal {
|
||||
info!("caught up at LSN {streaming_lsn}");
|
||||
caught_up = true;
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
"Ingested WAL up to {streaming_lsn}. Last record LSN is {}",
|
||||
timeline.get_last_record_lsn()
|
||||
);
|
||||
|
||||
if let Some(lsn) = next_record_lsn {
|
||||
last_rec_lsn = lsn;
|
||||
}
|
||||
|
||||
Some(streaming_lsn)
|
||||
}
|
||||
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
@@ -316,21 +478,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let mut uncommitted_records = 0;
|
||||
let mut filtered_records = 0;
|
||||
|
||||
async fn commit(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
uncommitted: &mut u64,
|
||||
filtered: &mut u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(*uncommitted - *filtered);
|
||||
modification.commit(ctx).await?;
|
||||
*uncommitted = 0;
|
||||
*filtered = 0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
|
||||
@@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
|
||||
@@ -14,7 +15,6 @@ use utils::lsn::AtomicLsn;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use utils::generation::Generation;
|
||||
|
||||
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
|
||||
@@ -38,6 +38,12 @@ impl UploadQueue {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
|
||||
pub(crate) enum OpType {
|
||||
MayReorder,
|
||||
FlushDeletion,
|
||||
}
|
||||
|
||||
/// This keeps track of queued and in-progress tasks.
|
||||
pub(crate) struct UploadQueueInitialized {
|
||||
/// Counter to assign task IDs
|
||||
@@ -88,6 +94,9 @@ pub(crate) struct UploadQueueInitialized {
|
||||
#[cfg(feature = "testing")]
|
||||
pub(crate) dangling_files: HashMap<LayerName, Generation>,
|
||||
|
||||
/// Ensure we order file operations correctly.
|
||||
pub(crate) recently_deleted: HashSet<(LayerName, Generation)>,
|
||||
|
||||
/// Deletions that are blocked by the tenant configuration
|
||||
pub(crate) blocked_deletions: Vec<Delete>,
|
||||
|
||||
@@ -183,6 +192,7 @@ impl UploadQueue {
|
||||
queued_operations: VecDeque::new(),
|
||||
#[cfg(feature = "testing")]
|
||||
dangling_files: HashMap::new(),
|
||||
recently_deleted: HashSet::new(),
|
||||
blocked_deletions: Vec::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
@@ -224,6 +234,7 @@ impl UploadQueue {
|
||||
queued_operations: VecDeque::new(),
|
||||
#[cfg(feature = "testing")]
|
||||
dangling_files: HashMap::new(),
|
||||
recently_deleted: HashSet::new(),
|
||||
blocked_deletions: Vec::new(),
|
||||
shutting_down: false,
|
||||
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||
@@ -282,8 +293,8 @@ pub(crate) struct Delete {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum UploadOp {
|
||||
/// Upload a layer file
|
||||
UploadLayer(ResidentLayer, LayerFileMetadata),
|
||||
/// Upload a layer file. The last field indicates the last operation for thie file.
|
||||
UploadLayer(ResidentLayer, LayerFileMetadata, Option<OpType>),
|
||||
|
||||
/// Upload a index_part.json file
|
||||
UploadMetadata {
|
||||
@@ -305,11 +316,11 @@ pub(crate) enum UploadOp {
|
||||
impl std::fmt::Display for UploadOp {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
UploadOp::UploadLayer(layer, metadata) => {
|
||||
UploadOp::UploadLayer(layer, metadata, mode) => {
|
||||
write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?}, gen={:?})",
|
||||
layer, metadata.file_size, metadata.generation
|
||||
"UploadLayer({}, size={:?}, gen={:?}, mode={:?})",
|
||||
layer, metadata.file_size, metadata.generation, mode
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata { uploaded, .. } => {
|
||||
|
||||
@@ -19,7 +19,7 @@ impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign<A>> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
|
||||
impl<const N: usize, A: Alignment> Deref for AlignedSlice<'_, N, A> {
|
||||
type Target = [u8; N];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
@@ -27,13 +27,13 @@ impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> {
|
||||
impl<const N: usize, A: Alignment> DerefMut for AlignedSlice<'_, N, A> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.buf
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> {
|
||||
impl<const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'_, N, A> {
|
||||
fn as_ref(&self) -> &[u8; N] {
|
||||
self.buf
|
||||
}
|
||||
|
||||
@@ -334,14 +334,32 @@ impl WalIngest {
|
||||
// replaying it would fail to find the previous image of the page, because
|
||||
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
|
||||
// record if it doesn't.
|
||||
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
|
||||
//
|
||||
// TODO: analyze the metrics and tighten this up accordingly. This logic
|
||||
// implicitly assumes that VM pages see explicit WAL writes before
|
||||
// implicit ClearVmBits, and will otherwise silently drop updates.
|
||||
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
|
||||
WAL_INGEST
|
||||
.clear_vm_bits_unknown
|
||||
.with_label_values(&["relation"])
|
||||
.inc();
|
||||
return Ok(());
|
||||
};
|
||||
if let Some(blknum) = new_vm_blk {
|
||||
if blknum >= vm_size {
|
||||
WAL_INGEST
|
||||
.clear_vm_bits_unknown
|
||||
.with_label_values(&["new_page"])
|
||||
.inc();
|
||||
new_vm_blk = None;
|
||||
}
|
||||
}
|
||||
if let Some(blknum) = old_vm_blk {
|
||||
if blknum >= vm_size {
|
||||
WAL_INGEST
|
||||
.clear_vm_bits_unknown
|
||||
.with_label_values(&["old_page"])
|
||||
.inc();
|
||||
old_vm_blk = None;
|
||||
}
|
||||
}
|
||||
@@ -572,7 +590,8 @@ impl WalIngest {
|
||||
modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
|
||||
fsm_physical_page_no += 1;
|
||||
}
|
||||
let nblocks = get_relsize(modification, rel, ctx).await?;
|
||||
// TODO: re-examine the None case here wrt. sharding; should we error?
|
||||
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
|
||||
if nblocks > fsm_physical_page_no {
|
||||
// check if something to do: FSM is larger than truncate position
|
||||
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
|
||||
@@ -612,7 +631,8 @@ impl WalIngest {
|
||||
)?;
|
||||
vm_page_no += 1;
|
||||
}
|
||||
let nblocks = get_relsize(modification, rel, ctx).await?;
|
||||
// TODO: re-examine the None case here wrt. sharding; should we error?
|
||||
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
|
||||
if nblocks > vm_page_no {
|
||||
// check if something to do: VM is larger than truncate position
|
||||
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
|
||||
@@ -1430,24 +1450,27 @@ impl WalIngest {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the size of the relation as of this modification, or None if the relation doesn't exist.
|
||||
///
|
||||
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
|
||||
/// page number stored in the shard, or None if the shard does not have any pages for it.
|
||||
async fn get_relsize(
|
||||
modification: &DatadirModification<'_>,
|
||||
rel: RelTag,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
let nblocks = if !modification
|
||||
) -> Result<Option<BlockNumber>, PageReconstructError> {
|
||||
if !modification
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(modification), ctx)
|
||||
.await?
|
||||
{
|
||||
0
|
||||
} else {
|
||||
modification
|
||||
.tline
|
||||
.get_rel_size(rel, Version::Modified(modification), ctx)
|
||||
.await?
|
||||
};
|
||||
Ok(nblocks)
|
||||
return Ok(None);
|
||||
}
|
||||
modification
|
||||
.tline
|
||||
.get_rel_size(rel, Version::Modified(modification), ctx)
|
||||
.await
|
||||
.map(Some)
|
||||
}
|
||||
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
|
||||
Reference in New Issue
Block a user