mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 00:50:36 +00:00
Merge remote-tracking branch 'origin/main' into problame/write-path-larger-buffers
This commit is contained in:
@@ -27,30 +27,50 @@
|
||||
//!
|
||||
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-03-20 on i3en.3xlarge
|
||||
//! 2024-04-15 on i3en.3xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
|
||||
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
|
||||
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
|
||||
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
|
||||
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
|
||||
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
|
||||
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
|
||||
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
|
||||
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
|
||||
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
|
||||
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
|
||||
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
|
||||
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
|
||||
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
|
||||
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
|
||||
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
|
||||
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
|
||||
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
|
||||
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
|
||||
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
|
||||
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
|
||||
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
|
||||
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
|
||||
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
|
||||
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
|
||||
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
|
||||
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
|
||||
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
|
||||
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
|
||||
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
|
||||
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
|
||||
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
|
||||
//! ```
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::{PostgresRedoManager, ProcessKind},
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
@@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-short"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
fn bench_impl(
|
||||
process_kind: ProcessKind,
|
||||
redo_work: Arc<Request>,
|
||||
n_redos: u64,
|
||||
nclients: u64,
|
||||
) -> Duration {
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
conf.walredo_process_kind = process_kind;
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
@@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
// divide the amount of work equally among the clients.
|
||||
let nredos_per_client = n_redos / nclients;
|
||||
for _ in 0..nclients {
|
||||
rt.block_on(async {
|
||||
tasks.spawn(client(
|
||||
Arc::clone(&manager),
|
||||
Arc::clone(&start),
|
||||
Arc::clone(&redo_work),
|
||||
// divide the amount of work equally among the clients
|
||||
n_redos / nclients,
|
||||
nredos_per_client,
|
||||
))
|
||||
});
|
||||
}
|
||||
|
||||
rt.block_on(async move {
|
||||
let mut total_wallclock_time = std::time::Duration::from_millis(0);
|
||||
let elapsed = rt.block_on(async move {
|
||||
let mut total_wallclock_time = Duration::ZERO;
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
total_wallclock_time += res.unwrap();
|
||||
}
|
||||
total_wallclock_time
|
||||
})
|
||||
});
|
||||
|
||||
// consistency check to ensure process kind setting worked
|
||||
if nredos_per_client > 0 {
|
||||
assert_eq!(
|
||||
manager
|
||||
.status()
|
||||
.process
|
||||
.map(|p| p.kind)
|
||||
.expect("the benchmark work causes a walredo process to be spawned"),
|
||||
std::borrow::Cow::Borrowed(process_kind.into())
|
||||
);
|
||||
}
|
||||
|
||||
elapsed
|
||||
}
|
||||
|
||||
async fn client(
|
||||
|
||||
@@ -128,12 +128,12 @@ impl Client {
|
||||
|
||||
pub async fn timeline_info(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
force_await_logical_size: ForceAwaitLogicalSize,
|
||||
) -> Result<pageserver_api::models::TimelineInfo> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
@@ -151,11 +151,11 @@ impl Client {
|
||||
|
||||
pub async fn keyspace(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<pageserver_api::models::partitioning::Partitioning> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
self.get(&uri)
|
||||
|
||||
@@ -11,7 +11,6 @@ default = []
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
|
||||
@@ -43,7 +43,8 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
fanout: u64,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(fanout >= 2);
|
||||
assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
|
||||
let exp_base = fanout.max(2);
|
||||
// Start at L0
|
||||
let mut current_level_no = 0;
|
||||
let mut current_level_target_height = target_file_size;
|
||||
@@ -106,7 +107,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
break;
|
||||
}
|
||||
current_level_no += 1;
|
||||
current_level_target_height = current_level_target_height.saturating_mul(fanout);
|
||||
current_level_target_height = current_level_target_height.saturating_mul(exp_base);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -180,7 +180,7 @@ where
|
||||
match top.deref_mut() {
|
||||
LazyLoadLayer::Unloaded(ref mut l) => {
|
||||
let fut = l.load_keys(this.ctx);
|
||||
this.load_future.set(Some(fut));
|
||||
this.load_future.set(Some(Box::pin(fut)));
|
||||
continue;
|
||||
}
|
||||
LazyLoadLayer::Loaded(ref mut entries) => {
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//!
|
||||
//! All the heavy lifting is done by the create_image and create_delta
|
||||
//! functions that the implementor provides.
|
||||
use async_trait::async_trait;
|
||||
use futures::Future;
|
||||
use pageserver_api::{key::Key, keyspace::key_range_size};
|
||||
use std::ops::Range;
|
||||
@@ -141,18 +140,16 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
|
||||
|
||||
fn is_delta(&self) -> bool;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
|
||||
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
|
||||
where
|
||||
Self: 'a;
|
||||
|
||||
/// Return all keys in this delta layer.
|
||||
async fn load_keys<'a>(
|
||||
fn load_keys<'a>(
|
||||
&self,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
|
||||
) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send;
|
||||
}
|
||||
|
||||
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
|
||||
|
||||
@@ -2,7 +2,6 @@ mod draw;
|
||||
|
||||
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use futures::StreamExt;
|
||||
use rand::Rng;
|
||||
use tracing::info;
|
||||
@@ -139,7 +138,6 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
|
||||
type DeltaEntry<'a> = MockRecord;
|
||||
|
||||
|
||||
@@ -12,9 +12,14 @@ bytes.workspace = true
|
||||
camino.workspace = true
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
remote_storage = { path = "../../libs/remote_storage" }
|
||||
postgres_ffi.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
utils.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -9,6 +9,11 @@ mod index_part;
|
||||
mod layer_map_analyzer;
|
||||
mod layers;
|
||||
|
||||
use std::{
|
||||
str::FromStr,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::{Parser, Subcommand};
|
||||
use index_part::IndexPartCmd;
|
||||
@@ -20,8 +25,16 @@ use pageserver::{
|
||||
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
||||
virtual_file,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_ffi::ControlFileData;
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::TimelineId,
|
||||
logging::{self, LogFormat, TracingErrorLayerEnablement},
|
||||
lsn::Lsn,
|
||||
project_git_version,
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
@@ -43,6 +56,7 @@ enum Commands {
|
||||
#[command(subcommand)]
|
||||
IndexPart(IndexPartCmd),
|
||||
PrintLayerFile(PrintLayerFileCmd),
|
||||
TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
|
||||
DrawTimeline {},
|
||||
AnalyzeLayerMap(AnalyzeLayerMapCmd),
|
||||
#[command(subcommand)]
|
||||
@@ -68,6 +82,26 @@ struct PrintLayerFileCmd {
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
/// Roll back the time for the specified prefix using S3 history.
|
||||
///
|
||||
/// The command is fairly low level and powerful. Validation is only very light,
|
||||
/// so it is more powerful, and thus potentially more dangerous.
|
||||
#[derive(Parser)]
|
||||
struct TimeTravelRemotePrefixCmd {
|
||||
/// A configuration string for the remote_storage configuration.
|
||||
///
|
||||
/// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
|
||||
config_toml_str: String,
|
||||
/// remote prefix to time travel recover. For safety reasons, we require it to contain
|
||||
/// a timeline or tenant ID in the prefix.
|
||||
prefix: String,
|
||||
/// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
|
||||
travel_to: String,
|
||||
/// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
|
||||
/// You can use a few seconds before invoking the command. Same format as `travel_to`.
|
||||
done_if_after: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
struct AnalyzeLayerMapCmd {
|
||||
/// Pageserver data path
|
||||
@@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd {
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
match cli.command {
|
||||
@@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> {
|
||||
print_layerfile(&cmd.path).await?;
|
||||
}
|
||||
}
|
||||
Commands::TimeTravelRemotePrefix(cmd) => {
|
||||
let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
|
||||
.map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
|
||||
|
||||
let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
|
||||
humantime::parse_rfc3339(done_if_after).map_err(|_e| {
|
||||
anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
|
||||
})?
|
||||
} else {
|
||||
const SAFETY_MARGIN: Duration = Duration::from_secs(3);
|
||||
tokio::time::sleep(SAFETY_MARGIN).await;
|
||||
// Convert to string representation and back to get rid of sub-second values
|
||||
let done_if_after = SystemTime::now();
|
||||
tokio::time::sleep(SAFETY_MARGIN).await;
|
||||
done_if_after
|
||||
};
|
||||
|
||||
let timestamp = strip_subsecond(timestamp);
|
||||
let done_if_after = strip_subsecond(done_if_after);
|
||||
|
||||
let Some(prefix) = validate_prefix(&cmd.prefix) else {
|
||||
println!("specified prefix '{}' failed validation", cmd.prefix);
|
||||
return Ok(());
|
||||
};
|
||||
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
|
||||
let toml_item = toml_document
|
||||
.get("remote_storage")
|
||||
.expect("need remote_storage");
|
||||
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
|
||||
let cancel = CancellationToken::new();
|
||||
storage
|
||||
.unwrap()
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
|
||||
.await?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
@@ -185,3 +263,89 @@ fn handle_metadata(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Ensures that the given S3 prefix is sufficiently constrained.
|
||||
/// The command is very risky already and we don't want to expose something
|
||||
/// that allows usually unintentional and quite catastrophic time travel of
|
||||
/// an entire bucket, which would be a major catastrophy and away
|
||||
/// by only one character change (similar to "rm -r /home /username/foobar").
|
||||
fn validate_prefix(prefix: &str) -> Option<RemotePath> {
|
||||
if prefix.is_empty() {
|
||||
// Empty prefix means we want to specify the *whole* bucket
|
||||
return None;
|
||||
}
|
||||
let components = prefix.split('/').collect::<Vec<_>>();
|
||||
let (last, components) = {
|
||||
let last = components.last()?;
|
||||
if last.is_empty() {
|
||||
(
|
||||
components.iter().nth_back(1)?,
|
||||
&components[..(components.len() - 1)],
|
||||
)
|
||||
} else {
|
||||
(last, &components[..])
|
||||
}
|
||||
};
|
||||
'valid: {
|
||||
if let Ok(_timeline_id) = TimelineId::from_str(last) {
|
||||
// Ends in either a tenant or timeline ID
|
||||
break 'valid;
|
||||
}
|
||||
if *last == "timelines" {
|
||||
if let Some(before_last) = components.iter().nth_back(1) {
|
||||
if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
|
||||
// Has a valid tenant id
|
||||
break 'valid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return None;
|
||||
}
|
||||
RemotePath::from_string(prefix).ok()
|
||||
}
|
||||
|
||||
fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
|
||||
let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
|
||||
humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_validate_prefix() {
|
||||
assert_eq!(validate_prefix(""), None);
|
||||
assert_eq!(validate_prefix("/"), None);
|
||||
#[track_caller]
|
||||
fn assert_valid(prefix: &str) {
|
||||
let remote_path = RemotePath::from_string(prefix).unwrap();
|
||||
assert_eq!(validate_prefix(prefix), Some(remote_path));
|
||||
}
|
||||
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
|
||||
// Path is not relative but absolute
|
||||
assert_eq!(
|
||||
validate_prefix(
|
||||
"/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
|
||||
),
|
||||
None
|
||||
);
|
||||
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
|
||||
// Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
|
||||
assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
|
||||
assert_eq!(validate_prefix("wal"), None);
|
||||
assert_eq!(validate_prefix("/wal/"), None);
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
|
||||
// Partial tenant ID
|
||||
assert_eq!(
|
||||
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
|
||||
None
|
||||
);
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
|
||||
assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use anyhow::Context;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
|
||||
@@ -95,7 +96,7 @@ async fn main_impl(
|
||||
let timeline = *timeline;
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(
|
||||
timeline.tenant_id,
|
||||
TenantShardId::unsharded(timeline.tenant_id),
|
||||
timeline.timeline_id,
|
||||
ForceAwaitLogicalSize::No,
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -173,7 +174,10 @@ async fn main_impl(
|
||||
let timeline = *timeline;
|
||||
async move {
|
||||
let partitioning = mgmt_api_client
|
||||
.keyspace(timeline.tenant_id, timeline.timeline_id)
|
||||
.keyspace(
|
||||
TenantShardId::unsharded(timeline.tenant_id),
|
||||
timeline.timeline_id,
|
||||
)
|
||||
.await?;
|
||||
let lsn = partitioning.at_lsn;
|
||||
let start = Instant::now();
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use humantime::Duration;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
@@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
js.spawn(async move {
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
|
||||
.timeline_info(
|
||||
TenantShardId::unsharded(tl.tenant_id),
|
||||
tl.timeline_id,
|
||||
ForceAwaitLogicalSize::Yes,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
while !info.current_logical_size_is_accurate {
|
||||
ticker.tick().await;
|
||||
info = mgmt_api_client
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
|
||||
.timeline_info(
|
||||
TenantShardId::unsharded(tl.tenant_id),
|
||||
tl.timeline_id,
|
||||
ForceAwaitLogicalSize::Yes,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
|
||||
@@ -284,6 +285,7 @@ fn start_pageserver(
|
||||
))
|
||||
.unwrap();
|
||||
pageserver::preinitialize_metrics();
|
||||
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
@@ -671,42 +673,37 @@ fn start_pageserver(
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
{
|
||||
use signal_hook::consts::*;
|
||||
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
|
||||
let mut signals =
|
||||
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
|
||||
return signals
|
||||
.forever()
|
||||
.next()
|
||||
.expect("forever() never returns None unless explicitly closed");
|
||||
});
|
||||
let signal = BACKGROUND_RUNTIME
|
||||
.block_on(signal_handler)
|
||||
.expect("join error");
|
||||
match signal {
|
||||
SIGQUIT => {
|
||||
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
|
||||
std::process::exit(111);
|
||||
}
|
||||
SIGINT | SIGTERM => {
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
|
||||
|
||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||
&tenant_manager,
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
));
|
||||
unreachable!()
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
{
|
||||
BACKGROUND_RUNTIME.block_on(async move {
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
|
||||
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
|
||||
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
|
||||
let signal = tokio::select! {
|
||||
_ = sigquit.recv() => {
|
||||
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
|
||||
std::process::exit(111);
|
||||
}
|
||||
_ = sigint.recv() => { "SIGINT" },
|
||||
_ = sigterm.recv() => { "SIGTERM" },
|
||||
};
|
||||
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
|
||||
|
||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
pageserver::shutdown_pageserver(
|
||||
&tenant_manager,
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
unreachable!()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -97,6 +97,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -140,6 +142,8 @@ pub mod defaults {
|
||||
|
||||
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||
|
||||
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -290,6 +294,8 @@ pub struct PageServerConf {
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub walredo_process_kind: crate::walredo::ProcessKind,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -413,6 +419,8 @@ struct PageServerConfigBuilder {
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -500,6 +508,8 @@ impl PageServerConfigBuilder {
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
|
||||
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -683,6 +693,10 @@ impl PageServerConfigBuilder {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
|
||||
self.walredo_process_kind = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -739,6 +753,7 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
walredo_process_kind,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -1032,6 +1047,9 @@ impl PageServerConf {
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
"walredo_process_kind" => {
|
||||
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1114,6 +1132,7 @@ impl PageServerConf {
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// We only send consumption metrics from shard 0, so don't waste time calculating
|
||||
// synthetic size on other shards.
|
||||
continue;
|
||||
|
||||
@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
|
||||
};
|
||||
|
||||
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
|
||||
if state != TenantState::Active || !id.is_zero() {
|
||||
if state != TenantState::Active || !id.is_shard_zero() {
|
||||
None
|
||||
} else {
|
||||
tenant_manager
|
||||
|
||||
@@ -12,7 +12,7 @@ use pageserver_api::{
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{backoff, generation::Generation, id::NodeId};
|
||||
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
|
||||
|
||||
use crate::{
|
||||
config::{NodeMetadata, PageServerConf},
|
||||
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
.collect(),
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-validate");
|
||||
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(RetryForeverError::ShuttingDown);
|
||||
}
|
||||
|
||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
|
||||
|
||||
@@ -58,24 +58,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: The reload completed successfully.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error (also hits if no keys were found)
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}:
|
||||
parameters:
|
||||
@@ -93,62 +75,14 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
delete:
|
||||
description: |
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
|
||||
404 means that deletion successfully finished"
|
||||
responses:
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant not found
|
||||
description: Tenant not found. This is the success path.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -165,18 +99,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/time_travel_remote_storage:
|
||||
parameters:
|
||||
@@ -206,36 +128,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
@@ -255,36 +147,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
@@ -309,60 +171,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
delete:
|
||||
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
|
||||
responses:
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found
|
||||
description: Timeline not found. This is the success path.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -379,18 +193,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
|
||||
parameters:
|
||||
@@ -423,36 +225,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: date-time
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found, or there is no timestamp information for the given lsn
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
|
||||
parameters:
|
||||
@@ -484,36 +256,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
@@ -537,36 +279,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
@@ -628,24 +340,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLocationConfigResponse"
|
||||
"503":
|
||||
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"409":
|
||||
description: |
|
||||
The tenant is already known to Pageserver in some way,
|
||||
@@ -662,12 +356,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -684,36 +372,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant ignored
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
@@ -740,36 +398,6 @@ paths:
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
@@ -790,37 +418,6 @@ paths:
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"404":
|
||||
description: No tenant or timeline found for the specified ids
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
@@ -839,31 +436,8 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SyntheticSizeResponse"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
# This route has no handler. TODO: remove?
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -945,18 +519,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/secondary/download:
|
||||
parameters:
|
||||
@@ -987,20 +549,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -1043,24 +591,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Malformed timeline create request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"406":
|
||||
description: Permanently unsatisfiable request, don't retry.
|
||||
content:
|
||||
@@ -1079,18 +609,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/:
|
||||
get:
|
||||
@@ -1104,30 +622,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
post:
|
||||
description: |
|
||||
@@ -1148,43 +642,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Malformed tenant create request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"409":
|
||||
description: Tenant already exists, creation skipped
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
@@ -1206,36 +669,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Malformed tenant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/config/:
|
||||
parameters:
|
||||
@@ -1255,42 +688,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigResponse"
|
||||
"400":
|
||||
description: Malformed get tenanant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenand or timeline were not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/utilization:
|
||||
get:
|
||||
@@ -1304,12 +701,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PageserverUtilization"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
@@ -1629,7 +1020,7 @@ components:
|
||||
type: integer
|
||||
format: int64
|
||||
minimum: 0
|
||||
description: The amount of disk space currently utilized by layer files.
|
||||
description: The amount of disk space currently used.
|
||||
free_space_bytes:
|
||||
type: integer
|
||||
format: int64
|
||||
|
||||
@@ -457,8 +457,12 @@ async fn reload_auth_validation_keys_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
Err(e) => {
|
||||
let err_msg = "Error reloading public keys";
|
||||
warn!("Error reloading public keys from {key_path:?}: {e:}");
|
||||
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
|
||||
json_response(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
HttpErrorBody::from_msg(err_msg.to_string()),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -696,7 +700,7 @@ async fn get_lsn_by_timestamp_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
@@ -747,7 +751,7 @@ async fn get_timestamp_of_lsn_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
@@ -772,7 +776,9 @@ async fn get_timestamp_of_lsn_handler(
|
||||
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
|
||||
json_response(StatusCode::OK, time)
|
||||
}
|
||||
None => json_response(StatusCode::NOT_FOUND, ()),
|
||||
None => Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -993,11 +999,26 @@ async fn tenant_status(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
// In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
|
||||
let activate = true;
|
||||
#[cfg(feature = "testing")]
|
||||
let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
|
||||
|
||||
let tenant_info = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
if activate {
|
||||
// This is advisory: we prefer to let the tenant activate on-demand when this function is
|
||||
// called, but it is still valid to return 200 and describe the current state of the tenant
|
||||
// if it doesn't make it into an active state.
|
||||
tenant
|
||||
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||
.await
|
||||
.ok();
|
||||
}
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
@@ -1071,7 +1092,7 @@ async fn tenant_size_handler(
|
||||
let headers = request.headers();
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
)));
|
||||
|
||||
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||
use tokio_tar::Archive;
|
||||
use tracing::*;
|
||||
@@ -170,7 +171,10 @@ async fn import_rel(
|
||||
let r = reader.read_exact(&mut buf).await;
|
||||
match r {
|
||||
Ok(_) => {
|
||||
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
||||
let key = rel_block_to_key(rel, blknum);
|
||||
if modification.tline.get_shard_identity().is_key_local(&key) {
|
||||
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
|
||||
@@ -1483,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) bytes_received: IntCounter,
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||
bytes_received: register_int_counter!(
|
||||
"pageserver_wal_ingest_bytes_received",
|
||||
"Bytes of WAL ingested from safekeepers",
|
||||
)
|
||||
.unwrap(),
|
||||
records_received: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_received",
|
||||
"Number of WAL records received from safekeepers"
|
||||
@@ -1813,6 +1819,29 @@ impl Default for WalRedoProcessCounters {
|
||||
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
Lazy::new(WalRedoProcessCounters::default);
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub mod wal_redo {
|
||||
use super::*;
|
||||
|
||||
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
|
||||
std::sync::Mutex::new(
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_wal_redo_process_kind",
|
||||
"The configured process kind for walredo",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
});
|
||||
|
||||
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
|
||||
// use guard to avoid races around the next two steps
|
||||
let guard = PROCESS_KIND.lock().unwrap();
|
||||
guard.reset();
|
||||
guard.with_label_values(&[&format!("{kind}")]).set(1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
@@ -2083,7 +2112,7 @@ impl TimelineMetrics {
|
||||
|
||||
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
|
||||
// Only shard zero deals in synthetic sizes
|
||||
if tenant_shard_id.is_zero() {
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
let tid = tenant_shard_id.tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
}
|
||||
@@ -2094,6 +2123,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
|
||||
use futures::Future;
|
||||
use pin_project_lite::pin_project;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
@@ -2663,6 +2693,26 @@ pub(crate) mod disk_usage_based_eviction {
|
||||
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
|
||||
}
|
||||
|
||||
static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tokio_executor_thread_configured_count",
|
||||
"Total number of configued tokio executor threads in the process.
|
||||
The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
|
||||
&["setup"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
|
||||
static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
|
||||
let _guard = SERIALIZE.lock().unwrap();
|
||||
TOKIO_EXECUTOR_THREAD_COUNT.reset();
|
||||
TOKIO_EXECUTOR_THREAD_COUNT
|
||||
.get_metric_with_label_values(&[setup])
|
||||
.unwrap()
|
||||
.set(u64::try_from(num_threads.get()).unwrap());
|
||||
}
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
|
||||
@@ -876,7 +876,13 @@ impl PageServerHandler {
|
||||
if lsn <= last_record_lsn {
|
||||
lsn = last_record_lsn;
|
||||
} else {
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
timeline
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
// Since we waited for 'lsn' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the
|
||||
// last-record LSN can advance immediately after we return
|
||||
@@ -888,7 +894,13 @@ impl PageServerHandler {
|
||||
"invalid LSN(0) in request".into(),
|
||||
));
|
||||
}
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
timeline
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
if lsn < **latest_gc_cutoff_lsn {
|
||||
@@ -1215,7 +1227,13 @@ impl PageServerHandler {
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", lsn);
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
timeline
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
|
||||
@@ -33,13 +33,14 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::panic::AssertUnwindSafe;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use futures::FutureExt;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::task_local;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use utils::env;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::metrics::set_tokio_runtime_setup;
|
||||
|
||||
//
|
||||
// There are four runtimes:
|
||||
//
|
||||
@@ -98,52 +102,119 @@ use utils::id::TimelineId;
|
||||
// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
|
||||
// happen, but still.
|
||||
//
|
||||
pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("compute request worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create compute request runtime")
|
||||
});
|
||||
|
||||
pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("mgmt request worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create mgmt request runtime")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("walreceiver worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create walreceiver runtime")
|
||||
});
|
||||
|
||||
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("background op worker")
|
||||
// if you change the number of worker threads please change the constant below
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create background op runtime")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
|
||||
// force init and thus panics
|
||||
let _ = BACKGROUND_RUNTIME.handle();
|
||||
pub(crate) static TOKIO_WORKER_THREADS: Lazy<NonZeroUsize> = Lazy::new(|| {
|
||||
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
|
||||
// tokio would had already panicked for parsing errors or NotUnicode
|
||||
//
|
||||
// this will be wrong if any of the runtimes gets their worker threads configured to something
|
||||
// else, but that has not been needed in a long time.
|
||||
std::env::var("TOKIO_WORKER_THREADS")
|
||||
.map(|s| s.parse::<usize>().unwrap())
|
||||
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
|
||||
NonZeroUsize::new(
|
||||
std::env::var("TOKIO_WORKER_THREADS")
|
||||
.map(|s| s.parse::<usize>().unwrap())
|
||||
.unwrap_or_else(|_e| usize::max(2, num_cpus::get())),
|
||||
)
|
||||
.expect("the max() ensures that this is not zero")
|
||||
});
|
||||
|
||||
enum TokioRuntimeMode {
|
||||
SingleThreaded,
|
||||
MultiThreaded { num_workers: NonZeroUsize },
|
||||
}
|
||||
|
||||
impl FromStr for TokioRuntimeMode {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"current_thread" => Ok(TokioRuntimeMode::SingleThreaded),
|
||||
s => match s.strip_prefix("multi_thread:") {
|
||||
Some("default") => Ok(TokioRuntimeMode::MultiThreaded {
|
||||
num_workers: *TOKIO_WORKER_THREADS,
|
||||
}),
|
||||
Some(suffix) => {
|
||||
let num_workers = suffix.parse::<NonZeroUsize>().map_err(|e| {
|
||||
format!(
|
||||
"invalid number of multi-threaded runtime workers ({suffix:?}): {e}",
|
||||
)
|
||||
})?;
|
||||
Ok(TokioRuntimeMode::MultiThreaded { num_workers })
|
||||
}
|
||||
None => Err(format!("invalid runtime config: {s:?}")),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
||||
let thread_name = "pageserver-tokio";
|
||||
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
|
||||
// If the env var is not set, leave this static as None.
|
||||
set_tokio_runtime_setup(
|
||||
"multiple-runtimes",
|
||||
NUM_MULTIPLE_RUNTIMES
|
||||
.checked_mul(*TOKIO_WORKER_THREADS)
|
||||
.unwrap(),
|
||||
);
|
||||
return None;
|
||||
};
|
||||
Some(match mode {
|
||||
TokioRuntimeMode::SingleThreaded => {
|
||||
set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap());
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.thread_name(thread_name)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create one single runtime")
|
||||
}
|
||||
TokioRuntimeMode::MultiThreaded { num_workers } => {
|
||||
set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers);
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name(thread_name)
|
||||
.enable_all()
|
||||
.worker_threads(num_workers.get())
|
||||
.build()
|
||||
.expect("failed to create one multi-threaded runtime")
|
||||
}
|
||||
})
|
||||
});
|
||||
|
||||
/// Declare a lazy static variable named `$varname` that will resolve
|
||||
/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME`
|
||||
/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation
|
||||
/// declares a separate runtime and the lazy static variable `$varname`
|
||||
/// will resolve to that separate runtime.
|
||||
///
|
||||
/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if
|
||||
/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime
|
||||
/// otherwise.
|
||||
macro_rules! pageserver_runtime {
|
||||
($varname:ident, $name:literal) => {
|
||||
pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| {
|
||||
if let Some(runtime) = &*ONE_RUNTIME {
|
||||
return runtime;
|
||||
}
|
||||
static RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name($name)
|
||||
.worker_threads(TOKIO_WORKER_THREADS.get())
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect(std::concat!("Failed to create runtime ", $name))
|
||||
});
|
||||
&*RUNTIME
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker");
|
||||
pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
|
||||
pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
|
||||
pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
|
||||
// Bump this number when adding a new pageserver_runtime!
|
||||
// SAFETY: it's obviously correct
|
||||
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
@@ -214,13 +285,12 @@ pub enum TaskKind {
|
||||
/// Internally, `Client` hands over requests to the `Connection` object.
|
||||
/// The `Connection` object is responsible for speaking the wire protocol.
|
||||
///
|
||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||
/// That abstraction doesn't use `task_mgr`.
|
||||
/// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
|
||||
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
||||
///
|
||||
/// Once the connection is established, the `TaskHandle` task creates a
|
||||
/// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
|
||||
/// Once the connection is established, the `TaskHandle` task spawns a
|
||||
/// [`WalReceiverConnectionPoller`] task that is responsible for polling
|
||||
/// the `Connection` object.
|
||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
||||
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
|
||||
@@ -230,7 +300,6 @@ pub enum TaskKind {
|
||||
WalReceiverManager,
|
||||
|
||||
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
|
||||
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
///
|
||||
/// [`WalReceiverManager`]: Self::WalReceiverManager
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
//!
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use arc_swap::ArcSwap;
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use enumset::EnumSet;
|
||||
@@ -98,7 +99,7 @@ use std::ops::Bound::Included;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Mutex, RwLock};
|
||||
use std::sync::Mutex;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::span;
|
||||
@@ -260,7 +261,7 @@ pub struct Tenant {
|
||||
// We keep TenantConfOpt sturct here to preserve the information
|
||||
// about parameters that are not set.
|
||||
// This is necessary to allow global config updates.
|
||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
||||
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||
|
||||
tenant_shard_id: TenantShardId,
|
||||
|
||||
@@ -385,7 +386,7 @@ impl WalRedoManager {
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
match self {
|
||||
WalRedoManager::Prod(m) => m.status(),
|
||||
WalRedoManager::Prod(m) => Some(m.status()),
|
||||
#[cfg(test)]
|
||||
WalRedoManager::Test(_) => None,
|
||||
}
|
||||
@@ -1515,7 +1516,7 @@ impl Tenant {
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
ancestor_timeline
|
||||
.wait_lsn(*lsn, ctx)
|
||||
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
|
||||
@@ -1606,7 +1607,7 @@ impl Tenant {
|
||||
);
|
||||
|
||||
{
|
||||
let conf = self.tenant_conf.read().unwrap();
|
||||
let conf = self.tenant_conf.load();
|
||||
|
||||
if !conf.location.may_delete_layers_hint() {
|
||||
info!("Skipping GC in location state {:?}", conf.location);
|
||||
@@ -1633,7 +1634,7 @@ impl Tenant {
|
||||
}
|
||||
|
||||
{
|
||||
let conf = self.tenant_conf.read().unwrap();
|
||||
let conf = self.tenant_conf.load();
|
||||
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
|
||||
info!("Skipping compaction in location state {:?}", conf.location);
|
||||
return Ok(());
|
||||
@@ -1782,7 +1783,7 @@ impl Tenant {
|
||||
async fn shutdown(
|
||||
&self,
|
||||
shutdown_progress: completion::Barrier,
|
||||
freeze_and_flush: bool,
|
||||
shutdown_mode: timeline::ShutdownMode,
|
||||
) -> Result<(), completion::Barrier> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
@@ -1829,16 +1830,8 @@ impl Tenant {
|
||||
timelines.values().for_each(|timeline| {
|
||||
let timeline = Arc::clone(timeline);
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
let span =
|
||||
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
|
||||
js.spawn(async move {
|
||||
if freeze_and_flush {
|
||||
timeline.flush_and_shutdown().instrument(span).await
|
||||
} else {
|
||||
timeline.shutdown().instrument(span).await
|
||||
}
|
||||
});
|
||||
let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
|
||||
js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
|
||||
})
|
||||
};
|
||||
// test_long_timeline_create_then_tenant_delete is leaning on this message
|
||||
@@ -2082,14 +2075,14 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
|
||||
self.tenant_conf.read().unwrap().location.attach_mode
|
||||
self.tenant_conf.load().location.attach_mode
|
||||
}
|
||||
|
||||
/// For API access: generate a LocationConfig equivalent to the one that would be used to
|
||||
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
|
||||
/// rare external API calls, like a reconciliation at startup.
|
||||
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
|
||||
let conf = self.tenant_conf.read().unwrap();
|
||||
let conf = self.tenant_conf.load();
|
||||
|
||||
let location_config_mode = match conf.location.attach_mode {
|
||||
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
|
||||
@@ -2236,7 +2229,7 @@ where
|
||||
|
||||
impl Tenant {
|
||||
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
|
||||
self.tenant_conf.read().unwrap().tenant_conf.clone()
|
||||
self.tenant_conf.load().tenant_conf.clone()
|
||||
}
|
||||
|
||||
pub fn effective_config(&self) -> TenantConf {
|
||||
@@ -2245,84 +2238,84 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub fn get_checkpoint_distance(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.checkpoint_distance
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
|
||||
}
|
||||
|
||||
pub fn get_checkpoint_timeout(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.checkpoint_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
||||
}
|
||||
|
||||
pub fn get_compaction_target_size(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_target_size
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
||||
}
|
||||
|
||||
pub fn get_compaction_period(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_period
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_period)
|
||||
}
|
||||
|
||||
pub fn get_compaction_threshold(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_threshold
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||
}
|
||||
|
||||
pub fn get_gc_horizon(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.gc_horizon
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_horizon)
|
||||
}
|
||||
|
||||
pub fn get_gc_period(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.gc_period
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_period)
|
||||
}
|
||||
|
||||
pub fn get_image_creation_threshold(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.image_creation_threshold
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
pub fn get_pitr_interval(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.pitr_interval
|
||||
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
||||
}
|
||||
|
||||
pub fn get_trace_read_requests(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.trace_read_requests
|
||||
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
|
||||
}
|
||||
|
||||
pub fn get_min_resident_size_override(&self) -> Option<u64> {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.min_resident_size_override
|
||||
.or(self.conf.default_tenant_conf.min_resident_size_override)
|
||||
}
|
||||
|
||||
pub fn get_heatmap_period(&self) -> Option<Duration> {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let heatmap_period = tenant_conf
|
||||
.heatmap_period
|
||||
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
|
||||
@@ -2334,26 +2327,40 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
|
||||
self.tenant_conf_updated();
|
||||
// Use read-copy-update in order to avoid overwriting the location config
|
||||
// state if this races with [`Tenant::set_new_location_config`]. Note that
|
||||
// this race is not possible if both request types come from the storage
|
||||
// controller (as they should!) because an exclusive op lock is required
|
||||
// on the storage controller side.
|
||||
self.tenant_conf.rcu(|inner| {
|
||||
Arc::new(AttachedTenantConf {
|
||||
tenant_conf: new_tenant_conf.clone(),
|
||||
location: inner.location,
|
||||
})
|
||||
});
|
||||
|
||||
self.tenant_conf_updated(&new_tenant_conf);
|
||||
// Don't hold self.timelines.lock() during the notifies.
|
||||
// There's no risk of deadlock right now, but there could be if we consolidate
|
||||
// mutexes in struct Timeline in the future.
|
||||
let timelines = self.list_timelines();
|
||||
for timeline in timelines {
|
||||
timeline.tenant_conf_updated();
|
||||
timeline.tenant_conf_updated(&new_tenant_conf);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
|
||||
*self.tenant_conf.write().unwrap() = new_conf;
|
||||
self.tenant_conf_updated();
|
||||
let new_tenant_conf = new_conf.tenant_conf.clone();
|
||||
|
||||
self.tenant_conf.store(Arc::new(new_conf));
|
||||
|
||||
self.tenant_conf_updated(&new_tenant_conf);
|
||||
// Don't hold self.timelines.lock() during the notifies.
|
||||
// There's no risk of deadlock right now, but there could be if we consolidate
|
||||
// mutexes in struct Timeline in the future.
|
||||
let timelines = self.list_timelines();
|
||||
for timeline in timelines {
|
||||
timeline.tenant_conf_updated();
|
||||
timeline.tenant_conf_updated(&new_tenant_conf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2367,11 +2374,8 @@ impl Tenant {
|
||||
.unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_conf_updated(&self) {
|
||||
let conf = {
|
||||
let guard = self.tenant_conf.read().unwrap();
|
||||
Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
|
||||
};
|
||||
pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
|
||||
let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
|
||||
self.timeline_get_throttle.reconfigure(conf)
|
||||
}
|
||||
|
||||
@@ -2519,7 +2523,7 @@ impl Tenant {
|
||||
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
|
||||
&crate::metrics::tenant_throttling::TIMELINE_GET,
|
||||
)),
|
||||
tenant_conf: Arc::new(RwLock::new(attached_conf)),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3186,7 +3190,7 @@ impl Tenant {
|
||||
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
|
||||
|
||||
// Upload the created data dir to S3
|
||||
if self.tenant_shard_id().is_zero() {
|
||||
if self.tenant_shard_id().is_shard_zero() {
|
||||
self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
|
||||
.await?;
|
||||
}
|
||||
@@ -3433,7 +3437,7 @@ impl Tenant {
|
||||
.store(size, Ordering::Relaxed);
|
||||
|
||||
// Only shard zero should be calculating synthetic sizes
|
||||
debug_assert!(self.shard_identity.is_zero());
|
||||
debug_assert!(self.shard_identity.is_shard_zero());
|
||||
|
||||
TENANT_SYNTHETIC_SIZE_METRIC
|
||||
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
|
||||
@@ -3505,7 +3509,7 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
||||
self.tenant_conf.read().unwrap().tenant_conf.clone()
|
||||
self.tenant_conf.load().tenant_conf.clone()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3653,6 +3657,9 @@ pub(crate) mod harness {
|
||||
heatmap_period: Some(tenant_conf.heatmap_period),
|
||||
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
|
||||
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
|
||||
image_layer_creation_check_threshold: Some(
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3851,6 +3858,7 @@ mod tests {
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use rand::{thread_rng, Rng};
|
||||
use tests::timeline::ShutdownMode;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
@@ -4296,7 +4304,7 @@ mod tests {
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
|
||||
// so that all uploads finish & we can call harness.load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
|
||||
.instrument(harness.span())
|
||||
.await
|
||||
.ok()
|
||||
@@ -4337,7 +4345,7 @@ mod tests {
|
||||
|
||||
// so that all uploads finish & we can call harness.load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
|
||||
.instrument(harness.span())
|
||||
.await
|
||||
.ok()
|
||||
@@ -5118,7 +5126,7 @@ mod tests {
|
||||
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
|
||||
let raw_tline = tline.raw_timeline().unwrap();
|
||||
raw_tline
|
||||
.shutdown()
|
||||
.shutdown(super::timeline::ShutdownMode::Hard)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
|
||||
.await;
|
||||
std::mem::forget(tline);
|
||||
|
||||
@@ -57,6 +57,9 @@ pub mod defaults {
|
||||
// throughputs up to 1GiB/s per timeline.
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
// By default ingest enough WAL for two new L0 layers before checking if new image
|
||||
// image layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
@@ -362,6 +365,10 @@ pub struct TenantConf {
|
||||
pub lazy_slru_download: bool,
|
||||
|
||||
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
||||
|
||||
// How much WAL must be ingested before checking again whether a new image layer is required.
|
||||
// Expresed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -508,6 +518,9 @@ impl TenantConfOpt {
|
||||
.timeline_get_throttle
|
||||
.clone()
|
||||
.unwrap_or(global_conf.timeline_get_throttle),
|
||||
image_layer_creation_check_threshold: self
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(global_conf.image_layer_creation_check_threshold),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -548,6 +561,7 @@ impl Default for TenantConf {
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,10 @@ use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
tenant::{
|
||||
mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
timeline::ShutdownMode,
|
||||
},
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -433,6 +436,11 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Check whether background deletion of this tenant is currently in progress
|
||||
pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
|
||||
tenant.delete_progress.try_lock().is_err()
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||
@@ -463,7 +471,7 @@ impl DeleteTenantFlow {
|
||||
// tenant.shutdown
|
||||
// Its also bad that we're holding tenants.read here.
|
||||
// TODO relax set_stopping to be idempotent?
|
||||
if tenant.shutdown(progress, false).await.is_err() {
|
||||
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"tenant shutdown is already in progress"
|
||||
)));
|
||||
|
||||
@@ -78,6 +78,10 @@ impl EphemeralFile {
|
||||
self.file.bytes_written()
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> page_cache::FileId {
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
|
||||
@@ -346,35 +346,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub enum InMemoryLayerHandle {
|
||||
Open {
|
||||
lsn_floor: Lsn,
|
||||
end_lsn: Lsn,
|
||||
},
|
||||
Frozen {
|
||||
idx: usize,
|
||||
lsn_floor: Lsn,
|
||||
end_lsn: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
impl InMemoryLayerHandle {
|
||||
pub fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
|
||||
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_end_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
|
||||
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
@@ -576,41 +547,18 @@ impl LayerMap {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
/// Get a handle for the first in memory layer that matches the provided predicate.
|
||||
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
|
||||
///
|
||||
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
|
||||
/// the same exclusive region established by holding the layer manager lock.
|
||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
|
||||
/// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
|
||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
|
||||
where
|
||||
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
|
||||
{
|
||||
if let Some(open) = &self.open_layer {
|
||||
if pred(open) {
|
||||
return Some(InMemoryLayerHandle::Open {
|
||||
lsn_floor: open.get_lsn_range().start,
|
||||
end_lsn: open.get_lsn_range().end,
|
||||
});
|
||||
return Some(open.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let pos = self.frozen_layers.iter().rev().position(pred);
|
||||
pos.map(|rev_idx| {
|
||||
let idx = self.frozen_layers.len() - 1 - rev_idx;
|
||||
InMemoryLayerHandle::Frozen {
|
||||
idx,
|
||||
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
|
||||
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the layer pointed to by the provided handle.
|
||||
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
|
||||
match handle {
|
||||
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
|
||||
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
|
||||
}
|
||||
self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
@@ -44,6 +44,7 @@ use crate::tenant::config::{
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::storage_layer::inmemory_layer;
|
||||
use crate::tenant::timeline::ShutdownMode;
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
@@ -783,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
|
||||
join_set.spawn(
|
||||
async move {
|
||||
let freeze_and_flush = true;
|
||||
|
||||
let res = {
|
||||
let (_guard, shutdown_progress) = completion::channel();
|
||||
t.shutdown(shutdown_progress, freeze_and_flush).await
|
||||
t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
|
||||
};
|
||||
|
||||
if let Err(other_progress) = res {
|
||||
@@ -1107,7 +1106,7 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
info!("Shutting down attached tenant");
|
||||
match tenant.shutdown(progress, false).await {
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
Ok(()) => {}
|
||||
Err(barrier) => {
|
||||
info!("Shutdown already in progress, waiting for it to complete");
|
||||
@@ -1223,7 +1222,7 @@ impl TenantManager {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
|
||||
match tenant.shutdown(progress, false).await {
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
Ok(()) => {
|
||||
info!("Finished shutting down just-spawned tenant");
|
||||
}
|
||||
@@ -1273,7 +1272,7 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, false).await {
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
Ok(()) => {
|
||||
slot_guard.drop_old_value()?;
|
||||
}
|
||||
@@ -1411,9 +1410,15 @@ impl TenantManager {
|
||||
|
||||
match tenant.current_state() {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// If a tenant is broken or stopping, DeleteTenantFlow can
|
||||
// handle it: broken tenants proceed to delete, stopping tenants
|
||||
// are checked for deletion already in progress.
|
||||
// If deletion is already in progress, return success (the semantics of this
|
||||
// function are to rerturn success afterr deletion is spawned in background).
|
||||
// Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
|
||||
if DeleteTenantFlow::is_in_progress(&tenant) {
|
||||
// The `delete_progress` lock is held: deletion is already happening
|
||||
// in the bacckground
|
||||
slot_guard.revert();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
tenant
|
||||
@@ -1649,7 +1654,14 @@ impl TenantManager {
|
||||
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
||||
if let Err(e) = timeline
|
||||
.wait_lsn(
|
||||
*target_lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::Tenant,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
// Failure here might mean shutdown, in any case this part is an optimization
|
||||
// and we shouldn't hold up the split operation.
|
||||
tracing::warn!(
|
||||
@@ -1670,7 +1682,7 @@ impl TenantManager {
|
||||
|
||||
// Phase 5: Shut down the parent shard, and erase it from disk
|
||||
let (_guard, progress) = completion::channel();
|
||||
match parent.shutdown(progress, false).await {
|
||||
match parent.shutdown(progress, ShutdownMode::Hard).await {
|
||||
Ok(()) => {}
|
||||
Err(other) => {
|
||||
other.wait().await;
|
||||
@@ -2657,11 +2669,11 @@ where
|
||||
let attached_tenant = match slot_guard.get_old_value() {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
|
||||
let freeze_and_flush = false;
|
||||
let shutdown_mode = ShutdownMode::Hard;
|
||||
|
||||
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
||||
// that we can continue safely to cleanup.
|
||||
match tenant.shutdown(progress, freeze_and_flush).await {
|
||||
match tenant.shutdown(progress, shutdown_mode).await {
|
||||
Ok(()) => {}
|
||||
Err(_other) => {
|
||||
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
||||
|
||||
@@ -200,6 +200,7 @@ use utils::backoff::{
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use std::ops::DerefMut;
|
||||
@@ -207,7 +208,7 @@ use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
|
||||
use crate::metrics::{
|
||||
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
||||
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
@@ -261,6 +262,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
/// Doing non-essential flushes of deletion queue is subject to this timeout, after
|
||||
/// which we warn and skip.
|
||||
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
Deleted(IndexPart),
|
||||
@@ -588,14 +593,14 @@ impl RemoteTimelineClient {
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
metadata: TimelineMetadata,
|
||||
) {
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
|
||||
info!(
|
||||
"scheduling metadata upload with {} files ({} changed)",
|
||||
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
|
||||
upload_queue.latest_files.len(),
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
);
|
||||
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
@@ -1050,6 +1055,26 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
|
||||
match tokio::time::timeout(
|
||||
DELETION_QUEUE_FLUSH_TIMEOUT,
|
||||
self.deletion_queue_client.flush_immediate(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(_timeout) => {
|
||||
// Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
|
||||
// to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion
|
||||
// queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
|
||||
tracing::warn!(
|
||||
"Timed out waiting for deletion queue flush, acking deletion anyway"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
|
||||
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
|
||||
/// deletes leaked files if any and proceeds with deletion of index file at the end.
|
||||
@@ -1099,7 +1124,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
||||
// taking the burden of listing all the layers that we already know we should delete.
|
||||
self.deletion_queue_client.flush_immediate().await?;
|
||||
self.flush_deletion_queue().await?;
|
||||
|
||||
let cancel = shutdown_token();
|
||||
|
||||
@@ -1173,7 +1198,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
|
||||
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
|
||||
self.deletion_queue_client.flush_immediate().await?;
|
||||
self.flush_deletion_queue().await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
@@ -1569,7 +1594,7 @@ impl RemoteTimelineClient {
|
||||
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
|
||||
///
|
||||
/// In-progress operations will still be running after this function returns.
|
||||
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
||||
/// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
|
||||
/// to wait for them to complete, after calling this function.
|
||||
pub(crate) fn stop(&self) {
|
||||
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
||||
|
||||
@@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant(
|
||||
let warn_after = 3;
|
||||
let max_attempts = 10;
|
||||
let mut prefixes = Vec::with_capacity(2);
|
||||
if tenant_shard_id.is_zero() {
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
// Also recover the unsharded prefix for a shard of zero:
|
||||
// - if the tenant is totally unsharded, the unsharded prefix contains all the data
|
||||
// - if the tenant is sharded, we still want to recover the initdb data, but we only
|
||||
|
||||
@@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
|
||||
id::TimelineId,
|
||||
id::TimelineId, serde_system_time,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let mut progress = SecondaryProgress {
|
||||
layers_total: heatmap_stats.layers,
|
||||
bytes_total: heatmap_stats.bytes,
|
||||
heatmap_mtime: Some(heatmap_mtime),
|
||||
heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)),
|
||||
layers_downloaded: 0,
|
||||
bytes_downloaded: 0,
|
||||
};
|
||||
@@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Existing on-disk layers: just update their access time.
|
||||
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
|
||||
tracing::debug!("Layer {} is already on disk", layer.name);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||
// are already present on disk are really there.
|
||||
let local_path = self
|
||||
.conf
|
||||
.timeline_path(tenant_shard_id, &timeline.timeline_id)
|
||||
.join(layer.name.file_name());
|
||||
match tokio::fs::metadata(&local_path).await {
|
||||
Ok(meta) => {
|
||||
tracing::debug!(
|
||||
"Layer {} present at {}, size {}",
|
||||
layer.name,
|
||||
local_path,
|
||||
meta.len(),
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Layer {} not found at {} ({})",
|
||||
layer.name,
|
||||
local_path,
|
||||
e
|
||||
);
|
||||
debug_assert!(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|
||||
|| on_disk.access_time != layer.access_time
|
||||
{
|
||||
|
||||
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::ops::Range;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use tracing::warn;
|
||||
use utils::history_buffer::HistoryBufferWithDropCounter;
|
||||
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
|
||||
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
use super::layer_map::InMemoryLayerHandle;
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
|
||||
use super::timeline::GetVectoredError;
|
||||
use super::PageReconstructError;
|
||||
|
||||
@@ -204,23 +204,30 @@ impl Default for ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Description of layer to be read - the layer map can turn
|
||||
/// this description into the actual layer.
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub(crate) enum ReadableLayerDesc {
|
||||
Persistent {
|
||||
desc: PersistentLayerDesc,
|
||||
lsn_range: Range<Lsn>,
|
||||
},
|
||||
InMemory {
|
||||
handle: InMemoryLayerHandle,
|
||||
lsn_ceil: Lsn,
|
||||
},
|
||||
/// A key that uniquely identifies a layer in a timeline
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub(crate) enum LayerId {
|
||||
PersitentLayerId(PersistentLayerKey),
|
||||
InMemoryLayerId(InMemoryLayerFileId),
|
||||
}
|
||||
|
||||
/// Wraper for 'ReadableLayerDesc' sorted by Lsn
|
||||
/// Layer wrapper for the read path. Note that it is valid
|
||||
/// to use these layers even after external operations have
|
||||
/// been performed on them (compaction, freeze, etc.).
|
||||
#[derive(Debug)]
|
||||
struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
||||
pub(crate) enum ReadableLayer {
|
||||
PersistentLayer(Layer),
|
||||
InMemoryLayer(Arc<InMemoryLayer>),
|
||||
}
|
||||
|
||||
/// A partial description of a read to be done.
|
||||
#[derive(Debug, Clone)]
|
||||
struct ReadDesc {
|
||||
/// An id used to resolve the readable layer within the fringe
|
||||
layer_id: LayerId,
|
||||
/// Lsn range for the read, used for selecting the next read
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
/// Data structure which maintains a fringe of layers for the
|
||||
/// read path. The fringe is the set of layers which intersects
|
||||
@@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
||||
/// a two layer indexing scheme.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LayerFringe {
|
||||
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
|
||||
layers: HashMap<ReadableLayerDesc, KeySpace>,
|
||||
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
|
||||
layers: HashMap<LayerId, LayerKeyspace>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct LayerKeyspace {
|
||||
layer: ReadableLayer,
|
||||
target_keyspace: KeySpace,
|
||||
}
|
||||
|
||||
impl LayerFringe {
|
||||
pub(crate) fn new() -> Self {
|
||||
LayerFringe {
|
||||
layers_by_lsn: BinaryHeap::new(),
|
||||
planned_reads_by_lsn: BinaryHeap::new(),
|
||||
layers: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
|
||||
let handle = match self.layers_by_lsn.pop() {
|
||||
Some(h) => h,
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
|
||||
let read_desc = match self.planned_reads_by_lsn.pop() {
|
||||
Some(desc) => desc,
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let removed = self.layers.remove_entry(&handle.0);
|
||||
let removed = self.layers.remove_entry(&read_desc.layer_id);
|
||||
match removed {
|
||||
Some((layer, keyspace)) => Some((layer, keyspace)),
|
||||
Some((
|
||||
_,
|
||||
LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace,
|
||||
},
|
||||
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
|
||||
let entry = self.layers.entry(layer.clone());
|
||||
pub(crate) fn update(
|
||||
&mut self,
|
||||
layer: ReadableLayer,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
) {
|
||||
let layer_id = layer.id();
|
||||
let entry = self.layers.entry(layer_id.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().merge(&keyspace);
|
||||
entry.get_mut().target_keyspace.merge(&keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
self.layers_by_lsn
|
||||
.push(ReadableLayerDescOrdered(entry.key().clone()));
|
||||
entry.insert(keyspace);
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
lsn_range,
|
||||
layer_id: layer_id.clone(),
|
||||
});
|
||||
entry.insert(LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace: keyspace,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -277,77 +307,55 @@ impl Default for LayerFringe {
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for ReadableLayerDescOrdered {
|
||||
impl Ord for ReadDesc {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
|
||||
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
|
||||
if ord == std::cmp::Ordering::Equal {
|
||||
self.0
|
||||
.get_lsn_floor()
|
||||
.cmp(&other.0.get_lsn_floor())
|
||||
.reverse()
|
||||
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
|
||||
} else {
|
||||
ord
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for ReadableLayerDescOrdered {
|
||||
impl PartialOrd for ReadDesc {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ReadableLayerDescOrdered {
|
||||
impl PartialEq for ReadDesc {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.get_lsn_floor() == other.0.get_lsn_floor()
|
||||
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
|
||||
self.lsn_range == other.lsn_range
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for ReadableLayerDescOrdered {}
|
||||
impl Eq for ReadDesc {}
|
||||
|
||||
impl ReadableLayerDesc {
|
||||
pub(crate) fn get_lsn_floor(&self) -> Lsn {
|
||||
impl ReadableLayer {
|
||||
pub(crate) fn id(&self) -> LayerId {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
|
||||
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
|
||||
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
|
||||
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
|
||||
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
layer_manager: &LayerManager,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { desc, lsn_range } => {
|
||||
let layer = layer_manager.get_from_desc(desc);
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
keyspace,
|
||||
lsn_range.clone(),
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
|
||||
let layer = layer_manager
|
||||
.layer_map()
|
||||
.get_in_memory_layer(handle)
|
||||
.unwrap();
|
||||
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
|
||||
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -938,7 +939,7 @@ impl DeltaLayerInner {
|
||||
}
|
||||
|
||||
if !range_end_handled {
|
||||
tracing::info!("Handling range end fallback at {}", data_end_offset);
|
||||
tracing::debug!("Handling range end fallback at {}", data_end_offset);
|
||||
planner.handle_range_end(data_end_offset);
|
||||
}
|
||||
}
|
||||
@@ -946,6 +947,34 @@ impl DeltaLayerInner {
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
fn get_min_read_buffer_size(
|
||||
planned_reads: &[VectoredRead],
|
||||
read_size_soft_max: usize,
|
||||
) -> usize {
|
||||
let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
|
||||
return read_size_soft_max;
|
||||
};
|
||||
|
||||
let largest_read_size = largest_read.size();
|
||||
if largest_read_size > read_size_soft_max {
|
||||
// If the read is oversized, it should only contain one key.
|
||||
let offenders = largest_read
|
||||
.blobs_at
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||
.join(", ");
|
||||
tracing::warn!(
|
||||
"Oversized vectored read ({} > {}) for keys {}",
|
||||
largest_read_size,
|
||||
read_size_soft_max,
|
||||
offenders
|
||||
);
|
||||
}
|
||||
|
||||
largest_read_size
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
@@ -959,7 +988,8 @@ impl DeltaLayerInner {
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into();
|
||||
let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
||||
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
|
||||
// Note that reads are processed in reverse order (from highest key+lsn).
|
||||
// This is the order that `ReconstructState` requires such that it can
|
||||
@@ -986,7 +1016,7 @@ impl DeltaLayerInner {
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
||||
buf = Some(BytesMut::with_capacity(buf_size));
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -1210,9 +1240,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
||||
mod test {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use itertools::MinMaxResult;
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
|
||||
context::DownloadBehavior,
|
||||
task_mgr::TaskKind,
|
||||
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
/// Construct an index for a fictional delta layer and and then
|
||||
@@ -1332,4 +1369,229 @@ mod test {
|
||||
|
||||
assert_eq!(planned_blobs, expected_blobs);
|
||||
}
|
||||
|
||||
mod constants {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Offset used by all lsns in this test
|
||||
pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
|
||||
/// Number of unique keys including in the test data
|
||||
pub(super) const KEY_COUNT: u8 = 60;
|
||||
/// Max number of different lsns for each key
|
||||
pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
|
||||
/// Possible value sizes for each key along with a probability weight
|
||||
pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
|
||||
/// Probability that there will be a gap between the current key and the next one (33.3%)
|
||||
pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
|
||||
/// The minimum size of a key range in all the generated reads
|
||||
pub(super) const MIN_RANGE_SIZE: i128 = 10;
|
||||
/// The number of ranges included in each vectored read
|
||||
pub(super) const RANGES_COUNT: u8 = 2;
|
||||
/// The number of vectored reads performed
|
||||
pub(super) const READS_COUNT: u8 = 100;
|
||||
/// Soft max size of a vectored read. Will be violated if we have to read keys
|
||||
/// with values larger than the limit
|
||||
pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
|
||||
}
|
||||
|
||||
struct Entry {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
value: Vec<u8>,
|
||||
}
|
||||
|
||||
fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
|
||||
let mut current_key = Key::MIN;
|
||||
|
||||
let mut entries = Vec::new();
|
||||
for _ in 0..constants::KEY_COUNT {
|
||||
let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
|
||||
let mut lsns_iter =
|
||||
std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
|
||||
Some(Lsn(lsn.0 + 0x08))
|
||||
});
|
||||
let mut lsns = Vec::new();
|
||||
while lsns.len() < count as usize {
|
||||
let take = rng.gen_bool(0.5);
|
||||
let lsn = lsns_iter.next().unwrap();
|
||||
if take {
|
||||
lsns.push(lsn);
|
||||
}
|
||||
}
|
||||
|
||||
for lsn in lsns {
|
||||
let size = constants::VALUE_SIZES
|
||||
.choose_weighted(rng, |item| item.1)
|
||||
.unwrap()
|
||||
.0;
|
||||
let mut buf = vec![0; size];
|
||||
rng.fill_bytes(&mut buf);
|
||||
|
||||
entries.push(Entry {
|
||||
key: current_key,
|
||||
lsn,
|
||||
value: buf,
|
||||
})
|
||||
}
|
||||
|
||||
let gap = constants::KEY_GAP_CHANGES
|
||||
.choose_weighted(rng, |item| item.1)
|
||||
.unwrap()
|
||||
.0;
|
||||
if gap {
|
||||
current_key = current_key.add(2);
|
||||
} else {
|
||||
current_key = current_key.add(1);
|
||||
}
|
||||
}
|
||||
|
||||
entries
|
||||
}
|
||||
|
||||
struct EntriesMeta {
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
index: BTreeMap<(Key, Lsn), Vec<u8>>,
|
||||
}
|
||||
|
||||
fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
|
||||
let key_range = match entries.iter().minmax_by_key(|e| e.key) {
|
||||
MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
|
||||
_ => panic!("More than one entry is always expected"),
|
||||
};
|
||||
|
||||
let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
|
||||
MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
|
||||
_ => panic!("More than one entry is always expected"),
|
||||
};
|
||||
|
||||
let mut index = BTreeMap::new();
|
||||
for entry in entries.iter() {
|
||||
index.insert((entry.key, entry.lsn), entry.value.clone());
|
||||
}
|
||||
|
||||
EntriesMeta {
|
||||
key_range,
|
||||
lsn_range,
|
||||
index,
|
||||
}
|
||||
}
|
||||
|
||||
fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
|
||||
let start = key_range.start.to_i128();
|
||||
let end = key_range.end.to_i128();
|
||||
|
||||
let mut keyspace = KeySpace::default();
|
||||
|
||||
for _ in 0..constants::RANGES_COUNT {
|
||||
let mut range: Option<Range<Key>> = Option::default();
|
||||
while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
|
||||
let range_start = rng.gen_range(start..end);
|
||||
let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
|
||||
if range_end_offset >= end {
|
||||
range = Some(Key::from_i128(range_start)..Key::from_i128(end));
|
||||
} else {
|
||||
let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
|
||||
range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
|
||||
}
|
||||
}
|
||||
keyspace.ranges.push(range.unwrap());
|
||||
}
|
||||
|
||||
keyspace
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
tracing::info!("Generating test data ...");
|
||||
|
||||
let rng = &mut StdRng::seed_from_u64(0);
|
||||
let entries = generate_entries(rng);
|
||||
let entries_meta = get_entries_meta(&entries);
|
||||
|
||||
tracing::info!("Done generating {} entries", entries.len());
|
||||
|
||||
tracing::info!("Writing test data to delta layer ...");
|
||||
let mut writer = DeltaLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
entries_meta.key_range.start,
|
||||
entries_meta.lsn_range.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
for entry in entries {
|
||||
let (_, res) = writer
|
||||
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
|
||||
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
|
||||
|
||||
let inner = resident.get_inner_delta(&ctx).await?;
|
||||
|
||||
let file_size = inner.file.metadata().await?.len();
|
||||
tracing::info!(
|
||||
"Done writing test data to delta layer. Resulting file size is: {}",
|
||||
file_size
|
||||
);
|
||||
|
||||
for i in 0..constants::READS_COUNT {
|
||||
tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
|
||||
|
||||
let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
|
||||
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
|
||||
let vectored_reads = DeltaLayerInner::plan_reads(
|
||||
keyspace.clone(),
|
||||
entries_meta.lsn_range.clone(),
|
||||
data_end_offset,
|
||||
index_reader,
|
||||
planner,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
|
||||
let buf_size = DeltaLayerInner::get_min_read_buffer_size(
|
||||
&vectored_reads,
|
||||
constants::MAX_VECTORED_READ_BYTES,
|
||||
);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
|
||||
for read in vectored_reads {
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"))
|
||||
.await?;
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let value = &blobs_buf.buf[meta.start..meta.end];
|
||||
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -540,7 +541,25 @@ impl ImageLayerInner {
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
for read in reads.into_iter() {
|
||||
let buf = BytesMut::with_capacity(max_vectored_read_bytes);
|
||||
let buf_size = read.size();
|
||||
|
||||
if buf_size > max_vectored_read_bytes {
|
||||
// If the read is oversized, it should only contain one key.
|
||||
let offenders = read
|
||||
.blobs_at
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||
.join(", ");
|
||||
tracing::warn!(
|
||||
"Oversized vectored read ({} > {}) for keys {}",
|
||||
buf_size,
|
||||
max_vectored_read_bytes,
|
||||
offenders
|
||||
);
|
||||
}
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf).await;
|
||||
|
||||
match res {
|
||||
|
||||
@@ -12,13 +12,14 @@ use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::walrecord;
|
||||
use crate::{page_cache, walrecord};
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
@@ -36,10 +37,14 @@ use super::{
|
||||
ValuesReconstructState,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
file_id: InMemoryLayerFileId,
|
||||
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
/// start is inclusive.
|
||||
@@ -49,6 +54,8 @@ pub struct InMemoryLayer {
|
||||
/// Writes are only allowed when this is `None`.
|
||||
end_lsn: OnceLock<Lsn>,
|
||||
|
||||
opened_at: Instant,
|
||||
|
||||
/// The above fields never change, except for `end_lsn`, which is only set once.
|
||||
/// All other changing parts are in `inner`, and protected by a mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
@@ -200,6 +207,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
|
||||
};
|
||||
|
||||
impl InMemoryLayer {
|
||||
pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
|
||||
self.file_id
|
||||
}
|
||||
|
||||
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
@@ -443,13 +454,16 @@ impl InMemoryLayer {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
||||
let key = InMemoryLayerFileId(file.id());
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
start_lsn,
|
||||
end_lsn: OnceLock::new(),
|
||||
opened_at: Instant::now(),
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
index: HashMap::new(),
|
||||
file,
|
||||
@@ -510,6 +524,10 @@ impl InMemoryLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_opened_at(&self) -> Instant {
|
||||
self.opened_at
|
||||
}
|
||||
|
||||
pub(crate) async fn tick(&self) -> Option<u64> {
|
||||
let mut inner = self.inner.write().await;
|
||||
let size = inner.file.len();
|
||||
|
||||
@@ -1759,6 +1759,18 @@ impl ResidentLayer {
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
self.owner.metadata()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn get_inner_delta<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
|
||||
let owner = &self.owner.0;
|
||||
match self.downloaded.get(owner, ctx).await? {
|
||||
LayerKind::Delta(d) => Ok(d),
|
||||
LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ResidentLayer {
|
||||
|
||||
@@ -18,7 +18,7 @@ use utils::{backoff, completion};
|
||||
|
||||
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
|
||||
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
|
||||
let permits = usize::max(
|
||||
1,
|
||||
// while a lot of the work is done on spawn_blocking, we still do
|
||||
@@ -72,6 +72,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
||||
loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
|
||||
);
|
||||
|
||||
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
|
||||
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
|
||||
Ok(permit) => permit,
|
||||
Err(_closed) => unreachable!("we never close the semaphore"),
|
||||
|
||||
@@ -9,6 +9,7 @@ pub mod uninit;
|
||||
mod walreceiver;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use arc_swap::ArcSwap;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use enumset::EnumSet;
|
||||
@@ -118,11 +119,11 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::config::TenantConf;
|
||||
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
||||
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub(super) enum FlushLoopState {
|
||||
@@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState {
|
||||
|
||||
pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
||||
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||
|
||||
myself: Weak<Self>,
|
||||
|
||||
@@ -281,10 +282,12 @@ pub struct Timeline {
|
||||
pub(super) flush_loop_state: Mutex<FlushLoopState>,
|
||||
|
||||
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
|
||||
/// The value is a counter, incremented every time a new flush cycle is requested.
|
||||
/// The flush cycle counter is sent back on the layer_flush_done channel when
|
||||
/// the flush finishes. You can use that to wait for the flush to finish.
|
||||
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
|
||||
/// - The u64 value is a counter, incremented every time a new flush cycle is requested.
|
||||
/// The flush cycle counter is sent back on the layer_flush_done channel when
|
||||
/// the flush finishes. You can use that to wait for the flush to finish.
|
||||
/// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn
|
||||
/// read by whoever sends an update
|
||||
layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>,
|
||||
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
|
||||
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
|
||||
|
||||
@@ -309,6 +312,8 @@ pub struct Timeline {
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
|
||||
last_image_layer_creation_check_at: AtomicLsn,
|
||||
|
||||
/// Current logical size of the "datadir", at the last LSN.
|
||||
current_logical_size: LogicalSize,
|
||||
|
||||
@@ -610,6 +615,25 @@ pub enum GetVectoredImpl {
|
||||
Vectored,
|
||||
}
|
||||
|
||||
pub(crate) enum WaitLsnWaiter<'a> {
|
||||
Timeline(&'a Timeline),
|
||||
Tenant,
|
||||
PageService,
|
||||
}
|
||||
|
||||
/// Argument to [`Timeline::shutdown`].
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) enum ShutdownMode {
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
||||
///
|
||||
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
|
||||
/// the call to [`Timeline::shutdown`].
|
||||
FreezeAndFlush,
|
||||
/// Shut down immediately, without waiting for any open layers to flush.
|
||||
Hard,
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -1058,7 +1082,8 @@ impl Timeline {
|
||||
pub(crate) async fn wait_lsn(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
_ctx: &RequestContext, /* Prepare for use by cancellation */
|
||||
who_is_waiting: WaitLsnWaiter<'_>,
|
||||
ctx: &RequestContext, /* Prepare for use by cancellation */
|
||||
) -> Result<(), WaitLsnError> {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(WaitLsnError::Shutdown);
|
||||
@@ -1066,20 +1091,28 @@ impl Timeline {
|
||||
return Err(WaitLsnError::BadState);
|
||||
}
|
||||
|
||||
// This should never be called from the WAL receiver, because that could lead
|
||||
// to a deadlock.
|
||||
debug_assert!(
|
||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
debug_assert!(
|
||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
debug_assert!(
|
||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
if cfg!(debug_assertions) {
|
||||
match ctx.task_kind() {
|
||||
TaskKind::WalReceiverManager
|
||||
| TaskKind::WalReceiverConnectionHandler
|
||||
| TaskKind::WalReceiverConnectionPoller => {
|
||||
let is_myself = match who_is_waiting {
|
||||
WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
|
||||
WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
|
||||
};
|
||||
if is_myself {
|
||||
if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
|
||||
// walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
|
||||
panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
|
||||
}
|
||||
} else {
|
||||
// if another timeline's is waiting for us, there's no deadlock risk because
|
||||
// our walreceiver task can make progress independent of theirs
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
|
||||
|
||||
@@ -1138,8 +1171,8 @@ impl Timeline {
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
||||
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
||||
self.freeze_inmem_layer(false).await;
|
||||
self.flush_frozen_layers_and_wait().await
|
||||
let to_lsn = self.freeze_inmem_layer(false).await;
|
||||
self.flush_frozen_layers_and_wait(to_lsn).await
|
||||
}
|
||||
|
||||
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
|
||||
@@ -1159,7 +1192,39 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
|
||||
// No open layer, no work to do.
|
||||
// If there is no open layer, we have no layer freezing to do. However, we might need to generate
|
||||
// some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
|
||||
// that didn't result in writes to this shard.
|
||||
|
||||
// Must not hold the layers lock while waiting for a flush.
|
||||
drop(layers_guard);
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let disk_consistent_lsn = self.get_disk_consistent_lsn();
|
||||
if last_record_lsn > disk_consistent_lsn {
|
||||
// We have no open layer, but disk_consistent_lsn is behind the last record: this indicates
|
||||
// we are a sharded tenant and have skipped some WAL
|
||||
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
|
||||
if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
|
||||
// This should be somewhat rare, so we log it at INFO level.
|
||||
//
|
||||
// We checked for checkpoint timeout so that a shard without any
|
||||
// data ingested (yet) doesn't write a remote index as soon as it
|
||||
// sees its LSN advance: we only do this if we've been layer-less
|
||||
// for some time.
|
||||
tracing::info!(
|
||||
"Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
|
||||
disk_consistent_lsn,
|
||||
last_record_lsn
|
||||
);
|
||||
|
||||
// The flush loop will update remote consistent LSN as well as disk consistent LSN.
|
||||
self.flush_frozen_layers_and_wait(last_record_lsn)
|
||||
.await
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
};
|
||||
|
||||
@@ -1192,7 +1257,7 @@ impl Timeline {
|
||||
checkpoint_distance,
|
||||
self.get_last_record_lsn(),
|
||||
self.last_freeze_at.load(),
|
||||
*self.last_freeze_ts.read().unwrap(),
|
||||
open_layer.get_opened_at(),
|
||||
) {
|
||||
match open_layer.info() {
|
||||
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
|
||||
@@ -1279,7 +1344,7 @@ impl Timeline {
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
if self.tenant_shard_id.is_zero() {
|
||||
if self.tenant_shard_id.is_shard_zero() {
|
||||
// Logical size is only maintained accurately on shard zero.
|
||||
self.spawn_initial_logical_size_computation_task(ctx);
|
||||
}
|
||||
@@ -1288,83 +1353,119 @@ impl Timeline {
|
||||
self.launch_eviction_task(parent, background_jobs_can_start);
|
||||
}
|
||||
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
||||
/// After this function returns, there are no timeline-scoped tasks are left running.
|
||||
///
|
||||
/// While we are flushing, we continue to accept read I/O.
|
||||
pub(crate) async fn flush_and_shutdown(&self) {
|
||||
/// The preferred pattern for is:
|
||||
/// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
|
||||
/// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
|
||||
/// go the extra mile and keep track of JoinHandles
|
||||
/// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
|
||||
/// instead of spawning directly on a runtime. It is a more composable / testable pattern.
|
||||
///
|
||||
/// For legacy reasons, we still have multiple tasks spawned using
|
||||
/// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
|
||||
/// We refer to these as "timeline-scoped task_mgr tasks".
|
||||
/// Some of these tasks are already sensitive to Timeline::cancel while others are
|
||||
/// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
|
||||
/// or [`task_mgr::shutdown_watcher`].
|
||||
/// We want to gradually convert the code base away from these.
|
||||
///
|
||||
/// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
|
||||
/// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
|
||||
/// ones that aren't mentioned here):
|
||||
/// - [`TaskKind::TimelineDeletionWorker`]
|
||||
/// - NB: also used for tenant deletion
|
||||
/// - [`TaskKind::RemoteUploadTask`]`
|
||||
/// - [`TaskKind::InitialLogicalSizeCalculation`]
|
||||
/// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
|
||||
// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
|
||||
/// - [`TaskKind::Eviction`]
|
||||
/// - [`TaskKind::LayerFlushTask`]
|
||||
/// - [`TaskKind::OndemandLogicalSizeCalculation`]
|
||||
/// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
|
||||
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
|
||||
// trying to flush
|
||||
tracing::debug!("Waiting for WalReceiverManager...");
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
let try_freeze_and_flush = match mode {
|
||||
ShutdownMode::FreezeAndFlush => true,
|
||||
ShutdownMode::Hard => false,
|
||||
};
|
||||
|
||||
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
|
||||
// Regardless of whether we're going to try_freeze_and_flush
|
||||
// or not, stop ingesting any more data. Walreceiver only provides
|
||||
// cancellation but no "wait until gone", because it uses the Timeline::gate.
|
||||
// So, only after the self.gate.close() below will we know for sure that
|
||||
// no walreceiver tasks are left.
|
||||
// For `try_freeze_and_flush=true`, this means that we might still be ingesting
|
||||
// data during the call to `self.freeze_and_flush()` below.
|
||||
// That's not ideal, but, we don't have the concept of a ChildGuard,
|
||||
// which is what we'd need to properly model early shutdown of the walreceiver
|
||||
// task sub-tree before the other Timeline task sub-trees.
|
||||
let walreceiver = self.walreceiver.lock().unwrap().take();
|
||||
tracing::debug!(
|
||||
is_some = walreceiver.is_some(),
|
||||
"Waiting for WalReceiverManager..."
|
||||
);
|
||||
if let Some(walreceiver) = walreceiver {
|
||||
walreceiver.cancel();
|
||||
}
|
||||
// ... and inform any waiters for newer LSNs that there won't be any.
|
||||
self.last_record_lsn.shutdown();
|
||||
|
||||
// now all writers to InMemory layer are gone, do the final flush if requested
|
||||
match self.freeze_and_flush().await {
|
||||
Ok(_) => {
|
||||
// drain the upload queue
|
||||
if let Some(client) = self.remote_client.as_ref() {
|
||||
// if we did not wait for completion here, it might be our shutdown process
|
||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||
// be spawned.
|
||||
//
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.shutdown().await;
|
||||
if try_freeze_and_flush {
|
||||
// we shut down walreceiver above, so, we won't add anything more
|
||||
// to the InMemoryLayer; freeze it and wait for all frozen layers
|
||||
// to reach the disk & upload queue, then shut the upload queue and
|
||||
// wait for it to drain.
|
||||
match self.freeze_and_flush().await {
|
||||
Ok(_) => {
|
||||
// drain the upload queue
|
||||
if let Some(client) = self.remote_client.as_ref() {
|
||||
// if we did not wait for completion here, it might be our shutdown process
|
||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||
// be spawned.
|
||||
//
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.shutdown().await;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||
// we have some extra WAL replay to do next time the timeline starts.
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||
// we have some extra WAL replay to do next time the timeline starts.
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
self.shutdown().await;
|
||||
}
|
||||
|
||||
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
|
||||
/// the graceful [`Timeline::flush_and_shutdown`] function.
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Signal any subscribers to our cancellation token to drop out
|
||||
tracing::debug!("Cancelling CancellationToken");
|
||||
self.cancel.cancel();
|
||||
|
||||
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
|
||||
// while doing so.
|
||||
self.last_record_lsn.shutdown();
|
||||
|
||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::LayerFlushTask),
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
|
||||
// case our caller wants to use that for a deletion
|
||||
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
||||
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
|
||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||
remote_client.stop();
|
||||
// As documented in remote_client.stop()'s doc comment, it's our responsibility
|
||||
// to shut down the upload queue tasks.
|
||||
// TODO: fix that, task management should be encapsulated inside remote_client.
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::RemoteUploadTask),
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
// TODO: work toward making this a no-op. See this funciton's doc comment for more context.
|
||||
tracing::debug!("Waiting for tasks...");
|
||||
|
||||
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
|
||||
|
||||
// Finally wait until any gate-holders are complete
|
||||
// Finally wait until any gate-holders are complete.
|
||||
//
|
||||
// TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
|
||||
// and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
|
||||
self.gate.close().await;
|
||||
|
||||
self.metrics.shutdown();
|
||||
@@ -1521,7 +1622,7 @@ impl Timeline {
|
||||
checkpoint_distance: u64,
|
||||
projected_lsn: Lsn,
|
||||
last_freeze_at: Lsn,
|
||||
last_freeze_ts: Instant,
|
||||
opened_at: Instant,
|
||||
) -> bool {
|
||||
let distance = projected_lsn.widening_sub(last_freeze_at);
|
||||
|
||||
@@ -1547,13 +1648,13 @@ impl Timeline {
|
||||
);
|
||||
|
||||
true
|
||||
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
|
||||
} else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
last_freeze_ts.elapsed()
|
||||
);
|
||||
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
opened_at.elapsed()
|
||||
);
|
||||
|
||||
true
|
||||
} else {
|
||||
@@ -1568,57 +1669,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
|
||||
// Private functions
|
||||
impl Timeline {
|
||||
pub(crate) fn get_lazy_slru_download(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.lazy_slru_download
|
||||
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
|
||||
}
|
||||
|
||||
fn get_checkpoint_distance(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.checkpoint_distance
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
|
||||
}
|
||||
|
||||
fn get_checkpoint_timeout(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.checkpoint_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
||||
}
|
||||
|
||||
fn get_compaction_target_size(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.compaction_target_size
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
||||
}
|
||||
|
||||
fn get_compaction_threshold(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.compaction_threshold
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||
}
|
||||
|
||||
fn get_image_creation_threshold(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.image_creation_threshold
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
|
||||
let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
|
||||
let tenant_conf = &self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.compaction_algorithm
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
|
||||
}
|
||||
|
||||
fn get_eviction_policy(&self) -> EvictionPolicy {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.eviction_policy
|
||||
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
|
||||
}
|
||||
@@ -1632,14 +1741,26 @@ impl Timeline {
|
||||
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
|
||||
}
|
||||
|
||||
pub(super) fn tenant_conf_updated(&self) {
|
||||
fn get_image_layer_creation_check_threshold(&self) -> u8 {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(
|
||||
self.conf
|
||||
.default_tenant_conf
|
||||
.image_layer_creation_check_threshold,
|
||||
)
|
||||
}
|
||||
|
||||
pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
|
||||
// NB: Most tenant conf options are read by background loops, so,
|
||||
// changes will automatically be picked up.
|
||||
|
||||
// The threshold is embedded in the metric. So, we need to update it.
|
||||
{
|
||||
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
|
||||
&self.tenant_conf.read().unwrap().tenant_conf,
|
||||
new_conf,
|
||||
&self.conf.default_tenant_conf,
|
||||
);
|
||||
|
||||
@@ -1666,7 +1787,7 @@ impl Timeline {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
||||
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||
metadata: &TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
timeline_id: TimelineId,
|
||||
@@ -1682,17 +1803,16 @@ impl Timeline {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let (state, _) = watch::channel(state);
|
||||
|
||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
|
||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
|
||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||
|
||||
let tenant_conf_guard = tenant_conf.read().unwrap();
|
||||
|
||||
let evictions_low_residence_duration_metric_threshold =
|
||||
let evictions_low_residence_duration_metric_threshold = {
|
||||
let loaded_tenant_conf = tenant_conf.load();
|
||||
Self::get_evictions_low_residence_duration_metric_threshold(
|
||||
&tenant_conf_guard.tenant_conf,
|
||||
&loaded_tenant_conf.tenant_conf,
|
||||
&conf.default_tenant_conf,
|
||||
);
|
||||
drop(tenant_conf_guard);
|
||||
)
|
||||
};
|
||||
|
||||
Arc::new_cyclic(|myself| {
|
||||
let mut result = Timeline {
|
||||
@@ -1769,6 +1889,7 @@ impl Timeline {
|
||||
},
|
||||
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
|
||||
repartition_threshold: 0,
|
||||
last_image_layer_creation_check_at: AtomicLsn::new(0),
|
||||
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_cache: RwLock::new(HashMap::new()),
|
||||
@@ -1797,6 +1918,7 @@ impl Timeline {
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
|
||||
result
|
||||
.metrics
|
||||
.last_record_gauge
|
||||
@@ -1873,20 +1995,19 @@ impl Timeline {
|
||||
self.timeline_id, self.tenant_shard_id
|
||||
);
|
||||
|
||||
let tenant_conf_guard = self.tenant_conf.read().unwrap();
|
||||
let wal_connect_timeout = tenant_conf_guard
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let wal_connect_timeout = tenant_conf
|
||||
.tenant_conf
|
||||
.walreceiver_connect_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
|
||||
let lagging_wal_timeout = tenant_conf_guard
|
||||
let lagging_wal_timeout = tenant_conf
|
||||
.tenant_conf
|
||||
.lagging_wal_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
|
||||
let max_lsn_wal_lag = tenant_conf_guard
|
||||
let max_lsn_wal_lag = tenant_conf
|
||||
.tenant_conf
|
||||
.max_lsn_wal_lag
|
||||
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
|
||||
drop(tenant_conf_guard);
|
||||
|
||||
let mut guard = self.walreceiver.lock().unwrap();
|
||||
assert!(
|
||||
@@ -2116,7 +2237,7 @@ impl Timeline {
|
||||
priority: GetLogicalSizePriority,
|
||||
ctx: &RequestContext,
|
||||
) -> logical_size::CurrentLogicalSize {
|
||||
if !self.tenant_shard_id.is_zero() {
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
// Logical size is only accurately maintained on shard zero: when called elsewhere, for example
|
||||
// when HTTP API is serving a GET for timeline zero, return zero
|
||||
return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
|
||||
@@ -2412,7 +2533,7 @@ impl Timeline {
|
||||
crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
// We should never be calculating logical sizes on shard !=0, because these shards do not have
|
||||
// accurate relation sizes, and they do not emit consumption metrics.
|
||||
debug_assert!(self.tenant_shard_id.is_zero());
|
||||
debug_assert!(self.tenant_shard_id.is_shard_zero());
|
||||
|
||||
let guard = self
|
||||
.gate
|
||||
@@ -2434,10 +2555,6 @@ impl Timeline {
|
||||
debug!("cancelling logical size calculation for timeline shutdown");
|
||||
calculation.await
|
||||
}
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
debug!("cancelling logical size calculation for task shutdown");
|
||||
calculation.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2892,16 +3009,6 @@ impl Timeline {
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
|
||||
// Hold the layer map whilst visiting the timeline to prevent
|
||||
// compaction, eviction and flushes from rendering the layers unreadable.
|
||||
//
|
||||
// TODO: Do we actually need to do this? In theory holding on
|
||||
// to [`tenant::storage_layer::Layer`] should be enough. However,
|
||||
// [`Timeline::get`] also holds the lock during IO, so more investigation
|
||||
// is needed.
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
loop {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
@@ -2911,6 +3018,9 @@ impl Timeline {
|
||||
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
||||
completed_keyspace.merge(&keys_done_last_step);
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
let in_memory_layer = layers.find_in_memory_layer(|l| {
|
||||
let start_lsn = l.get_lsn_range().start;
|
||||
cont_lsn > start_lsn
|
||||
@@ -2918,12 +3028,11 @@ impl Timeline {
|
||||
|
||||
match in_memory_layer {
|
||||
Some(l) => {
|
||||
let lsn_range = l.get_lsn_range().start..cont_lsn;
|
||||
fringe.update(
|
||||
ReadableLayerDesc::InMemory {
|
||||
handle: l,
|
||||
lsn_ceil: cont_lsn,
|
||||
},
|
||||
ReadableLayer::InMemoryLayer(l),
|
||||
unmapped_keyspace.clone(),
|
||||
lsn_range,
|
||||
);
|
||||
}
|
||||
None => {
|
||||
@@ -2935,30 +3044,43 @@ impl Timeline {
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
ReadableLayerDesc::Persistent {
|
||||
desc: (*layer).clone(),
|
||||
lsn_range: lsn_floor..cont_lsn,
|
||||
},
|
||||
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
// The fringe keeps readable handles for the layers which are safe to read even
|
||||
// if layers were compacted or flushed.
|
||||
//
|
||||
// The more interesting consideration is: "Why is the read algorithm still correct
|
||||
// if the layer map changes while it is operating?". Doing a vectored read on a
|
||||
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
|
||||
// covered by the read. The layer map tells us how to move the lsn downwards for a
|
||||
// range at *a particular point in time*. It is fine for the answer to be different
|
||||
// at two different time points.
|
||||
drop(guard);
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
&guard,
|
||||
keyspace_to_read.clone(),
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = layer_to_read.get_lsn_floor();
|
||||
cont_lsn = next_cont_lsn;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@@ -3036,7 +3158,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
ancestor
|
||||
.wait_lsn(self.ancestor_lsn, ctx)
|
||||
.wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
|
||||
@@ -3086,7 +3208,9 @@ impl Timeline {
|
||||
self.last_record_lsn.advance(new_lsn);
|
||||
}
|
||||
|
||||
async fn freeze_inmem_layer(&self, write_lock_held: bool) {
|
||||
/// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
|
||||
/// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
|
||||
async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
|
||||
// Freeze the current open in-memory layer. It will be written to disk on next
|
||||
// iteration.
|
||||
|
||||
@@ -3096,7 +3220,9 @@ impl Timeline {
|
||||
Some(self.write_lock.lock().await)
|
||||
};
|
||||
|
||||
self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
|
||||
let to_lsn = self.get_last_record_lsn();
|
||||
self.freeze_inmem_layer_at(to_lsn).await;
|
||||
to_lsn
|
||||
}
|
||||
|
||||
async fn freeze_inmem_layer_at(&self, at: Lsn) {
|
||||
@@ -3109,25 +3235,24 @@ impl Timeline {
|
||||
/// Layer flusher task's main loop.
|
||||
async fn flush_loop(
|
||||
self: &Arc<Self>,
|
||||
mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
|
||||
mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
info!("started flush loop");
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = self.cancel.cancelled() => {
|
||||
info!("shutting down layer flush task");
|
||||
break;
|
||||
},
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("shutting down layer flush task");
|
||||
info!("shutting down layer flush task due to Timeline::cancel");
|
||||
break;
|
||||
},
|
||||
_ = layer_flush_start_rx.changed() => {}
|
||||
}
|
||||
|
||||
trace!("waking up");
|
||||
let flush_counter = *layer_flush_start_rx.borrow();
|
||||
let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();
|
||||
|
||||
// The highest LSN to which we flushed in the loop over frozen layers
|
||||
let mut flushed_to_lsn = Lsn(0);
|
||||
|
||||
let result = loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
info!("dropping out of flush loop for timeline shutdown");
|
||||
@@ -3148,7 +3273,9 @@ impl Timeline {
|
||||
break Ok(());
|
||||
};
|
||||
match self.flush_frozen_layer(layer_to_flush, ctx).await {
|
||||
Ok(()) => {}
|
||||
Ok(this_layer_to_lsn) => {
|
||||
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
|
||||
}
|
||||
Err(FlushLayerError::Cancelled) => {
|
||||
info!("dropping out of flush loop for timeline shutdown");
|
||||
return;
|
||||
@@ -3157,11 +3284,36 @@ impl Timeline {
|
||||
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
|
||||
) => {
|
||||
error!("could not flush frozen layer: {err:?}");
|
||||
break err;
|
||||
break err.map(|_| ());
|
||||
}
|
||||
}
|
||||
timer.stop_and_record();
|
||||
};
|
||||
|
||||
// Unsharded tenants should never advance their LSN beyond the end of the
|
||||
// highest layer they write: such gaps between layer data and the frozen LSN
|
||||
// are only legal on sharded tenants.
|
||||
debug_assert!(
|
||||
self.shard_identity.count.count() > 1
|
||||
|| flushed_to_lsn >= frozen_to_lsn
|
||||
|| !flushed_to_lsn.is_valid()
|
||||
);
|
||||
|
||||
if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
|
||||
// If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
|
||||
// to us via layer_flush_start_rx, then advance it here.
|
||||
//
|
||||
// This path is only taken for tenants with multiple shards: single sharded tenants should
|
||||
// never encounter a gap in the wal.
|
||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}");
|
||||
if self.set_disk_consistent_lsn(frozen_to_lsn) {
|
||||
if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
|
||||
tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Notify any listeners that we're done
|
||||
let _ = self
|
||||
.layer_flush_done_tx
|
||||
@@ -3169,7 +3321,13 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
|
||||
/// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
|
||||
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
|
||||
///
|
||||
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
|
||||
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
|
||||
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
|
||||
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut rx = self.layer_flush_done_tx.subscribe();
|
||||
|
||||
// Increment the flush cycle counter and wake up the flush task.
|
||||
@@ -3183,9 +3341,10 @@ impl Timeline {
|
||||
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
|
||||
}
|
||||
|
||||
self.layer_flush_start_tx.send_modify(|counter| {
|
||||
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||
my_flush_request = *counter + 1;
|
||||
*counter = my_flush_request;
|
||||
*lsn = std::cmp::max(last_record_lsn, *lsn);
|
||||
});
|
||||
|
||||
loop {
|
||||
@@ -3222,16 +3381,22 @@ impl Timeline {
|
||||
}
|
||||
|
||||
fn flush_frozen_layers(&self) {
|
||||
self.layer_flush_start_tx.send_modify(|val| *val += 1);
|
||||
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||
*counter += 1;
|
||||
|
||||
*lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
|
||||
});
|
||||
}
|
||||
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
///
|
||||
/// Return value is the last lsn (inclusive) of the layer that was frozen.
|
||||
#[instrument(skip_all, fields(layer=%frozen_layer))]
|
||||
async fn flush_frozen_layer(
|
||||
self: &Arc<Self>,
|
||||
frozen_layer: Arc<InMemoryLayer>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), FlushLayerError> {
|
||||
) -> Result<Lsn, FlushLayerError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// As a special case, when we have just imported an image into the repository,
|
||||
@@ -3306,7 +3471,6 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
|
||||
// The new on-disk layers are now in the layer map. We can remove the
|
||||
// in-memory layer from the map now. The flushed layer is stored in
|
||||
@@ -3320,10 +3484,7 @@ impl Timeline {
|
||||
|
||||
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
|
||||
|
||||
if disk_consistent_lsn != old_disk_consistent_lsn {
|
||||
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
||||
|
||||
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
|
||||
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
||||
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
|
||||
}
|
||||
@@ -3340,7 +3501,22 @@ impl Timeline {
|
||||
// This failpoint is used by another test case `test_pageserver_recovery`.
|
||||
fail_point!("flush-frozen-exit");
|
||||
|
||||
Ok(())
|
||||
Ok(Lsn(lsn_range.end.0 - 1))
|
||||
}
|
||||
|
||||
/// Return true if the value changed
|
||||
///
|
||||
/// This function must only be used from the layer flush task, and may not be called concurrently.
|
||||
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
|
||||
// We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
|
||||
let old_value = self.disk_consistent_lsn.load();
|
||||
if new_value != old_value {
|
||||
assert!(new_value >= old_value);
|
||||
self.disk_consistent_lsn.store(new_value);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Update metadata file
|
||||
@@ -3501,6 +3677,24 @@ impl Timeline {
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
let last = self.last_image_layer_creation_check_at.load();
|
||||
if lsn != Lsn(0) {
|
||||
let distance = lsn
|
||||
.checked_sub(last)
|
||||
.expect("Attempt to compact with LSN going backwards");
|
||||
|
||||
let min_distance = self.get_image_layer_creation_check_threshold() as u64
|
||||
* self.get_checkpoint_distance();
|
||||
|
||||
// Skip the expensive delta layer counting below if we've not ingested
|
||||
// sufficient WAL since the last check.
|
||||
if distance.0 < min_distance {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
self.last_image_layer_creation_check_at.store(lsn);
|
||||
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
@@ -3842,6 +4036,24 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Schedules the uploads of the given image layers
|
||||
fn upload_new_image_layers(
|
||||
self: &Arc<Self>,
|
||||
new_images: impl IntoIterator<Item = ResidentLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(remote_client) = &self.remote_client else {
|
||||
return Ok(());
|
||||
};
|
||||
for layer in new_images {
|
||||
remote_client.schedule_layer_file_upload(layer)?;
|
||||
}
|
||||
// should any new image layer been created, not uploading index_part will
|
||||
// result in a mismatch between remote_physical_size and layermap calculated
|
||||
// size, which will fail some tests, but should not be an issue otherwise.
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update information about which layer files need to be retained on
|
||||
/// garbage collection. This is separate from actually performing the GC,
|
||||
/// and is updated more frequently, so that compaction can remove obsolete
|
||||
@@ -4491,23 +4703,16 @@ struct TimelineWriterState {
|
||||
max_lsn: Option<Lsn>,
|
||||
// Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
|
||||
cached_last_freeze_at: Lsn,
|
||||
cached_last_freeze_ts: Instant,
|
||||
}
|
||||
|
||||
impl TimelineWriterState {
|
||||
fn new(
|
||||
open_layer: Arc<InMemoryLayer>,
|
||||
current_size: u64,
|
||||
last_freeze_at: Lsn,
|
||||
last_freeze_ts: Instant,
|
||||
) -> Self {
|
||||
fn new(open_layer: Arc<InMemoryLayer>, current_size: u64, last_freeze_at: Lsn) -> Self {
|
||||
Self {
|
||||
open_layer,
|
||||
current_size,
|
||||
prev_lsn: None,
|
||||
max_lsn: None,
|
||||
cached_last_freeze_at: last_freeze_at,
|
||||
cached_last_freeze_ts: last_freeze_ts,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4606,12 +4811,10 @@ impl<'a> TimelineWriter<'a> {
|
||||
let initial_size = layer.size().await?;
|
||||
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
|
||||
self.write_guard.replace(TimelineWriterState::new(
|
||||
layer,
|
||||
initial_size,
|
||||
last_freeze_at,
|
||||
last_freeze_ts,
|
||||
));
|
||||
|
||||
Ok(())
|
||||
@@ -4658,7 +4861,7 @@ impl<'a> TimelineWriter<'a> {
|
||||
self.get_checkpoint_distance(),
|
||||
lsn,
|
||||
state.cached_last_freeze_at,
|
||||
state.cached_last_freeze_ts,
|
||||
state.open_layer.get_opened_at(),
|
||||
) {
|
||||
OpenLayerAction::Roll
|
||||
} else {
|
||||
|
||||
@@ -12,7 +12,6 @@ use super::layer_manager::LayerManager;
|
||||
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use async_trait::async_trait;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
@@ -125,18 +124,8 @@ impl Timeline {
|
||||
)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
for layer in layers {
|
||||
remote_client.schedule_layer_file_upload(layer)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
// should any new image layer been created, not uploading index_part will
|
||||
// result in a mismatch between remote_physical_size and layermap calculated
|
||||
// size, which will fail some tests, but should not be an issue otherwise.
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
}
|
||||
self.upload_new_image_layers(layers)?;
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -818,7 +807,10 @@ impl TimelineAdaptor {
|
||||
self.timeline
|
||||
.finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
|
||||
.await?;
|
||||
self.new_images.clear();
|
||||
|
||||
self.timeline
|
||||
.upload_new_image_layers(std::mem::take(&mut self.new_images))?;
|
||||
|
||||
self.new_deltas.clear();
|
||||
self.layers_to_delete.clear();
|
||||
Ok(())
|
||||
@@ -1129,7 +1121,6 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
|
||||
type DeltaEntry<'a> = DeltaEntry<'a>;
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{
|
||||
use anyhow::Context;
|
||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{debug, error, info, instrument, Instrument};
|
||||
use tracing::{error, info, instrument, Instrument};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId};
|
||||
|
||||
use crate::{
|
||||
@@ -14,7 +14,6 @@ use crate::{
|
||||
deletion_queue::DeletionQueueClient,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant,
|
||||
@@ -23,58 +22,6 @@ use crate::{
|
||||
|
||||
use super::{Timeline, TimelineResources};
|
||||
|
||||
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
// Notify any timeline work to drop out of loops/requests
|
||||
tracing::debug!("Cancelling CancellationToken");
|
||||
timeline.cancel.cancel();
|
||||
|
||||
// Stop the walreceiver first.
|
||||
debug!("waiting for wal receiver to shutdown");
|
||||
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
||||
if let Some(walreceiver) = maybe_started_walreceiver {
|
||||
walreceiver.stop().await;
|
||||
}
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
|
||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::LayerFlushTask),
|
||||
Some(timeline.tenant_shard_id),
|
||||
Some(timeline.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
remote_client.stop();
|
||||
}
|
||||
|
||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
||||
// NB: This and other delete_timeline calls do not run as a task_mgr task,
|
||||
// so, they are not affected by this shutdown_tasks() call.
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(
|
||||
None,
|
||||
Some(timeline.tenant_shard_id),
|
||||
Some(timeline.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-deleted-at"
|
||||
))?
|
||||
});
|
||||
|
||||
tracing::debug!("Waiting for gate...");
|
||||
timeline.gate.close().await;
|
||||
tracing::debug!("Shutdown complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||
/// during attach or pageserver restart.
|
||||
/// See comment in persist_index_part_with_deleted_flag.
|
||||
@@ -268,7 +215,14 @@ impl DeleteTimelineFlow {
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
stop_tasks(&timeline).await?;
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-deleted-at"
|
||||
))?
|
||||
});
|
||||
|
||||
set_deleted_in_remote_index(&timeline).await?;
|
||||
|
||||
|
||||
@@ -67,20 +67,19 @@ impl Timeline {
|
||||
),
|
||||
false,
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); }
|
||||
_ = self_clone.cancel.cancelled() => { return Ok(()); }
|
||||
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
|
||||
};
|
||||
|
||||
self_clone.eviction_task(parent, cancel).await;
|
||||
self_clone.eviction_task(parent).await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
|
||||
// acquire the gate guard only once within a useful span
|
||||
@@ -95,7 +94,7 @@ impl Timeline {
|
||||
EvictionPolicy::OnlyImitiate(lat) => lat.period,
|
||||
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
||||
};
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
if random_init_delay(period, &self.cancel).await.is_err() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -104,13 +103,13 @@ impl Timeline {
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self
|
||||
.eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx)
|
||||
.eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
|
||||
.await;
|
||||
|
||||
match cf {
|
||||
ControlFlow::Break(()) => break,
|
||||
ControlFlow::Continue(sleep_until) => {
|
||||
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
|
||||
if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
@@ -379,7 +378,7 @@ impl Timeline {
|
||||
gate: &GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
if !self.tenant_shard_id.is_zero() {
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
// Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
|
||||
// for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
|
||||
// skip imitating logical size accesses for eviction purposes.
|
||||
|
||||
@@ -120,9 +120,10 @@ impl LayerManager {
|
||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||
pub(crate) async fn try_freeze_in_memory_layer(
|
||||
&mut self,
|
||||
Lsn(last_record_lsn): Lsn,
|
||||
lsn: Lsn,
|
||||
last_freeze_at: &AtomicLsn,
|
||||
) {
|
||||
let Lsn(last_record_lsn) = lsn;
|
||||
let end_lsn = Lsn(last_record_lsn + 1);
|
||||
|
||||
if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
@@ -135,8 +136,11 @@ impl LayerManager {
|
||||
self.layer_map.frozen_layers.push_back(open_layer_rc);
|
||||
self.layer_map.open_layer = None;
|
||||
self.layer_map.next_open_layer_at = Some(end_lsn);
|
||||
last_freeze_at.store(end_lsn);
|
||||
}
|
||||
|
||||
// Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
|
||||
// accounts for regions in the LSN range where we might have ingested no data due to sharding.
|
||||
last_freeze_at.store(end_lsn);
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
|
||||
@@ -24,13 +24,12 @@ mod connection_manager;
|
||||
mod walreceiver_connection;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
|
||||
use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::timeline::walreceiver::connection_manager::{
|
||||
connection_manager_loop_step, ConnectionManagerState,
|
||||
};
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
@@ -40,8 +39,6 @@ use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use self::connection_manager::ConnectionManagerStatus;
|
||||
|
||||
use super::Timeline;
|
||||
@@ -60,9 +57,10 @@ pub struct WalReceiverConf {
|
||||
}
|
||||
|
||||
pub struct WalReceiver {
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
|
||||
/// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
|
||||
/// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl WalReceiver {
|
||||
@@ -76,23 +74,23 @@ impl WalReceiver {
|
||||
let timeline_id = timeline.timeline_id;
|
||||
let walreceiver_ctx =
|
||||
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
|
||||
|
||||
let loop_status = Arc::new(std::sync::RwLock::new(None));
|
||||
let manager_status = Arc::clone(&loop_status);
|
||||
task_mgr::spawn(
|
||||
WALRECEIVER_RUNTIME.handle(),
|
||||
TaskKind::WalReceiverManager,
|
||||
Some(timeline.tenant_shard_id),
|
||||
Some(timeline_id),
|
||||
&format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
|
||||
false,
|
||||
let cancel = timeline.cancel.child_token();
|
||||
WALRECEIVER_RUNTIME.spawn({
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
// acquire timeline gate so we know the task doesn't outlive the Timeline
|
||||
let Ok(_guard) = timeline.gate.enter() else {
|
||||
debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
|
||||
return;
|
||||
};
|
||||
debug!("WAL receiver manager started, connecting to broker");
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let mut connection_manager_state = ConnectionManagerState::new(
|
||||
timeline,
|
||||
conf,
|
||||
cancel.clone(),
|
||||
);
|
||||
while !cancel.is_cancelled() {
|
||||
let loop_step_result = connection_manager_loop_step(
|
||||
@@ -112,25 +110,22 @@ impl WalReceiver {
|
||||
}
|
||||
connection_manager_state.shutdown().await;
|
||||
*loop_status.write().unwrap() = None;
|
||||
Ok(())
|
||||
debug!("task exits");
|
||||
}
|
||||
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
|
||||
);
|
||||
});
|
||||
|
||||
Self {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
manager_status,
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn stop(self) {
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
#[instrument(skip_all, level = tracing::Level::DEBUG)]
|
||||
pub fn cancel(&self) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
debug!("cancelling walreceiver tasks");
|
||||
self.cancel.cancel();
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
|
||||
@@ -164,14 +159,18 @@ enum TaskStateUpdate<E> {
|
||||
|
||||
impl<E: Clone> TaskHandle<E> {
|
||||
/// Initializes the task, starting it immediately after the creation.
|
||||
///
|
||||
/// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
|
||||
/// It being a child token enables us to provide a [`Self::shutdown`] method.
|
||||
fn spawn<Fut>(
|
||||
cancel_parent: &CancellationToken,
|
||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
|
||||
) -> Self
|
||||
where
|
||||
Fut: Future<Output = anyhow::Result<()>> + Send,
|
||||
E: Send + Sync + 'static,
|
||||
{
|
||||
let cancellation = CancellationToken::new();
|
||||
let cancellation = cancel_parent.child_token();
|
||||
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
||||
|
||||
let cancellation_clone = cancellation.clone();
|
||||
|
||||
@@ -280,6 +280,8 @@ pub(super) struct ConnectionManagerState {
|
||||
id: TenantTimelineId,
|
||||
/// Use pageserver data about the timeline to filter out some of the safekeepers.
|
||||
timeline: Arc<Timeline>,
|
||||
/// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
|
||||
cancel: CancellationToken,
|
||||
conf: WalReceiverConf,
|
||||
/// Current connection to safekeeper for WAL streaming.
|
||||
wal_connection: Option<WalConnection>,
|
||||
@@ -402,7 +404,11 @@ struct BrokerSkTimeline {
|
||||
}
|
||||
|
||||
impl ConnectionManagerState {
|
||||
pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
|
||||
pub(super) fn new(
|
||||
timeline: Arc<Timeline>,
|
||||
conf: WalReceiverConf,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
let id = TenantTimelineId {
|
||||
tenant_id: timeline.tenant_shard_id.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
@@ -410,6 +416,7 @@ impl ConnectionManagerState {
|
||||
Self {
|
||||
id,
|
||||
timeline,
|
||||
cancel,
|
||||
conf,
|
||||
wal_connection: None,
|
||||
wal_stream_candidates: HashMap::new(),
|
||||
@@ -417,6 +424,22 @@ impl ConnectionManagerState {
|
||||
}
|
||||
}
|
||||
|
||||
fn spawn<Fut>(
|
||||
&self,
|
||||
task: impl FnOnce(
|
||||
tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
|
||||
CancellationToken,
|
||||
) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
) -> TaskHandle<WalConnectionStatus>
|
||||
where
|
||||
Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
|
||||
{
|
||||
// TODO: get rid of TaskHandle
|
||||
super::TaskHandle::spawn(&self.cancel, task)
|
||||
}
|
||||
|
||||
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
|
||||
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
|
||||
WALRECEIVER_SWITCHES
|
||||
@@ -435,7 +458,7 @@ impl ConnectionManagerState {
|
||||
);
|
||||
|
||||
let span = info_span!("connection", %node_id);
|
||||
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
|
||||
let connection_handle = self.spawn(move |events_sender, cancellation| {
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -463,6 +486,12 @@ impl ConnectionManagerState {
|
||||
info!("walreceiver connection handling ended: {e}");
|
||||
Ok(())
|
||||
}
|
||||
WalReceiverError::ClosedGate => {
|
||||
info!(
|
||||
"walreceiver connection handling ended because of closed gate"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
WalReceiverError::Other(e) => {
|
||||
// give out an error to have task_mgr give it a really verbose logging
|
||||
if cancellation.is_cancelled() {
|
||||
@@ -1016,7 +1045,7 @@ mod tests {
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
connection_task: state.spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
@@ -1184,7 +1213,7 @@ mod tests {
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
connection_task: state.spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
@@ -1251,7 +1280,7 @@ mod tests {
|
||||
sk_id: NodeId(1),
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
connection_task: state.spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
@@ -1315,7 +1344,7 @@ mod tests {
|
||||
sk_id: NodeId(1),
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
|
||||
connection_task: state.spawn(move |_, _| async move { Ok(()) }),
|
||||
discovered_new_wal: Some(NewCommittedWAL {
|
||||
discovered_at: time_over_threshold,
|
||||
lsn: new_lsn,
|
||||
@@ -1371,6 +1400,7 @@ mod tests {
|
||||
timeline_id: TIMELINE_ID,
|
||||
},
|
||||
timeline,
|
||||
cancel: CancellationToken::new(),
|
||||
conf: WalReceiverConf {
|
||||
wal_connect_timeout: Duration::from_secs(1),
|
||||
lagging_wal_timeout: Duration::from_secs(1),
|
||||
@@ -1414,7 +1444,7 @@ mod tests {
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
connection_task: state.spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
|
||||
@@ -27,7 +27,6 @@ use super::TaskStateUpdate;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::WALRECEIVER_RUNTIME,
|
||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||
@@ -37,8 +36,8 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::{id::NodeId, lsn::Lsn};
|
||||
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
|
||||
|
||||
/// Status of the connection.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -68,6 +67,7 @@ pub(super) enum WalReceiverError {
|
||||
SuccessfulCompletion(String),
|
||||
/// Generic error
|
||||
Other(anyhow::Error),
|
||||
ClosedGate,
|
||||
}
|
||||
|
||||
impl From<tokio_postgres::Error> for WalReceiverError {
|
||||
@@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
) -> Result<(), WalReceiverError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// prevent timeline shutdown from finishing until we have exited
|
||||
let _guard = timeline.gate.enter().map_err(|e| match e {
|
||||
GateError::GateClosed => WalReceiverError::ClosedGate,
|
||||
})?;
|
||||
// This function spawns a side-car task (WalReceiverConnectionPoller).
|
||||
// Get its gate guard now as well.
|
||||
let poller_guard = timeline.gate.enter().map_err(|e| match e {
|
||||
GateError::GateClosed => WalReceiverError::ClosedGate,
|
||||
})?;
|
||||
|
||||
WALRECEIVER_STARTED_CONNECTIONS.inc();
|
||||
|
||||
// Connect to the database in replication mode.
|
||||
@@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
// so spawn it off to run on its own. It shouldn't outlive this function, but,
|
||||
// due to lack of async drop, we can't enforce that. However, we ensure that
|
||||
// 1. it is sensitive to `cancellation` and
|
||||
// 2. holds the Timeline gate open so that after timeline shutdown,
|
||||
// we know this task is gone.
|
||||
let _connection_ctx = ctx.detached_child(
|
||||
TaskKind::WalReceiverConnectionPoller,
|
||||
ctx.download_behavior(),
|
||||
);
|
||||
let connection_cancellation = cancellation.clone();
|
||||
task_mgr::spawn(
|
||||
WALRECEIVER_RUNTIME.handle(),
|
||||
TaskKind::WalReceiverConnectionPoller,
|
||||
Some(timeline.tenant_shard_id),
|
||||
Some(timeline.timeline_id),
|
||||
"walreceiver connection",
|
||||
false,
|
||||
WALRECEIVER_RUNTIME.spawn(
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
select! {
|
||||
connection_result = connection => match connection_result {
|
||||
Ok(()) => debug!("Walreceiver db connection closed"),
|
||||
@@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// with a similar error.
|
||||
},
|
||||
WalReceiverError::SuccessfulCompletion(_) => {}
|
||||
WalReceiverError::ClosedGate => {
|
||||
// doesn't happen at runtime
|
||||
}
|
||||
WalReceiverError::Other(err) => {
|
||||
warn!("Connection aborted: {err:#}")
|
||||
}
|
||||
@@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
},
|
||||
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
|
||||
}
|
||||
Ok(())
|
||||
drop(poller_guard);
|
||||
}
|
||||
// Enrich the log lines emitted by this closure with meaningful context.
|
||||
// TODO: technically, this task outlives the surrounding function, so, the
|
||||
@@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
trace!("received XLogData between {startlsn} and {endlsn}");
|
||||
|
||||
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
{
|
||||
@@ -416,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
// Send the replication feedback message.
|
||||
// Regular standby_status_update fields are put into this message.
|
||||
let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
|
||||
let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() {
|
||||
timeline
|
||||
.get_current_logical_size(
|
||||
crate::tenant::timeline::GetLogicalSizePriority::User,
|
||||
|
||||
@@ -61,7 +61,7 @@ pub struct VectoredRead {
|
||||
}
|
||||
|
||||
impl VectoredRead {
|
||||
fn size(&self) -> usize {
|
||||
pub fn size(&self) -> usize {
|
||||
(self.end - self.start) as usize
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
|
||||
.map_err(std::io::Error::from)
|
||||
.context("statvfs tenants directory")?;
|
||||
|
||||
let blocksz = statvfs.block_size();
|
||||
// https://unix.stackexchange.com/a/703650
|
||||
let blocksz = if statvfs.fragment_size() > 0 {
|
||||
statvfs.fragment_size()
|
||||
} else {
|
||||
statvfs.block_size()
|
||||
};
|
||||
|
||||
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
|
||||
let free = statvfs.blocks_available() as u64 * blocksz;
|
||||
let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get();
|
||||
|
||||
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
|
||||
let used = statvfs
|
||||
.blocks()
|
||||
// use blocks_free instead of available here to match df in case someone compares
|
||||
.saturating_sub(statvfs.blocks_free()) as u64
|
||||
* blocksz;
|
||||
|
||||
let captured_at = std::time::SystemTime::now();
|
||||
|
||||
let doc = PageserverUtilization {
|
||||
@@ -29,7 +41,7 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
|
||||
//
|
||||
// note that u64::MAX will be output as i64::MAX as u64, but that should not matter
|
||||
utilization_score: u64::MAX,
|
||||
captured_at,
|
||||
captured_at: utils::serde_system_time::SystemTime(captured_at),
|
||||
};
|
||||
|
||||
// TODO: make utilization_score into a metric
|
||||
|
||||
@@ -403,7 +403,7 @@ impl WalIngest {
|
||||
);
|
||||
|
||||
if !key_is_local {
|
||||
if self.shard.is_zero() {
|
||||
if self.shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
self.observe_decoded_block(modification, blk, ctx).await?;
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
/// Process lifecycle and abstracction for the IPC protocol.
|
||||
mod process;
|
||||
pub use process::Kind as ProcessKind;
|
||||
|
||||
/// Code to apply [`NeonWalRecord`]s.
|
||||
pub(crate) mod apply_neon;
|
||||
@@ -34,13 +35,14 @@ use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::key_to_rel_block;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::heavier_once_cell;
|
||||
|
||||
///
|
||||
/// This is the real implementation that uses a Postgres process to
|
||||
@@ -53,7 +55,19 @@ pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
|
||||
/// The current [`process::Process`] that is used by new redo requests.
|
||||
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
|
||||
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
|
||||
/// their process object; we use [`Arc::clone`] for that.
|
||||
/// This is primarily because earlier implementations that didn't use [`heavier_once_cell`]
|
||||
/// had that behavior; it's probably unnecessary.
|
||||
/// The only merit of it is that if one walredo process encounters an error,
|
||||
/// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`].
|
||||
/// and retry redo, thereby starting the new process, while other redo tasks might
|
||||
/// still be using the old redo process. But, those other tasks will most likely
|
||||
/// encounter an error as well, and errors are an unexpected condition anyway.
|
||||
/// So, probably we could get rid of the `Arc` in the future.
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -101,6 +115,7 @@ impl PostgresRedoManager {
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
)
|
||||
.await
|
||||
};
|
||||
img = Some(result?);
|
||||
|
||||
@@ -121,11 +136,12 @@ impl PostgresRedoManager {
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
Some(WalRedoManagerStatus {
|
||||
pub fn status(&self) -> WalRedoManagerStatus {
|
||||
WalRedoManagerStatus {
|
||||
last_redo_at: {
|
||||
let at = *self.last_redo_at.lock().unwrap();
|
||||
at.and_then(|at| {
|
||||
@@ -134,8 +150,14 @@ impl PostgresRedoManager {
|
||||
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
||||
})
|
||||
},
|
||||
pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
|
||||
})
|
||||
process: self
|
||||
.redo_process
|
||||
.get()
|
||||
.map(|p| WalRedoManagerProcessStatus {
|
||||
pid: p.id(),
|
||||
kind: std::borrow::Cow::Borrowed(p.kind().into()),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,7 +174,7 @@ impl PostgresRedoManager {
|
||||
tenant_shard_id,
|
||||
conf,
|
||||
last_redo_at: std::sync::Mutex::default(),
|
||||
redo_process: RwLock::new(None),
|
||||
redo_process: heavier_once_cell::OnceCell::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -164,8 +186,7 @@ impl PostgresRedoManager {
|
||||
if let Some(last_redo_at) = *g {
|
||||
if last_redo_at.elapsed() >= idle_timeout {
|
||||
drop(g);
|
||||
let mut guard = self.redo_process.write().unwrap();
|
||||
*guard = None;
|
||||
drop(self.redo_process.get().map(|guard| guard.take_and_deinit()));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -174,8 +195,11 @@ impl PostgresRedoManager {
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation safe.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn apply_batch_postgres(
|
||||
async fn apply_batch_postgres(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -191,40 +215,24 @@ impl PostgresRedoManager {
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
// launch the WAL redo process on first use
|
||||
let proc: Arc<process::WalRedoProcess> = {
|
||||
let proc_guard = self.redo_process.read().unwrap();
|
||||
match &*proc_guard {
|
||||
None => {
|
||||
// "upgrade" to write lock to launch the process
|
||||
drop(proc_guard);
|
||||
let mut proc_guard = self.redo_process.write().unwrap();
|
||||
match &*proc_guard {
|
||||
None => {
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
|
||||
.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
*proc_guard = Some(Arc::clone(&proc));
|
||||
proc
|
||||
}
|
||||
Some(proc) => Arc::clone(proc),
|
||||
}
|
||||
}
|
||||
Some(proc) => Arc::clone(proc),
|
||||
let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
|
||||
@@ -233,6 +241,7 @@ impl PostgresRedoManager {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let result = proc
|
||||
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
.context("apply_wal_records");
|
||||
|
||||
let duration = started_at.elapsed();
|
||||
@@ -272,34 +281,34 @@ impl PostgresRedoManager {
|
||||
n_attempts,
|
||||
e,
|
||||
);
|
||||
// Avoid concurrent callers hitting the same issue.
|
||||
// We can't prevent it from happening because we want to enable parallelism.
|
||||
{
|
||||
let mut guard = self.redo_process.write().unwrap();
|
||||
match &*guard {
|
||||
Some(current_field_value) => {
|
||||
if Arc::ptr_eq(current_field_value, &proc) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
*guard = None;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Another thread was faster to observe the error, and already took the process out of rotation.
|
||||
}
|
||||
}
|
||||
}
|
||||
// Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
|
||||
// Note that there may be other tasks concurrent with us that also hold `proc`.
|
||||
// We have to deal with that here.
|
||||
// Also read the doc comment on field `self.redo_process`.
|
||||
//
|
||||
// NB: there may still be other concurrent threads using `proc`.
|
||||
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
|
||||
// NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
|
||||
// holding the lock while waiting for the process to exit.
|
||||
// NB: the drop impl blocks the current threads with a wait() system call for
|
||||
// the child process. We dropped the `guard` above so that other threads aren't
|
||||
// affected. But, it's good that the current thread _does_ block to wait.
|
||||
// If we instead deferred the waiting into the background / to tokio, it could
|
||||
// happen that if walredo always fails immediately, we spawn processes faster
|
||||
//
|
||||
// NB: the drop impl blocks the dropping thread with a wait() system call for
|
||||
// the child process. In some ways the blocking is actually good: if we
|
||||
// deferred the waiting into the background / to tokio if we used `tokio::process`,
|
||||
// it could happen that if walredo always fails immediately, we spawn processes faster
|
||||
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
||||
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
||||
// This probably needs revisiting at some later point.
|
||||
match self.redo_process.get() {
|
||||
None => (),
|
||||
Some(guard) => {
|
||||
if Arc::ptr_eq(&proc, &*guard) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
guard.take_and_deinit();
|
||||
} else {
|
||||
// Another task already spawned another redo process (further up in this method)
|
||||
// and put it into `redo_process`. Do nothing, our view of the world is behind.
|
||||
}
|
||||
}
|
||||
}
|
||||
// The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
|
||||
drop(proc);
|
||||
} else if n_attempts != 0 {
|
||||
info!(n_attempts, "retried walredo succeeded");
|
||||
|
||||
@@ -1,186 +1,67 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::Bytes;
|
||||
use nix::poll::{PollFd, PollFlags};
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::os::fd::AsRawFd;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
io::{Read, Write},
|
||||
process::{ChildStdin, ChildStdout, Command, Stdio},
|
||||
sync::{Mutex, MutexGuard},
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, nonblock::set_nonblock};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
|
||||
mod no_leak_child;
|
||||
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
|
||||
mod protocol;
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
mod process_impl {
|
||||
pub(super) mod process_async;
|
||||
pub(super) mod process_std;
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: ChildStdin,
|
||||
n_requests: usize,
|
||||
#[derive(
|
||||
Clone,
|
||||
Copy,
|
||||
Debug,
|
||||
PartialEq,
|
||||
Eq,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
strum_macros::IntoStaticStr,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[repr(u8)]
|
||||
pub enum Kind {
|
||||
Sync,
|
||||
Async,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
pub(crate) enum Process {
|
||||
Sync(process_impl::process_std::WalRedoProcess),
|
||||
Async(process_impl::process_async::WalRedoProcess),
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
impl Process {
|
||||
#[inline(always)]
|
||||
pub fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
if let Err(e) = &res {
|
||||
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
|
||||
}
|
||||
res
|
||||
}};
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
}),
|
||||
stdout: Mutex::new(ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
Ok(match conf.walredo_process_kind {
|
||||
Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
pg_version,
|
||||
)?),
|
||||
Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
pg_version,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) fn apply_wal_records(
|
||||
#[inline(always)]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
@@ -188,221 +69,29 @@ impl WalRedoProcess {
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
let input = self.stdin.lock().unwrap();
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
match self {
|
||||
Process::Sync(p) => {
|
||||
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
fn apply_wal_records0(
|
||||
&self,
|
||||
writebuf: &[u8],
|
||||
input: MutexGuard<ProcessInput>,
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
|
||||
let mut nwrite = 0usize;
|
||||
|
||||
while nwrite < writebuf.len() {
|
||||
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
Process::Async(p) => {
|
||||
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
}
|
||||
|
||||
// If 'stdin' is writeable, do write.
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
}
|
||||
let request_no = proc.n_requests;
|
||||
proc.n_requests += 1;
|
||||
drop(proc);
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
while nresult < BLCKSZ.into() {
|
||||
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
match nix::poll::poll(
|
||||
&mut stdout_pollfds[..],
|
||||
wal_redo_timeout.as_millis() as i32,
|
||||
) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
match self {
|
||||
Process::Sync(p) => p.id(),
|
||||
Process::Async(p) => p.id(),
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
pub(crate) fn kind(&self) -> Kind {
|
||||
match self {
|
||||
Process::Sync(_) => Kind::Sync,
|
||||
Process::Async(_) => Kind::Async,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
374
pageserver/src/walredo/process/process_impl/process_async.rs
Normal file
374
pageserver/src/walredo/process/process_impl/process_async.rs
Normal file
@@ -0,0 +1,374 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::process::{no_leak_child, protocol},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
process::{Command, Stdio},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, poison::Poison};
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
|
||||
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: tokio::process::ChildStdin,
|
||||
n_requests: usize,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: tokio::process::ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
let stdin =
|
||||
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
|
||||
let stdout = tokio::process::ChildStdout::from_std(stdout)
|
||||
.context("convert to tokio::ChildStdout")?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdin",
|
||||
ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
},
|
||||
)),
|
||||
stdout: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdout",
|
||||
ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
/// Apply given WAL records ('records') over an old page image. Returns
|
||||
/// new page image.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation safe.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
base_img: &Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let Ok(res) =
|
||||
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
|
||||
else {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
};
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// When not polled to completion (e.g. because in `tokio::select!` another
|
||||
/// branch becomes ready before this future), concurrent and subsequent
|
||||
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
|
||||
/// Dispose of this process instance and create a new one.
|
||||
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
|
||||
let request_no = {
|
||||
let mut lock_guard = self.stdin.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let input = poison_guard.data_mut();
|
||||
input
|
||||
.stdin
|
||||
.write_all(writebuf)
|
||||
.await
|
||||
.context("write to walredo stdin")?;
|
||||
let request_no = input.n_requests;
|
||||
input.n_requests += 1;
|
||||
poison_guard.disarm();
|
||||
request_no
|
||||
};
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut lock_guard = self.stdout.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let output = poison_guard.data_mut();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
output
|
||||
.stdout
|
||||
.read_exact(&mut resultbuf)
|
||||
.await
|
||||
.context("read walredo stdout")?;
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
poison_guard.disarm();
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
use std::io::Write;
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
405
pageserver/src/walredo/process/process_impl/process_std.rs
Normal file
405
pageserver/src/walredo/process/process_impl/process_std.rs
Normal file
@@ -0,0 +1,405 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::process::{no_leak_child, protocol},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use nix::poll::{PollFd, PollFlags};
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::os::fd::AsRawFd;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
io::{Read, Write},
|
||||
process::{ChildStdin, ChildStdout, Command, Stdio},
|
||||
sync::{Mutex, MutexGuard},
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: ChildStdin,
|
||||
n_requests: usize,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
if let Err(e) = &res {
|
||||
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
|
||||
}
|
||||
res
|
||||
}};
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
}),
|
||||
stdout: Mutex::new(ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
base_img: &Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
let input = self.stdin.lock().unwrap();
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
fn apply_wal_records0(
|
||||
&self,
|
||||
writebuf: &[u8],
|
||||
input: MutexGuard<ProcessInput>,
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
|
||||
let mut nwrite = 0usize;
|
||||
|
||||
while nwrite < writebuf.len() {
|
||||
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If 'stdin' is writeable, do write.
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
}
|
||||
let request_no = proc.n_requests;
|
||||
proc.n_requests += 1;
|
||||
drop(proc);
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
while nresult < BLCKSZ.into() {
|
||||
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
match nix::poll::poll(
|
||||
&mut stdout_pollfds[..],
|
||||
wal_redo_timeout.as_millis() as i32,
|
||||
) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user