mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 03:50:37 +00:00
Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517
This commit is contained in:
@@ -12,6 +12,9 @@ testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "
|
||||
|
||||
fuzz-read-path = ["testing"]
|
||||
|
||||
# Enables benchmarking only APIs
|
||||
benchmarking = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
@@ -56,6 +59,7 @@ pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
postgres_ffi_types.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
@@ -126,6 +130,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "bench_ingest"
|
||||
harness = false
|
||||
required-features = ["benchmarking"]
|
||||
|
||||
[[bench]]
|
||||
name = "upload_queue"
|
||||
|
||||
@@ -1,23 +1,30 @@
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use criterion::{Criterion, criterion_group, criterion_main};
|
||||
use futures::stream::FuturesUnordered;
|
||||
use pageserver::config::PageServerConf;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::keyspace::KeySpace;
|
||||
use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::storage_layer::InMemoryLayer;
|
||||
use pageserver::tenant::storage_layer::IoConcurrency;
|
||||
use pageserver::tenant::storage_layer::{InMemoryLayer, ValuesReconstructState};
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver_api::config::GetVectoredConcurrentIo;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::virtual_file::IoMode;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::Gate;
|
||||
use wal_decoder::models::value::Value;
|
||||
use wal_decoder::serialized_batch::SerializedValueBatch;
|
||||
|
||||
// A very cheap hash for generating non-sequential keys.
|
||||
@@ -30,7 +37,7 @@ fn murmurhash32(mut h: u32) -> u32 {
|
||||
h
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize, Clone, Copy, Debug)]
|
||||
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
|
||||
enum KeyLayout {
|
||||
/// Sequential unique keys
|
||||
Sequential,
|
||||
@@ -40,19 +47,30 @@ enum KeyLayout {
|
||||
RandomReuse(u32),
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize, Clone, Copy, Debug)]
|
||||
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
|
||||
enum WriteDelta {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
|
||||
enum ConcurrentReads {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
async fn ingest(
|
||||
conf: &'static PageServerConf,
|
||||
put_size: usize,
|
||||
put_count: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
concurrent_reads: ConcurrentReads,
|
||||
) -> anyhow::Result<()> {
|
||||
if concurrent_reads == ConcurrentReads::Yes {
|
||||
assert_eq!(key_layout, KeyLayout::Sequential);
|
||||
}
|
||||
|
||||
let mut lsn = utils::lsn::Lsn(1000);
|
||||
let mut key = Key::from_i128(0x0);
|
||||
|
||||
@@ -68,16 +86,18 @@ async fn ingest(
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let layer = InMemoryLayer::create(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn,
|
||||
&gate,
|
||||
&cancel,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
let layer = Arc::new(
|
||||
InMemoryLayer::create(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn,
|
||||
&gate,
|
||||
&cancel,
|
||||
&ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
|
||||
let data_ser_size = data.serialized_size().unwrap() as usize;
|
||||
@@ -86,6 +106,61 @@ async fn ingest(
|
||||
pageserver::context::DownloadBehavior::Download,
|
||||
);
|
||||
|
||||
const READ_BATCH_SIZE: u32 = 32;
|
||||
let (tx, mut rx) = tokio::sync::watch::channel::<Option<Key>>(None);
|
||||
let reader_cancel = CancellationToken::new();
|
||||
let reader_handle = if concurrent_reads == ConcurrentReads::Yes {
|
||||
Some(tokio::task::spawn({
|
||||
let cancel = reader_cancel.clone();
|
||||
let layer = layer.clone();
|
||||
let ctx = ctx.attached_child();
|
||||
async move {
|
||||
let gate = Gate::default();
|
||||
let gate_guard = gate.enter().unwrap();
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
GetVectoredConcurrentIo::SidecarTask,
|
||||
gate_guard,
|
||||
);
|
||||
|
||||
rx.wait_for(|key| key.is_some()).await.unwrap();
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
let key = match *rx.borrow() {
|
||||
Some(some) => some,
|
||||
None => unreachable!(),
|
||||
};
|
||||
|
||||
let mut start_key = key;
|
||||
start_key.field6 = key.field6.saturating_sub(READ_BATCH_SIZE);
|
||||
let key_range = start_key..key.next();
|
||||
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(
|
||||
KeySpace::single(key_range),
|
||||
Lsn(1)..Lsn(u64::MAX),
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
|
||||
.into_values()
|
||||
.map(|state| state.sink_pending_ios())
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
while collect_futs.next().await.is_some() {}
|
||||
}
|
||||
|
||||
drop(io_concurrency);
|
||||
gate.close().await;
|
||||
}
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
const BATCH_SIZE: usize = 16;
|
||||
let mut batch = Vec::new();
|
||||
|
||||
@@ -113,19 +188,27 @@ async fn ingest(
|
||||
|
||||
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
|
||||
if batch.len() >= BATCH_SIZE {
|
||||
let last_key = Key::from_compact(batch.last().unwrap().0);
|
||||
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedValueBatch::from_values(this_batch);
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
|
||||
tx.send(Some(last_key)).unwrap();
|
||||
}
|
||||
}
|
||||
if !batch.is_empty() {
|
||||
let last_key = Key::from_compact(batch.last().unwrap().0);
|
||||
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedValueBatch::from_values(this_batch);
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
|
||||
tx.send(Some(last_key)).unwrap();
|
||||
}
|
||||
layer.freeze(lsn + 1).await;
|
||||
|
||||
if matches!(write_delta, WriteDelta::Yes) {
|
||||
if write_delta == WriteDelta::Yes {
|
||||
let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
|
||||
max_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
});
|
||||
@@ -136,6 +219,11 @@ async fn ingest(
|
||||
tokio::fs::remove_file(path).await?;
|
||||
}
|
||||
|
||||
reader_cancel.cancel();
|
||||
if let Some(handle) = reader_handle {
|
||||
handle.await.unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -147,6 +235,7 @@ fn ingest_main(
|
||||
put_count: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
concurrent_reads: ConcurrentReads,
|
||||
) {
|
||||
pageserver::virtual_file::set_io_mode(io_mode);
|
||||
|
||||
@@ -156,7 +245,15 @@ fn ingest_main(
|
||||
.unwrap();
|
||||
|
||||
runtime.block_on(async move {
|
||||
let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
|
||||
let r = ingest(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
key_layout,
|
||||
write_delta,
|
||||
concurrent_reads,
|
||||
)
|
||||
.await;
|
||||
if let Err(e) = r {
|
||||
panic!("{e:?}");
|
||||
}
|
||||
@@ -195,6 +292,7 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
key_size: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
concurrent_reads: ConcurrentReads,
|
||||
}
|
||||
#[derive(Clone)]
|
||||
struct HandPickedParameters {
|
||||
@@ -245,7 +343,7 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
];
|
||||
let exploded_parameters = {
|
||||
let mut out = Vec::new();
|
||||
for io_mode in IoMode::iter() {
|
||||
for concurrent_reads in [ConcurrentReads::Yes, ConcurrentReads::No] {
|
||||
for param in expect.clone() {
|
||||
let HandPickedParameters {
|
||||
volume_mib,
|
||||
@@ -253,12 +351,18 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
key_layout,
|
||||
write_delta,
|
||||
} = param;
|
||||
|
||||
if key_layout != KeyLayout::Sequential && concurrent_reads == ConcurrentReads::Yes {
|
||||
continue;
|
||||
}
|
||||
|
||||
out.push(ExplodedParameters {
|
||||
io_mode,
|
||||
io_mode: IoMode::DirectRw,
|
||||
volume_mib,
|
||||
key_size,
|
||||
key_layout,
|
||||
write_delta,
|
||||
concurrent_reads,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -272,9 +376,10 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
key_size,
|
||||
key_layout,
|
||||
write_delta,
|
||||
concurrent_reads,
|
||||
} = self;
|
||||
format!(
|
||||
"io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}"
|
||||
"io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?} concurrent_reads={concurrent_reads:?}"
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -287,12 +392,23 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
key_size,
|
||||
key_layout,
|
||||
write_delta,
|
||||
concurrent_reads,
|
||||
} = params;
|
||||
let put_count = volume_mib * 1024 * 1024 / key_size;
|
||||
group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64));
|
||||
group.sample_size(10);
|
||||
group.bench_function(id, |b| {
|
||||
b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta))
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
io_mode,
|
||||
key_size,
|
||||
put_count,
|
||||
key_layout,
|
||||
write_delta,
|
||||
concurrent_reads,
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -18,6 +18,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio.workspace = true
|
||||
postgres_versioninfo.workspace = true
|
||||
futures.workspace = true
|
||||
tokio-util.workspace = true
|
||||
anyhow.workspace = true
|
||||
|
||||
@@ -7,6 +7,7 @@ use detach_ancestor::AncestorDetached;
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use pageserver_api::models::*;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_versioninfo::PgMajorVersion;
|
||||
pub use reqwest::Body as ReqwestBody;
|
||||
use reqwest::{IntoUrl, Method, StatusCode, Url};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -508,11 +509,11 @@ impl Client {
|
||||
.expect("Cannot build URL");
|
||||
|
||||
path.query_pairs_mut()
|
||||
.append_pair("recurse", &format!("{}", recurse));
|
||||
.append_pair("recurse", &format!("{recurse}"));
|
||||
|
||||
if let Some(concurrency) = concurrency {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("concurrency", &format!("{}", concurrency));
|
||||
.append_pair("concurrency", &format!("{concurrency}"));
|
||||
}
|
||||
|
||||
self.request(Method::POST, path, ()).await.map(|_| ())
|
||||
@@ -745,9 +746,11 @@ impl Client {
|
||||
timeline_id: TimelineId,
|
||||
base_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
basebackup_tarball: ReqwestBody,
|
||||
) -> Result<()> {
|
||||
let pg_version = pg_version.major_version_num();
|
||||
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
|
||||
self.mgmt_api_endpoint,
|
||||
@@ -841,4 +844,13 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> {
|
||||
let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint);
|
||||
self.request(Method::POST, uri, spec)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::sync::{Arc, Mutex};
|
||||
|
||||
use futures::stream::{SplitSink, SplitStream};
|
||||
use futures::{SinkExt, StreamExt};
|
||||
use pageserver_api::models::{
|
||||
use pageserver_api::pagestream_api::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
|
||||
@@ -152,7 +152,7 @@ pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output:
|
||||
let key_diff = key_end - key_start;
|
||||
|
||||
if key_start >= key_end {
|
||||
panic!("Invalid key range {}-{}", key_start, key_end);
|
||||
panic!("Invalid key range {key_start}-{key_end}");
|
||||
}
|
||||
|
||||
let lsn_start = lsn_map.map(f.lsn_range.start);
|
||||
@@ -212,12 +212,12 @@ pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output:
|
||||
)?;
|
||||
writeln!(svg, "</line>")?;
|
||||
}
|
||||
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
||||
Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"),
|
||||
}
|
||||
files_seen.insert(f);
|
||||
}
|
||||
|
||||
writeln!(svg, "{}", EndSvg)?;
|
||||
writeln!(svg, "{EndSvg}")?;
|
||||
|
||||
let mut layer_events_str = String::new();
|
||||
let mut first = true;
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
//!
|
||||
//! # local timeline dir
|
||||
//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||
//! grep "__" | cargo run --release --bin pagectl draw-timeline > out.svg
|
||||
//!
|
||||
//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
|
||||
//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
|
||||
@@ -81,7 +81,11 @@ fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T
|
||||
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
||||
let split: Vec<&str> = name.split("__").collect();
|
||||
let keys: Vec<&str> = split[0].split('-').collect();
|
||||
let mut lsns: Vec<&str> = split[1].split('-').collect();
|
||||
|
||||
// Remove the temporary file extension, e.g., remove the `.d20a.___temp` part from the following filename:
|
||||
// 000000067F000040490000404A00441B0000-000000067F000040490000404A00441B4000__000043483A34CE00.d20a.___temp
|
||||
let lsns = split[1].split('.').collect::<Vec<&str>>()[0];
|
||||
let mut lsns: Vec<&str> = lsns.split('-').collect();
|
||||
|
||||
// The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001
|
||||
|
||||
@@ -224,7 +228,7 @@ pub fn main() -> Result<()> {
|
||||
let lsn_max = lsn_map.len();
|
||||
|
||||
if key_start >= key_end {
|
||||
panic!("Invalid key range {}-{}", key_start, key_end);
|
||||
panic!("Invalid key range {key_start}-{key_end}");
|
||||
}
|
||||
|
||||
let lsn_start = *lsn_map.get(&lsnr.start).unwrap();
|
||||
@@ -246,7 +250,7 @@ pub fn main() -> Result<()> {
|
||||
ymargin = 0.05;
|
||||
fill = Fill::Color(rgb(0, 0, 0));
|
||||
}
|
||||
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
||||
Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"),
|
||||
}
|
||||
|
||||
println!(
|
||||
@@ -283,10 +287,10 @@ pub fn main() -> Result<()> {
|
||||
);
|
||||
}
|
||||
|
||||
println!("{}", EndSvg);
|
||||
println!("{EndSvg}");
|
||||
|
||||
eprintln!("num_images: {}", num_images);
|
||||
eprintln!("num_deltas: {}", num_deltas);
|
||||
eprintln!("num_images: {num_images}");
|
||||
eprintln!("num_deltas: {num_deltas}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -372,7 +372,7 @@ impl<const N: usize> std::fmt::Debug for RelTagish<N> {
|
||||
f.write_char('/')?;
|
||||
}
|
||||
first = false;
|
||||
write!(f, "{}", x)
|
||||
write!(f, "{x}")
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -224,8 +224,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
}
|
||||
}
|
||||
println!(
|
||||
"Total delta layers {} image layers {} excess layers {}",
|
||||
total_delta_layers, total_image_layers, total_excess_layers
|
||||
"Total delta layers {total_delta_layers} image layers {total_image_layers} excess layers {total_excess_layers}"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use pageserver::{page_cache, virtual_file};
|
||||
use pageserver_api::key::Key;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::layer_map_analyzer::parse_filename;
|
||||
use crate::layer_map_analyzer::{LayerFile, parse_filename};
|
||||
|
||||
#[derive(Subcommand)]
|
||||
pub(crate) enum LayerCmd {
|
||||
@@ -38,6 +38,8 @@ pub(crate) enum LayerCmd {
|
||||
/// The id from list-layer command
|
||||
id: usize,
|
||||
},
|
||||
/// Dump all information of a layer file locally
|
||||
DumpLayerLocal { path: PathBuf },
|
||||
RewriteSummary {
|
||||
layer_file_path: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
@@ -131,15 +133,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
}
|
||||
|
||||
for (idx, layer_file) in to_print {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
print_layer_file(idx, &layer_file);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -159,16 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
let layer = layer?;
|
||||
if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
|
||||
if *id == idx {
|
||||
// TODO(chi): dedup code
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
print_layer_file(idx, &layer_file);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(layer.path(), &ctx).await?;
|
||||
@@ -183,6 +168,18 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::DumpLayerLocal { path } => {
|
||||
if let Ok(layer_file) = parse_filename(path.file_name().unwrap().to_str().unwrap()) {
|
||||
print_layer_file(0, &layer_file);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(path, &ctx).await?;
|
||||
} else {
|
||||
read_image_file(path, &ctx).await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::RewriteSummary {
|
||||
layer_file_path,
|
||||
new_tenant_id,
|
||||
@@ -247,3 +244,15 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn print_layer_file(idx: usize, layer_file: &LayerFile) {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -176,9 +176,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?;
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
|
||||
let cancel = CancellationToken::new();
|
||||
// Complexity limit: as we are running this command locally, we should have a lot of memory available, and we do not
|
||||
// need to limit the number of versions we are going to delete.
|
||||
storage
|
||||
.unwrap()
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel, None)
|
||||
.await?;
|
||||
}
|
||||
Commands::Key(dkc) => dkc.execute(),
|
||||
|
||||
@@ -5,11 +5,16 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
futures.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
prost.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tonic.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -102,12 +102,27 @@ message CheckRelExistsResponse {
|
||||
bool exists = 1;
|
||||
}
|
||||
|
||||
// Requests a base backup at a given LSN.
|
||||
// Requests a base backup.
|
||||
message GetBaseBackupRequest {
|
||||
// The LSN to fetch a base backup at.
|
||||
ReadLsn read_lsn = 1;
|
||||
// The LSN to fetch the base backup at. 0 or absent means the latest LSN known to the Pageserver.
|
||||
uint64 lsn = 1;
|
||||
// If true, logical replication slots will not be created.
|
||||
bool replica = 2;
|
||||
// If true, include relation files in the base backup. Mainly for debugging and tests.
|
||||
bool full = 3;
|
||||
// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
|
||||
// compression, so that we can cache compressed backups on the server.
|
||||
BaseBackupCompression compression = 4;
|
||||
}
|
||||
|
||||
// Base backup compression algorithms.
|
||||
enum BaseBackupCompression {
|
||||
// Unknown algorithm. Used when clients send an unsupported algorithm.
|
||||
BASE_BACKUP_COMPRESSION_UNKNOWN = 0;
|
||||
// No compression.
|
||||
BASE_BACKUP_COMPRESSION_NONE = 1;
|
||||
// GZIP compression.
|
||||
BASE_BACKUP_COMPRESSION_GZIP = 2;
|
||||
}
|
||||
|
||||
// Base backup response chunk, returned as an ordered stream.
|
||||
|
||||
199
pageserver/page_api/src/client.rs
Normal file
199
pageserver/page_api/src/client.rs
Normal file
@@ -0,0 +1,199 @@
|
||||
use std::convert::TryInto;
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::TryStreamExt;
|
||||
use futures::{Stream, StreamExt};
|
||||
use tonic::metadata::AsciiMetadataValue;
|
||||
use tonic::metadata::errors::InvalidMetadataValue;
|
||||
use tonic::transport::Channel;
|
||||
use tonic::{Request, Streaming};
|
||||
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::model;
|
||||
use crate::proto;
|
||||
|
||||
///
|
||||
/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
|
||||
/// headers are required at the pageserver.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
struct AuthInterceptor {
|
||||
tenant_id: AsciiMetadataValue,
|
||||
timeline_id: AsciiMetadataValue,
|
||||
shard_id: AsciiMetadataValue,
|
||||
auth_header: Option<AsciiMetadataValue>, // including "Bearer " prefix
|
||||
}
|
||||
|
||||
impl AuthInterceptor {
|
||||
fn new(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
auth_token: Option<String>,
|
||||
shard_id: ShardIndex,
|
||||
) -> Result<Self, InvalidMetadataValue> {
|
||||
let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
|
||||
let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
|
||||
let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
|
||||
|
||||
let auth_header: Option<AsciiMetadataValue> = match auth_token {
|
||||
Some(token) => Some(format!("Bearer {token}").try_into()?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
tenant_id: tenant_ascii,
|
||||
shard_id: shard_ascii,
|
||||
timeline_id: timeline_ascii,
|
||||
auth_header,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl tonic::service::Interceptor for AuthInterceptor {
|
||||
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
|
||||
req.metadata_mut()
|
||||
.insert("neon-tenant-id", self.tenant_id.clone());
|
||||
req.metadata_mut()
|
||||
.insert("neon-shard-id", self.shard_id.clone());
|
||||
req.metadata_mut()
|
||||
.insert("neon-timeline-id", self.timeline_id.clone());
|
||||
if let Some(auth_header) = &self.auth_header {
|
||||
req.metadata_mut()
|
||||
.insert("authorization", auth_header.clone());
|
||||
}
|
||||
Ok(req)
|
||||
}
|
||||
}
|
||||
#[derive(Clone)]
|
||||
pub struct Client {
|
||||
client: proto::PageServiceClient<
|
||||
tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
|
||||
into_endpoint: T,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
shard_id: ShardIndex,
|
||||
auth_header: Option<String>,
|
||||
compression: Option<tonic::codec::CompressionEncoding>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let endpoint: tonic::transport::Endpoint = into_endpoint
|
||||
.try_into()
|
||||
.map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
|
||||
let channel = endpoint.connect().await?;
|
||||
let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
|
||||
.map_err(|e| anyhow::anyhow!(e.to_string()))?;
|
||||
let mut client = proto::PageServiceClient::with_interceptor(channel, auth);
|
||||
|
||||
if let Some(compression) = compression {
|
||||
// TODO: benchmark this (including network latency).
|
||||
client = client
|
||||
.accept_compressed(compression)
|
||||
.send_compressed(compression);
|
||||
}
|
||||
|
||||
Ok(Self { client })
|
||||
}
|
||||
|
||||
/// Returns whether a relation exists.
|
||||
pub async fn check_rel_exists(
|
||||
&mut self,
|
||||
req: model::CheckRelExistsRequest,
|
||||
) -> Result<model::CheckRelExistsResponse, tonic::Status> {
|
||||
let proto_req = proto::CheckRelExistsRequest::from(req);
|
||||
|
||||
let response = self.client.check_rel_exists(proto_req).await?;
|
||||
|
||||
let proto_resp = response.into_inner();
|
||||
Ok(proto_resp.into())
|
||||
}
|
||||
|
||||
/// Fetches a base backup.
|
||||
pub async fn get_base_backup(
|
||||
&mut self,
|
||||
req: model::GetBaseBackupRequest,
|
||||
) -> Result<impl Stream<Item = Result<Bytes, tonic::Status>> + 'static, tonic::Status> {
|
||||
let proto_req = proto::GetBaseBackupRequest::from(req);
|
||||
|
||||
let response_stream: Streaming<proto::GetBaseBackupResponseChunk> =
|
||||
self.client.get_base_backup(proto_req).await?.into_inner();
|
||||
|
||||
// TODO: Consider dechunking internally
|
||||
let domain_stream = response_stream.map(|chunk_res| {
|
||||
chunk_res.and_then(|proto_chunk| {
|
||||
proto_chunk.try_into().map_err(|e| {
|
||||
tonic::Status::internal(format!("Failed to convert response chunk: {e}"))
|
||||
})
|
||||
})
|
||||
});
|
||||
|
||||
Ok(domain_stream)
|
||||
}
|
||||
|
||||
/// Returns the total size of a database, as # of bytes.
|
||||
pub async fn get_db_size(
|
||||
&mut self,
|
||||
req: model::GetDbSizeRequest,
|
||||
) -> Result<u64, tonic::Status> {
|
||||
let proto_req = proto::GetDbSizeRequest::from(req);
|
||||
|
||||
let response = self.client.get_db_size(proto_req).await?;
|
||||
Ok(response.into_inner().into())
|
||||
}
|
||||
|
||||
/// Fetches pages.
|
||||
///
|
||||
/// This is implemented as a bidirectional streaming RPC for performance.
|
||||
/// Per-request errors are often returned as status_code instead of errors,
|
||||
/// to avoid tearing down the entire stream via tonic::Status.
|
||||
pub async fn get_pages<ReqSt>(
|
||||
&mut self,
|
||||
inbound: ReqSt,
|
||||
) -> Result<
|
||||
impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
|
||||
tonic::Status,
|
||||
>
|
||||
where
|
||||
ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
|
||||
{
|
||||
let outbound_proto = inbound.map(|domain_req| domain_req.into());
|
||||
|
||||
let req_new = Request::new(outbound_proto);
|
||||
|
||||
let response_stream: Streaming<proto::GetPageResponse> =
|
||||
self.client.get_pages(req_new).await?.into_inner();
|
||||
|
||||
let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
|
||||
|
||||
Ok(domain_stream)
|
||||
}
|
||||
|
||||
/// Returns the size of a relation, as # of blocks.
|
||||
pub async fn get_rel_size(
|
||||
&mut self,
|
||||
req: model::GetRelSizeRequest,
|
||||
) -> Result<model::GetRelSizeResponse, tonic::Status> {
|
||||
let proto_req = proto::GetRelSizeRequest::from(req);
|
||||
let response = self.client.get_rel_size(proto_req).await?;
|
||||
let proto_resp = response.into_inner();
|
||||
Ok(proto_resp.into())
|
||||
}
|
||||
|
||||
/// Fetches an SLRU segment.
|
||||
pub async fn get_slru_segment(
|
||||
&mut self,
|
||||
req: model::GetSlruSegmentRequest,
|
||||
) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
|
||||
let proto_req = proto::GetSlruSegmentRequest::from(req);
|
||||
let response = self.client.get_slru_segment(proto_req).await?;
|
||||
Ok(response.into_inner().try_into()?)
|
||||
}
|
||||
}
|
||||
@@ -18,6 +18,8 @@ pub mod proto {
|
||||
pub use page_service_server::{PageService, PageServiceServer};
|
||||
}
|
||||
|
||||
mod client;
|
||||
pub use client::Client;
|
||||
mod model;
|
||||
|
||||
pub use model::*;
|
||||
|
||||
@@ -26,7 +26,7 @@ use utils::lsn::Lsn;
|
||||
use crate::proto;
|
||||
|
||||
/// A protocol error. Typically returned via try_from() or try_into().
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[derive(thiserror::Error, Clone, Debug)]
|
||||
pub enum ProtocolError {
|
||||
#[error("field '{0}' has invalid value '{1}'")]
|
||||
Invalid(&'static str, String),
|
||||
@@ -182,13 +182,18 @@ impl From<CheckRelExistsResponse> for proto::CheckRelExistsResponse {
|
||||
}
|
||||
}
|
||||
|
||||
/// Requests a base backup at a given LSN.
|
||||
/// Requests a base backup.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct GetBaseBackupRequest {
|
||||
/// The LSN to fetch a base backup at.
|
||||
pub read_lsn: ReadLsn,
|
||||
/// The LSN to fetch a base backup at. If None, uses the latest LSN known to the Pageserver.
|
||||
pub lsn: Option<Lsn>,
|
||||
/// If true, logical replication slots will not be created.
|
||||
pub replica: bool,
|
||||
/// If true, include relation files in the base backup. Mainly for debugging and tests.
|
||||
pub full: bool,
|
||||
/// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
|
||||
/// compression, so that we can cache compressed backups on the server.
|
||||
pub compression: BaseBackupCompression,
|
||||
}
|
||||
|
||||
impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
@@ -196,11 +201,10 @@ impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
|
||||
fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
read_lsn: pb
|
||||
.read_lsn
|
||||
.ok_or(ProtocolError::Missing("read_lsn"))?
|
||||
.try_into()?,
|
||||
lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
|
||||
replica: pb.replica,
|
||||
full: pb.full,
|
||||
compression: pb.compression.try_into()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -208,12 +212,58 @@ impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
|
||||
fn from(request: GetBaseBackupRequest) -> Self {
|
||||
Self {
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
lsn: request.lsn.unwrap_or_default().0,
|
||||
replica: request.replica,
|
||||
full: request.full,
|
||||
compression: request.compression.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Base backup compression algorithm.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum BaseBackupCompression {
|
||||
None,
|
||||
Gzip,
|
||||
}
|
||||
|
||||
impl TryFrom<proto::BaseBackupCompression> for BaseBackupCompression {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::BaseBackupCompression) -> Result<Self, Self::Error> {
|
||||
match pb {
|
||||
proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)),
|
||||
proto::BaseBackupCompression::None => Ok(Self::None),
|
||||
proto::BaseBackupCompression::Gzip => Ok(Self::Gzip),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<i32> for BaseBackupCompression {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(compression: i32) -> Result<Self, Self::Error> {
|
||||
proto::BaseBackupCompression::try_from(compression)
|
||||
.map_err(|_| ProtocolError::invalid("compression", compression))
|
||||
.and_then(Self::try_from)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BaseBackupCompression> for proto::BaseBackupCompression {
|
||||
fn from(compression: BaseBackupCompression) -> Self {
|
||||
match compression {
|
||||
BaseBackupCompression::None => Self::None,
|
||||
BaseBackupCompression::Gzip => Self::Gzip,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BaseBackupCompression> for i32 {
|
||||
fn from(compression: BaseBackupCompression) -> Self {
|
||||
proto::BaseBackupCompression::from(compression).into()
|
||||
}
|
||||
}
|
||||
|
||||
pub type GetBaseBackupResponseChunk = Bytes;
|
||||
|
||||
impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
|
||||
@@ -422,12 +472,45 @@ impl From<GetPageResponse> for proto::GetPageResponse {
|
||||
}
|
||||
}
|
||||
|
||||
impl GetPageResponse {
|
||||
/// Attempts to represent a tonic::Status as a GetPageResponse if appropriate. Returning a
|
||||
/// tonic::Status will terminate the GetPage stream, so per-request errors are emitted as a
|
||||
/// GetPageResponse with a non-OK status code instead.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn try_from_status(
|
||||
status: tonic::Status,
|
||||
request_id: RequestID,
|
||||
) -> Result<Self, tonic::Status> {
|
||||
// We shouldn't see an OK status here, because we're emitting an error.
|
||||
debug_assert_ne!(status.code(), tonic::Code::Ok);
|
||||
if status.code() == tonic::Code::Ok {
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"unexpected OK status: {status:?}",
|
||||
)));
|
||||
}
|
||||
|
||||
// If we can't convert the tonic::Code to a GetPageStatusCode, this is not a per-request
|
||||
// error and we should return a tonic::Status to terminate the stream.
|
||||
let Ok(status_code) = status.code().try_into() else {
|
||||
return Err(status);
|
||||
};
|
||||
|
||||
// Return a GetPageResponse for the status.
|
||||
Ok(Self {
|
||||
request_id,
|
||||
status_code,
|
||||
reason: Some(status.message().to_string()),
|
||||
page_images: Vec::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPage response status code.
|
||||
///
|
||||
/// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
|
||||
/// (potentially shared by many backends), and a gRPC status response would terminate the stream so
|
||||
/// we send GetPageResponse messages with these codes instead.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Copy, Debug, PartialEq, strum_macros::Display)]
|
||||
pub enum GetPageStatusCode {
|
||||
/// Unknown status. For forwards compatibility: used when an older client version receives a new
|
||||
/// status code from a newer server version.
|
||||
@@ -485,8 +568,42 @@ impl From<GetPageStatusCode> for i32 {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<tonic::Code> for GetPageStatusCode {
|
||||
type Error = tonic::Code;
|
||||
|
||||
fn try_from(code: tonic::Code) -> Result<Self, Self::Error> {
|
||||
use tonic::Code;
|
||||
|
||||
let status_code = match code {
|
||||
Code::Ok => Self::Ok,
|
||||
|
||||
// These are per-request errors, which should be returned as GetPageResponses.
|
||||
Code::AlreadyExists => Self::InvalidRequest,
|
||||
Code::DataLoss => Self::InternalError,
|
||||
Code::FailedPrecondition => Self::InvalidRequest,
|
||||
Code::InvalidArgument => Self::InvalidRequest,
|
||||
Code::Internal => Self::InternalError,
|
||||
Code::NotFound => Self::NotFound,
|
||||
Code::OutOfRange => Self::InvalidRequest,
|
||||
Code::ResourceExhausted => Self::SlowDown,
|
||||
|
||||
// These should terminate the stream by returning a tonic::Status.
|
||||
Code::Aborted
|
||||
| Code::Cancelled
|
||||
| Code::DeadlineExceeded
|
||||
| Code::PermissionDenied
|
||||
| Code::Unauthenticated
|
||||
| Code::Unavailable
|
||||
| Code::Unimplemented
|
||||
| Code::Unknown => return Err(code),
|
||||
};
|
||||
Ok(status_code)
|
||||
}
|
||||
}
|
||||
|
||||
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
|
||||
// shards will error.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct GetRelSizeRequest {
|
||||
pub read_lsn: ReadLsn,
|
||||
pub rel: RelTag,
|
||||
@@ -530,6 +647,7 @@ impl From<GetRelSizeResponse> for proto::GetRelSizeResponse {
|
||||
}
|
||||
|
||||
/// Requests an SLRU segment. Only valid on shard 0, other shards will error.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct GetSlruSegmentRequest {
|
||||
pub read_lsn: ReadLsn,
|
||||
pub kind: SlruKind,
|
||||
|
||||
@@ -25,6 +25,7 @@ tokio.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tonic.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
pageserver_client.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
|
||||
@@ -62,7 +62,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
println!("operating on timeline {}", timeline);
|
||||
println!("operating on timeline {timeline}");
|
||||
|
||||
mgmt_api_client
|
||||
.set_tenant_config(&TenantConfigRequest {
|
||||
@@ -75,8 +75,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let items = (0..100)
|
||||
.map(|id| {
|
||||
(
|
||||
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
|
||||
format!("{:08}", id),
|
||||
format!("pg_logical/mappings/{batch:03}.{id:03}"),
|
||||
format!("{id:08}"),
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
@@ -1,20 +1,29 @@
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::anyhow;
|
||||
use futures::TryStreamExt as _;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
use pageserver_page_api as page_api;
|
||||
use rand::prelude::*;
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::compat::{TokioAsyncReadCompatExt as _, TokioAsyncWriteCompatExt as _};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tonic::async_trait;
|
||||
use tracing::{info, instrument};
|
||||
use url::Url;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
@@ -24,14 +33,15 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
/// The Pageserver to connect to. Use postgresql:// for libpq, or grpc:// for gRPC.
|
||||
#[clap(long, default_value = "postgresql://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long, default_value = "1.0")]
|
||||
gzip_probability: f64,
|
||||
#[clap(long)]
|
||||
no_compression: bool,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long)]
|
||||
@@ -146,12 +156,27 @@ async fn main_impl(
|
||||
|
||||
let mut work_senders = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let scheme = match Url::parse(&args.page_service_connstring) {
|
||||
Ok(url) => url.scheme().to_lowercase().to_string(),
|
||||
Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
|
||||
Err(err) => return Err(anyhow!("invalid connstring: {err}")),
|
||||
};
|
||||
for &tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
|
||||
let client: Box<dyn Client> = match scheme.as_str() {
|
||||
"postgresql" | "postgres" => Box::new(
|
||||
LibpqClient::new(&args.page_service_connstring, tl, !args.no_compression).await?,
|
||||
),
|
||||
"grpc" => Box::new(
|
||||
GrpcClient::new(&args.page_service_connstring, tl, !args.no_compression).await?,
|
||||
),
|
||||
scheme => return Err(anyhow!("invalid scheme {scheme}")),
|
||||
};
|
||||
|
||||
tasks.push(tokio::spawn(run_worker(
|
||||
client,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
@@ -166,13 +191,7 @@ async fn main_impl(
|
||||
let mut rng = rand::thread_rng();
|
||||
let target = all_targets.choose(&mut rng).unwrap();
|
||||
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
|
||||
(
|
||||
target.timeline,
|
||||
Work {
|
||||
lsn,
|
||||
gzip: rng.gen_bool(args.gzip_probability),
|
||||
},
|
||||
)
|
||||
(target.timeline, Work { lsn })
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
@@ -216,13 +235,11 @@ async fn main_impl(
|
||||
#[derive(Copy, Clone)]
|
||||
struct Work {
|
||||
lsn: Option<Lsn>,
|
||||
gzip: bool,
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
async fn run_worker(
|
||||
mut client: Box<dyn Client>,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<Work>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
@@ -230,37 +247,14 @@ async fn client(
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
while let Some(Work { lsn }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
let copy_out_stream = client
|
||||
.basebackup(&BasebackupRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
})
|
||||
.await
|
||||
.with_context(|| format!("start basebackup for {timeline}"))
|
||||
.unwrap();
|
||||
let stream = client.basebackup(lsn).await.unwrap();
|
||||
|
||||
use futures::StreamExt;
|
||||
let size = Arc::new(AtomicUsize::new(0));
|
||||
copy_out_stream
|
||||
.for_each({
|
||||
|r| {
|
||||
let size = Arc::clone(&size);
|
||||
async move {
|
||||
let size = Arc::clone(&size);
|
||||
size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
let size = futures::io::copy(stream.compat(), &mut tokio::io::sink().compat_write())
|
||||
.await
|
||||
.unwrap();
|
||||
info!("basebackup size is {size} bytes");
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
@@ -270,3 +264,100 @@ async fn client(
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
|
||||
/// A basebackup client. This allows switching out the client protocol implementation.
|
||||
#[async_trait]
|
||||
trait Client: Send {
|
||||
async fn basebackup(
|
||||
&mut self,
|
||||
lsn: Option<Lsn>,
|
||||
) -> anyhow::Result<Pin<Box<dyn AsyncRead + Send>>>;
|
||||
}
|
||||
|
||||
/// A libpq-based Pageserver client.
|
||||
struct LibpqClient {
|
||||
inner: pageserver_client::page_service::Client,
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
}
|
||||
|
||||
impl LibpqClient {
|
||||
async fn new(
|
||||
connstring: &str,
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: pageserver_client::page_service::Client::new(connstring.to_string()).await?,
|
||||
ttid,
|
||||
compression,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Client for LibpqClient {
|
||||
async fn basebackup(
|
||||
&mut self,
|
||||
lsn: Option<Lsn>,
|
||||
) -> anyhow::Result<Pin<Box<dyn AsyncRead + Send + 'static>>> {
|
||||
let req = BasebackupRequest {
|
||||
tenant_id: self.ttid.tenant_id,
|
||||
timeline_id: self.ttid.timeline_id,
|
||||
lsn,
|
||||
gzip: self.compression,
|
||||
};
|
||||
let stream = self.inner.basebackup(&req).await?;
|
||||
Ok(Box::pin(StreamReader::new(
|
||||
stream.map_err(std::io::Error::other),
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
/// A gRPC Pageserver client.
|
||||
struct GrpcClient {
|
||||
inner: page_api::Client,
|
||||
compression: page_api::BaseBackupCompression,
|
||||
}
|
||||
|
||||
impl GrpcClient {
|
||||
async fn new(
|
||||
connstring: &str,
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let inner = page_api::Client::new(
|
||||
connstring.to_string(),
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
ShardIndex::unsharded(),
|
||||
None,
|
||||
None, // NB: uses payload compression
|
||||
)
|
||||
.await?;
|
||||
let compression = match compression {
|
||||
true => page_api::BaseBackupCompression::Gzip,
|
||||
false => page_api::BaseBackupCompression::None,
|
||||
};
|
||||
Ok(Self { inner, compression })
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Client for GrpcClient {
|
||||
async fn basebackup(
|
||||
&mut self,
|
||||
lsn: Option<Lsn>,
|
||||
) -> anyhow::Result<Pin<Box<dyn AsyncRead + Send + 'static>>> {
|
||||
let req = page_api::GetBaseBackupRequest {
|
||||
lsn,
|
||||
replica: false,
|
||||
full: false,
|
||||
compression: self.compression,
|
||||
};
|
||||
let stream = self.inner.get_base_backup(req).await?;
|
||||
Ok(Box::pin(StreamReader::new(
|
||||
stream.map_err(std::io::Error::other),
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,33 +10,31 @@ use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::{Stream, StreamExt as _};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_page_api::proto;
|
||||
use pageserver_page_api as page_api;
|
||||
use rand::prelude::*;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use url::Url;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
|
||||
#[derive(clap::ValueEnum, Clone, Debug)]
|
||||
enum Protocol {
|
||||
Libpq,
|
||||
Grpc,
|
||||
}
|
||||
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
@@ -45,8 +43,9 @@ pub(crate) struct Args {
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long, value_enum, default_value = "libpq")]
|
||||
protocol: Protocol,
|
||||
/// If true, enable compression (only for gRPC).
|
||||
#[clap(long)]
|
||||
compression: bool,
|
||||
/// Each client sends requests at the given rate.
|
||||
///
|
||||
/// If a request takes too long and we should be issuing a new request already,
|
||||
@@ -325,18 +324,32 @@ async fn main_impl(
|
||||
.unwrap();
|
||||
|
||||
Box::pin(async move {
|
||||
let client: Box<dyn Client> = match args.protocol {
|
||||
Protocol::Libpq => Box::new(
|
||||
LibpqClient::new(args.page_service_connstring.clone(), worker_id.timeline)
|
||||
.await
|
||||
.unwrap(),
|
||||
let scheme = match Url::parse(&args.page_service_connstring) {
|
||||
Ok(url) => url.scheme().to_lowercase().to_string(),
|
||||
Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
|
||||
Err(err) => panic!("invalid connstring: {err}"),
|
||||
};
|
||||
let client: Box<dyn Client> = match scheme.as_str() {
|
||||
"postgresql" | "postgres" => {
|
||||
assert!(!args.compression, "libpq does not support compression");
|
||||
Box::new(
|
||||
LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
|
||||
.await
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
"grpc" => Box::new(
|
||||
GrpcClient::new(
|
||||
&args.page_service_connstring,
|
||||
worker_id.timeline,
|
||||
args.compression,
|
||||
)
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
|
||||
Protocol::Grpc => Box::new(
|
||||
GrpcClient::new(args.page_service_connstring.clone(), worker_id.timeline)
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
scheme => panic!("unsupported scheme {scheme}"),
|
||||
};
|
||||
run_worker(args, client, ss, cancel, rps_period, ranges, weights).await
|
||||
})
|
||||
@@ -543,8 +556,8 @@ struct LibpqClient {
|
||||
}
|
||||
|
||||
impl LibpqClient {
|
||||
async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
|
||||
let inner = pageserver_client::page_service::Client::new(connstring)
|
||||
async fn new(connstring: &str, ttid: TenantTimelineId) -> anyhow::Result<Self> {
|
||||
let inner = pageserver_client::page_service::Client::new(connstring.to_string())
|
||||
.await?
|
||||
.pagestream(ttid.tenant_id, ttid.timeline_id)
|
||||
.await?;
|
||||
@@ -600,34 +613,36 @@ impl Client for LibpqClient {
|
||||
}
|
||||
}
|
||||
|
||||
/// A gRPC client using the raw, no-frills gRPC client.
|
||||
/// A gRPC Pageserver client.
|
||||
struct GrpcClient {
|
||||
req_tx: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
|
||||
resp_rx: tonic::Streaming<proto::GetPageResponse>,
|
||||
req_tx: tokio::sync::mpsc::Sender<page_api::GetPageRequest>,
|
||||
resp_rx: Pin<Box<dyn Stream<Item = Result<page_api::GetPageResponse, tonic::Status>> + Send>>,
|
||||
}
|
||||
|
||||
impl GrpcClient {
|
||||
async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
|
||||
let mut client = pageserver_page_api::proto::PageServiceClient::connect(connstring).await?;
|
||||
async fn new(
|
||||
connstring: &str,
|
||||
ttid: TenantTimelineId,
|
||||
compression: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut client = page_api::Client::new(
|
||||
connstring.to_string(),
|
||||
ttid.tenant_id,
|
||||
ttid.timeline_id,
|
||||
ShardIndex::unsharded(),
|
||||
None,
|
||||
compression.then_some(tonic::codec::CompressionEncoding::Zstd),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// The channel has a buffer size of 1, since 0 is not allowed. It does not matter, since the
|
||||
// benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are
|
||||
// buffered by Tonic and the OS too.
|
||||
let (req_tx, req_rx) = tokio::sync::mpsc::channel(1);
|
||||
let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
|
||||
let mut req = tonic::Request::new(req_stream);
|
||||
let metadata = req.metadata_mut();
|
||||
metadata.insert("neon-tenant-id", ttid.tenant_id.to_string().try_into()?);
|
||||
metadata.insert("neon-timeline-id", ttid.timeline_id.to_string().try_into()?);
|
||||
metadata.insert("neon-shard-id", "0000".try_into()?);
|
||||
let resp_rx = Box::pin(client.get_pages(req_stream).await?);
|
||||
|
||||
let resp = client.get_pages(req).await?;
|
||||
let resp_stream = resp.into_inner();
|
||||
|
||||
Ok(Self {
|
||||
req_tx,
|
||||
resp_rx: resp_stream,
|
||||
})
|
||||
Ok(Self { req_tx, resp_rx })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -641,27 +656,27 @@ impl Client for GrpcClient {
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
let req = proto::GetPageRequest {
|
||||
let req = page_api::GetPageRequest {
|
||||
request_id: req_id,
|
||||
request_class: proto::GetPageClass::Normal as i32,
|
||||
read_lsn: Some(proto::ReadLsn {
|
||||
request_lsn: req_lsn.0,
|
||||
not_modified_since_lsn: mod_lsn.0,
|
||||
}),
|
||||
rel: Some(rel.into()),
|
||||
block_number: blks,
|
||||
request_class: page_api::GetPageClass::Normal,
|
||||
read_lsn: page_api::ReadLsn {
|
||||
request_lsn: req_lsn,
|
||||
not_modified_since_lsn: Some(mod_lsn),
|
||||
},
|
||||
rel,
|
||||
block_numbers: blks,
|
||||
};
|
||||
self.req_tx.send(req).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
|
||||
let resp = self.resp_rx.message().await?.unwrap();
|
||||
let resp = self.resp_rx.next().await.unwrap().unwrap();
|
||||
anyhow::ensure!(
|
||||
resp.status_code == proto::GetPageStatusCode::Ok as i32,
|
||||
resp.status_code == page_api::GetPageStatusCode::Ok,
|
||||
"unexpected status code: {}",
|
||||
resp.status_code
|
||||
resp.status_code,
|
||||
);
|
||||
Ok((resp.request_id, resp.page_image))
|
||||
Ok((resp.request_id, resp.page_images))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,19 +14,19 @@ use std::fmt::Write as FmtWrite;
|
||||
use std::time::{Instant, SystemTime};
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{Key, rel_block_to_key};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants::{
|
||||
DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID, PG_HBA, PGDATA_SPECIAL_FILES,
|
||||
};
|
||||
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
|
||||
use postgres_ffi::pg_constants::{PG_HBA, PGDATA_SPECIAL_FILES};
|
||||
use postgres_ffi::{
|
||||
BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants,
|
||||
BLCKSZ, PG_TLI, PgMajorVersion, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName,
|
||||
dispatch_pgversion, pg_constants,
|
||||
};
|
||||
use tokio::io;
|
||||
use tokio::io::AsyncWrite;
|
||||
use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM};
|
||||
use tokio::io::{self, AsyncWrite, AsyncWriteExt as _};
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -97,6 +97,7 @@ impl From<BasebackupError> for tonic::Status {
|
||||
/// * When working without safekeepers. In this situation it is important to match the lsn
|
||||
/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
||||
/// to start the replication.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn send_basebackup_tarball<'a, W>(
|
||||
write: &'a mut W,
|
||||
timeline: &'a Timeline,
|
||||
@@ -104,6 +105,7 @@ pub async fn send_basebackup_tarball<'a, W>(
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
replica: bool,
|
||||
gzip_level: Option<async_compression::Level>,
|
||||
ctx: &'a RequestContext,
|
||||
) -> Result<(), BasebackupError>
|
||||
where
|
||||
@@ -122,7 +124,7 @@ where
|
||||
// prev_lsn value; that happens if the timeline was just branched from
|
||||
// an old LSN and it doesn't have any WAL of its own yet. We will set
|
||||
// prev_lsn to Lsn(0) if we cannot provide the correct value.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn {
|
||||
// Backup was requested at a particular LSN. The caller should've
|
||||
// already checked that it's a valid LSN.
|
||||
|
||||
@@ -143,7 +145,7 @@ where
|
||||
};
|
||||
|
||||
// Consolidate the derived and the provided prev_lsn values
|
||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
|
||||
return Err(BasebackupError::Server(anyhow!(
|
||||
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
|
||||
@@ -155,30 +157,55 @@ where
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
|
||||
backup_lsn, prev_lsn, full_backup, replica
|
||||
"taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \
|
||||
(full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})",
|
||||
);
|
||||
let span = info_span!("send_tarball", backup_lsn=%lsn);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
timeline.conf.get_vectored_concurrent_io,
|
||||
timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| BasebackupError::Shutdown)?,
|
||||
);
|
||||
|
||||
let basebackup = Basebackup {
|
||||
ar: Builder::new_non_terminated(write),
|
||||
timeline,
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
io_concurrency: IoConcurrency::spawn_from_conf(
|
||||
timeline.conf.get_vectored_concurrent_io,
|
||||
timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| BasebackupError::Shutdown)?,
|
||||
),
|
||||
};
|
||||
basebackup
|
||||
if let Some(gzip_level) = gzip_level {
|
||||
let mut encoder = GzipEncoder::with_quality(write, gzip_level);
|
||||
Basebackup {
|
||||
ar: Builder::new_non_terminated(&mut encoder),
|
||||
timeline,
|
||||
lsn,
|
||||
prev_record_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
io_concurrency,
|
||||
}
|
||||
.send_tarball()
|
||||
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
|
||||
.await
|
||||
.instrument(span)
|
||||
.await?;
|
||||
encoder
|
||||
.shutdown()
|
||||
.await
|
||||
.map_err(|err| BasebackupError::Client(err, "gzip"))?;
|
||||
} else {
|
||||
Basebackup {
|
||||
ar: Builder::new_non_terminated(write),
|
||||
timeline,
|
||||
lsn,
|
||||
prev_record_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
io_concurrency,
|
||||
}
|
||||
.send_tarball()
|
||||
.instrument(span)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
@@ -372,6 +399,7 @@ where
|
||||
.partition(
|
||||
self.timeline.get_shard_identity(),
|
||||
self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
|
||||
BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
@@ -619,10 +647,7 @@ where
|
||||
};
|
||||
|
||||
if spcnode == GLOBALTABLESPACE_OID {
|
||||
let pg_version_str = match self.timeline.pg_version {
|
||||
14 | 15 => self.timeline.pg_version.to_string(),
|
||||
ver => format!("{ver}\x0A"),
|
||||
};
|
||||
let pg_version_str = self.timeline.pg_version.versionfile_string();
|
||||
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, pg_version_str.as_bytes())
|
||||
@@ -669,7 +694,7 @@ where
|
||||
}
|
||||
|
||||
// Append dir path for each database
|
||||
let path = format!("base/{}", dbnode);
|
||||
let path = format!("base/{dbnode}");
|
||||
let header = new_tar_header_dir(&path)?;
|
||||
self.ar
|
||||
.append(&header, io::empty())
|
||||
@@ -677,19 +702,16 @@ where
|
||||
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?;
|
||||
|
||||
if let Some(img) = relmap_img {
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
let dst_path = format!("base/{dbnode}/PG_VERSION");
|
||||
|
||||
let pg_version_str = match self.timeline.pg_version {
|
||||
14 | 15 => self.timeline.pg_version.to_string(),
|
||||
ver => format!("{ver}\x0A"),
|
||||
};
|
||||
let pg_version_str = self.timeline.pg_version.versionfile_string();
|
||||
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, pg_version_str.as_bytes())
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?;
|
||||
|
||||
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
|
||||
let relmap_path = format!("base/{dbnode}/pg_filenode.map");
|
||||
let header = new_tar_header(&relmap_path, img.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &img[..])
|
||||
@@ -713,10 +735,10 @@ where
|
||||
buf.extend_from_slice(&img[..]);
|
||||
let crc = crc32c::crc32c(&img[..]);
|
||||
buf.put_u32_le(crc);
|
||||
let path = if self.timeline.pg_version < 17 {
|
||||
format!("pg_twophase/{:>08X}", xid)
|
||||
let path = if self.timeline.pg_version < PgMajorVersion::PG17 {
|
||||
format!("pg_twophase/{xid:>08X}")
|
||||
} else {
|
||||
format!("pg_twophase/{:>016X}", xid)
|
||||
format!("pg_twophase/{xid:>016X}")
|
||||
};
|
||||
let header = new_tar_header(&path, buf.len() as u64)?;
|
||||
self.ar
|
||||
@@ -768,7 +790,7 @@ where
|
||||
//send wal segment
|
||||
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let wal_file_path = format!("pg_wal/{wal_file_name}");
|
||||
let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;
|
||||
|
||||
let wal_seg = postgres_ffi::generate_wal_segment(
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
|
||||
use tokio::{
|
||||
io::{AsyncWriteExt, BufWriter},
|
||||
sync::mpsc::{UnboundedReceiver, UnboundedSender},
|
||||
sync::mpsc::{Receiver, Sender, error::TrySendError},
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
@@ -18,7 +18,10 @@ use utils::{
|
||||
use crate::{
|
||||
basebackup::send_basebackup_tarball,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ},
|
||||
metrics::{
|
||||
BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE,
|
||||
BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE,
|
||||
},
|
||||
task_mgr::TaskKind,
|
||||
tenant::{
|
||||
Timeline,
|
||||
@@ -32,11 +35,16 @@ pub struct BasebackupPrepareRequest {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
|
||||
pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
|
||||
pub type BasebackupPrepareSender = Sender<BasebackupPrepareRequest>;
|
||||
pub type BasebackupPrepareReceiver = Receiver<BasebackupPrepareRequest>;
|
||||
|
||||
type BasebackupRemoveEntrySender = UnboundedSender<Utf8PathBuf>;
|
||||
type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
|
||||
#[derive(Clone)]
|
||||
struct CacheEntry {
|
||||
/// LSN at which the basebackup was taken.
|
||||
lsn: Lsn,
|
||||
/// Size of the basebackup archive in bytes.
|
||||
size_bytes: u64,
|
||||
}
|
||||
|
||||
/// BasebackupCache stores cached basebackup archives for timelines on local disk.
|
||||
///
|
||||
@@ -52,68 +60,118 @@ type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
|
||||
/// and ~1 RPS for get requests.
|
||||
pub struct BasebackupCache {
|
||||
data_dir: Utf8PathBuf,
|
||||
config: BasebackupCacheConfig,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remove_entry_sender: BasebackupRemoveEntrySender,
|
||||
config: Option<BasebackupCacheConfig>,
|
||||
|
||||
entries: std::sync::Mutex<HashMap<TenantTimelineId, Lsn>>,
|
||||
entries: std::sync::Mutex<HashMap<TenantTimelineId, CacheEntry>>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
prepare_sender: BasebackupPrepareSender,
|
||||
|
||||
read_hit_count: GenericCounter<AtomicU64>,
|
||||
read_miss_count: GenericCounter<AtomicU64>,
|
||||
read_err_count: GenericCounter<AtomicU64>,
|
||||
|
||||
prepare_ok_count: GenericCounter<AtomicU64>,
|
||||
prepare_skip_count: GenericCounter<AtomicU64>,
|
||||
prepare_err_count: GenericCounter<AtomicU64>,
|
||||
}
|
||||
|
||||
impl BasebackupCache {
|
||||
/// Creates a BasebackupCache and spawns the background task.
|
||||
/// The initialization of the cache is performed in the background and does not
|
||||
/// block the caller. The cache will return `None` for any get requests until
|
||||
/// initialization is complete.
|
||||
pub fn spawn(
|
||||
runtime_handle: &tokio::runtime::Handle,
|
||||
/// Create a new BasebackupCache instance.
|
||||
/// Also returns a BasebackupPrepareReceiver which is needed to start
|
||||
/// the background task.
|
||||
/// The cache is initialized from the data_dir in the background task.
|
||||
/// The cache will return `None` for any get requests until the initialization is complete.
|
||||
/// The background task is spawned separately using [`Self::spawn_background_task`]
|
||||
/// to avoid a circular dependency between the cache and the tenant manager.
|
||||
pub fn new(
|
||||
data_dir: Utf8PathBuf,
|
||||
config: Option<BasebackupCacheConfig>,
|
||||
prepare_receiver: BasebackupPrepareReceiver,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel();
|
||||
) -> (Arc<Self>, BasebackupPrepareReceiver) {
|
||||
let chan_size = config.as_ref().map(|c| c.max_size_entries).unwrap_or(1);
|
||||
|
||||
let enabled = config.is_some();
|
||||
let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size);
|
||||
|
||||
let cache = Arc::new(BasebackupCache {
|
||||
data_dir,
|
||||
config: config.unwrap_or_default(),
|
||||
tenant_manager,
|
||||
remove_entry_sender,
|
||||
|
||||
config,
|
||||
entries: std::sync::Mutex::new(HashMap::new()),
|
||||
|
||||
cancel,
|
||||
prepare_sender,
|
||||
|
||||
read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
|
||||
read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
|
||||
read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
|
||||
|
||||
prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
|
||||
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
|
||||
prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
|
||||
});
|
||||
|
||||
if enabled {
|
||||
runtime_handle.spawn(
|
||||
cache
|
||||
.clone()
|
||||
.background(prepare_receiver, remove_entry_receiver),
|
||||
);
|
||||
}
|
||||
(cache, prepare_receiver)
|
||||
}
|
||||
|
||||
cache
|
||||
/// Spawns the background task.
|
||||
/// The background task initializes the cache from the disk,
|
||||
/// processes prepare requests, and cleans up outdated cache entries.
|
||||
/// Noop if the cache is disabled (config is None).
|
||||
pub fn spawn_background_task(
|
||||
self: Arc<Self>,
|
||||
runtime_handle: &tokio::runtime::Handle,
|
||||
prepare_receiver: BasebackupPrepareReceiver,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
if let Some(config) = self.config.clone() {
|
||||
let background = BackgroundTask {
|
||||
c: self,
|
||||
|
||||
config,
|
||||
tenant_manager,
|
||||
cancel,
|
||||
|
||||
entry_count: 0,
|
||||
total_size_bytes: 0,
|
||||
|
||||
prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
|
||||
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
|
||||
prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
|
||||
};
|
||||
runtime_handle.spawn(background.run(prepare_receiver));
|
||||
}
|
||||
}
|
||||
|
||||
/// Send a basebackup prepare request to the background task.
|
||||
/// The basebackup will be prepared asynchronously, it does not block the caller.
|
||||
/// The request will be skipped if any cache limits are exceeded.
|
||||
pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) {
|
||||
let req = BasebackupPrepareRequest {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
};
|
||||
|
||||
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc();
|
||||
let res = self.prepare_sender.try_send(req);
|
||||
|
||||
if let Err(e) = res {
|
||||
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
|
||||
self.prepare_skip_count.inc();
|
||||
match e {
|
||||
TrySendError::Full(_) => {
|
||||
// Basebackup prepares are pretty rare, normally we should not hit this.
|
||||
tracing::info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
%timeline_id,
|
||||
%lsn,
|
||||
"Basebackup prepare channel is full, skipping the request"
|
||||
);
|
||||
}
|
||||
TrySendError::Closed(_) => {
|
||||
// Normal during shutdown, not critical.
|
||||
tracing::info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
%timeline_id,
|
||||
%lsn,
|
||||
"Basebackup prepare channel is closed, skipping the request"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets a basebackup entry from the cache.
|
||||
@@ -126,9 +184,13 @@ impl BasebackupCache {
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Option<tokio::fs::File> {
|
||||
if !self.is_enabled() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Fast path. Check if the entry exists using the in-memory state.
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
if self.entries.lock().unwrap().get(&tti) != Some(&lsn) {
|
||||
if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) {
|
||||
self.read_miss_count.inc();
|
||||
return None;
|
||||
}
|
||||
@@ -153,6 +215,10 @@ impl BasebackupCache {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_enabled(&self) -> bool {
|
||||
self.config.is_some()
|
||||
}
|
||||
|
||||
// Private methods.
|
||||
|
||||
fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
|
||||
@@ -166,6 +232,42 @@ impl BasebackupCache {
|
||||
self.data_dir
|
||||
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
|
||||
}
|
||||
}
|
||||
|
||||
/// The background task that does the job to prepare basebackups
|
||||
/// and manage the cache entries on disk.
|
||||
/// It is a separate struct from BasebackupCache to allow holding
|
||||
/// a mutable reference to this state without a mutex lock,
|
||||
/// while BasebackupCache is referenced by the clients.
|
||||
struct BackgroundTask {
|
||||
c: Arc<BasebackupCache>,
|
||||
|
||||
config: BasebackupCacheConfig,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
|
||||
/// Number of the entries in the cache.
|
||||
/// This counter is used for metrics and applying cache limits.
|
||||
/// It generally should be equal to c.entries.len(), but it's calculated
|
||||
/// pessimistically for abnormal situations: if we encountered some errors
|
||||
/// during removing the entry from disk, we won't decrement this counter to
|
||||
/// make sure that we don't exceed the limit with "trashed" files on the disk.
|
||||
/// It will also count files in the data_dir that are not valid cache entries.
|
||||
entry_count: usize,
|
||||
/// Total size of all the entries on the disk.
|
||||
/// This counter is used for metrics and applying cache limits.
|
||||
/// Similar to entry_count, it is calculated pessimistically for abnormal situations.
|
||||
total_size_bytes: u64,
|
||||
|
||||
prepare_ok_count: GenericCounter<AtomicU64>,
|
||||
prepare_skip_count: GenericCounter<AtomicU64>,
|
||||
prepare_err_count: GenericCounter<AtomicU64>,
|
||||
}
|
||||
|
||||
impl BackgroundTask {
|
||||
fn tmp_dir(&self) -> Utf8PathBuf {
|
||||
self.c.data_dir.join("tmp")
|
||||
}
|
||||
|
||||
fn entry_tmp_path(
|
||||
&self,
|
||||
@@ -173,9 +275,8 @@ impl BasebackupCache {
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Utf8PathBuf {
|
||||
self.data_dir
|
||||
.join("tmp")
|
||||
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
|
||||
self.tmp_dir()
|
||||
.join(BasebackupCache::entry_filename(tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> {
|
||||
@@ -194,18 +295,21 @@ impl BasebackupCache {
|
||||
Some((tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
async fn cleanup(&self) -> anyhow::Result<()> {
|
||||
// Cleanup tmp directory.
|
||||
let tmp_dir = self.data_dir.join("tmp");
|
||||
let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?;
|
||||
while let Some(dir_entry) = tmp_dir.next_entry().await? {
|
||||
if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await {
|
||||
tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
|
||||
}
|
||||
// Recreate the tmp directory to clear all files in it.
|
||||
async fn clean_tmp_dir(&self) -> anyhow::Result<()> {
|
||||
let tmp_dir = self.tmp_dir();
|
||||
if tmp_dir.exists() {
|
||||
tokio::fs::remove_dir_all(&tmp_dir).await?;
|
||||
}
|
||||
tokio::fs::create_dir_all(&tmp_dir).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Remove outdated entries.
|
||||
let entries_old = self.entries.lock().unwrap().clone();
|
||||
async fn cleanup(&mut self) -> anyhow::Result<()> {
|
||||
self.clean_tmp_dir().await?;
|
||||
|
||||
// Leave only up-to-date entries.
|
||||
let entries_old = self.c.entries.lock().unwrap().clone();
|
||||
let mut entries_new = HashMap::new();
|
||||
for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
@@ -218,43 +322,42 @@ impl BasebackupCache {
|
||||
|
||||
for timeline in tenant.list_timelines() {
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id);
|
||||
if let Some(&entry_lsn) = entries_old.get(&tti) {
|
||||
if timeline.get_last_record_lsn() <= entry_lsn {
|
||||
entries_new.insert(tti, entry_lsn);
|
||||
if let Some(entry) = entries_old.get(&tti) {
|
||||
if timeline.get_last_record_lsn() <= entry.lsn {
|
||||
entries_new.insert(tti, entry.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (&tti, &lsn) in entries_old.iter() {
|
||||
// Try to remove all entries that are not up-to-date.
|
||||
for (&tti, entry) in entries_old.iter() {
|
||||
if !entries_new.contains_key(&tti) {
|
||||
self.remove_entry_sender
|
||||
.send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn))
|
||||
.unwrap();
|
||||
self.try_remove_entry(tti.tenant_id, tti.timeline_id, entry)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64);
|
||||
*self.entries.lock().unwrap() = entries_new;
|
||||
// Note: BackgroundTask is the only writer for self.c.entries,
|
||||
// so it couldn't have been modified concurrently.
|
||||
*self.c.entries.lock().unwrap() = entries_new;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn on_startup(&self) -> anyhow::Result<()> {
|
||||
// Create data_dir and tmp directory if they do not exist.
|
||||
tokio::fs::create_dir_all(&self.data_dir.join("tmp"))
|
||||
async fn on_startup(&mut self) -> anyhow::Result<()> {
|
||||
// Create data_dir if it does not exist.
|
||||
tokio::fs::create_dir_all(&self.c.data_dir)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to create basebackup cache data_dir {:?}: {:?}",
|
||||
self.data_dir,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
.context("Failed to create basebackup cache data directory")?;
|
||||
|
||||
self.clean_tmp_dir()
|
||||
.await
|
||||
.context("Failed to clean tmp directory")?;
|
||||
|
||||
// Read existing entries from the data_dir and add them to in-memory state.
|
||||
let mut entries = HashMap::new();
|
||||
let mut dir = tokio::fs::read_dir(&self.data_dir).await?;
|
||||
let mut entries = HashMap::<TenantTimelineId, CacheEntry>::new();
|
||||
let mut dir = tokio::fs::read_dir(&self.c.data_dir).await?;
|
||||
while let Some(dir_entry) = dir.next_entry().await? {
|
||||
let filename = dir_entry.file_name();
|
||||
|
||||
@@ -263,33 +366,43 @@ impl BasebackupCache {
|
||||
continue;
|
||||
}
|
||||
|
||||
let size_bytes = dir_entry
|
||||
.metadata()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!("Failed to read metadata for file {:?}: {:?}", filename, e)
|
||||
})?
|
||||
.len();
|
||||
|
||||
self.entry_count += 1;
|
||||
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
|
||||
|
||||
self.total_size_bytes += size_bytes;
|
||||
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
|
||||
|
||||
let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref());
|
||||
let Some((tenant_id, timeline_id, lsn)) = parsed else {
|
||||
tracing::warn!("Invalid basebackup cache file name: {:?}", filename);
|
||||
continue;
|
||||
};
|
||||
|
||||
let cur_entry = CacheEntry { lsn, size_bytes };
|
||||
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
use std::collections::hash_map::Entry::*;
|
||||
|
||||
match entries.entry(tti) {
|
||||
Occupied(mut entry) => {
|
||||
let entry_lsn = *entry.get();
|
||||
let found_entry = entry.get();
|
||||
// Leave only the latest entry, remove the old one.
|
||||
if lsn < entry_lsn {
|
||||
self.remove_entry_sender.send(self.entry_path(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
))?;
|
||||
} else if lsn > entry_lsn {
|
||||
self.remove_entry_sender.send(self.entry_path(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
entry_lsn,
|
||||
))?;
|
||||
entry.insert(lsn);
|
||||
if cur_entry.lsn < found_entry.lsn {
|
||||
self.try_remove_entry(tenant_id, timeline_id, &cur_entry)
|
||||
.await;
|
||||
} else if cur_entry.lsn > found_entry.lsn {
|
||||
self.try_remove_entry(tenant_id, timeline_id, found_entry)
|
||||
.await;
|
||||
entry.insert(cur_entry);
|
||||
} else {
|
||||
// Two different filenames parsed to the same timline_id and LSN.
|
||||
// Should never happen.
|
||||
@@ -300,22 +413,17 @@ impl BasebackupCache {
|
||||
}
|
||||
}
|
||||
Vacant(entry) => {
|
||||
entry.insert(lsn);
|
||||
entry.insert(cur_entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
|
||||
*self.entries.lock().unwrap() = entries;
|
||||
*self.c.entries.lock().unwrap() = entries;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn background(
|
||||
self: Arc<Self>,
|
||||
mut prepare_receiver: BasebackupPrepareReceiver,
|
||||
mut remove_entry_receiver: BasebackupRemoveEntryReceiver,
|
||||
) {
|
||||
async fn run(mut self, mut prepare_receiver: BasebackupPrepareReceiver) {
|
||||
// Panic in the background is a safe fallback.
|
||||
// It will drop receivers and the cache will be effectively disabled.
|
||||
self.on_startup()
|
||||
@@ -328,6 +436,7 @@ impl BasebackupCache {
|
||||
loop {
|
||||
tokio::select! {
|
||||
Some(req) = prepare_receiver.recv() => {
|
||||
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
|
||||
if let Err(err) = self.prepare_basebackup(
|
||||
req.tenant_shard_id,
|
||||
req.timeline_id,
|
||||
@@ -338,11 +447,6 @@ impl BasebackupCache {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Some(req) = remove_entry_receiver.recv() => {
|
||||
if let Err(e) = tokio::fs::remove_file(req).await {
|
||||
tracing::warn!("Failed to remove basebackup cache file: {:#}", e);
|
||||
}
|
||||
}
|
||||
_ = cleanup_ticker.tick() => {
|
||||
self.cleanup().await.unwrap_or_else(|e| {
|
||||
tracing::warn!("Failed to clean up basebackup cache: {:#}", e);
|
||||
@@ -356,6 +460,67 @@ impl BasebackupCache {
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to remove an entry from disk.
|
||||
/// The caller is responsible for removing the entry from the in-memory state.
|
||||
/// Updates size counters and corresponding metrics.
|
||||
/// Ignores the filesystem errors as not-so-important, but the size counters
|
||||
/// are not decremented in this case, so the file will continue to be counted
|
||||
/// towards the size limits.
|
||||
async fn try_remove_entry(
|
||||
&mut self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
entry: &CacheEntry,
|
||||
) {
|
||||
let entry_path = self.c.entry_path(tenant_id, timeline_id, entry.lsn);
|
||||
|
||||
match tokio::fs::remove_file(&entry_path).await {
|
||||
Ok(_) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Failed to remove basebackup cache file for tenant {} timeline {} LSN {}: {:#}",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
entry.lsn,
|
||||
e
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
self.entry_count -= 1;
|
||||
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
|
||||
|
||||
self.total_size_bytes -= entry.size_bytes;
|
||||
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
|
||||
}
|
||||
|
||||
/// Insert the cache entry into in-memory state and update the size counters.
|
||||
/// Assumes that the file for the entry already exists on disk.
|
||||
/// If the entry already exists with previous LSN, it will be removed.
|
||||
async fn upsert_entry(
|
||||
&mut self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
entry: CacheEntry,
|
||||
) {
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
self.entry_count += 1;
|
||||
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
|
||||
|
||||
self.total_size_bytes += entry.size_bytes;
|
||||
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
|
||||
|
||||
let old_entry = self.c.entries.lock().unwrap().insert(tti, entry);
|
||||
|
||||
if let Some(old_entry) = old_entry {
|
||||
self.try_remove_entry(tenant_id, timeline_id, &old_entry)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Prepare a basebackup for the given timeline.
|
||||
///
|
||||
/// If the basebackup already exists with a higher LSN or the timeline already
|
||||
@@ -364,7 +529,7 @@ impl BasebackupCache {
|
||||
/// The basebackup is prepared in a temporary directory and then moved to the final
|
||||
/// location to make the operation atomic.
|
||||
async fn prepare_basebackup(
|
||||
&self,
|
||||
&mut self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
req_lsn: Lsn,
|
||||
@@ -378,30 +543,44 @@ impl BasebackupCache {
|
||||
|
||||
let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
// TODO(diko): I don't think we will hit the limit,
|
||||
// but if we do, it makes sense to try to evict oldest entries. here
|
||||
if self.entry_count >= self.config.max_size_entries {
|
||||
tracing::info!(
|
||||
%tenant_shard_id,
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
"Basebackup cache is full (max_size_entries), skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if self.total_size_bytes >= self.config.max_total_size_bytes {
|
||||
tracing::info!(
|
||||
%tenant_shard_id,
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
"Basebackup cache is full (max_total_size_bytes), skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
{
|
||||
let entries = self.entries.lock().unwrap();
|
||||
if let Some(&entry_lsn) = entries.get(&tti) {
|
||||
if entry_lsn >= req_lsn {
|
||||
let entries = self.c.entries.lock().unwrap();
|
||||
if let Some(entry) = entries.get(&tti) {
|
||||
if entry.lsn >= req_lsn {
|
||||
tracing::info!(
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
%entry_lsn,
|
||||
%entry.lsn,
|
||||
"Basebackup entry already exists for timeline with higher LSN, skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
if entries.len() as i64 >= self.config.max_size_entries {
|
||||
tracing::info!(
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
"Basebackup cache is full, skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
let tenant = self
|
||||
@@ -437,56 +616,54 @@ impl BasebackupCache {
|
||||
.prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn)
|
||||
.await;
|
||||
|
||||
if let Err(err) = res {
|
||||
tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
|
||||
// Try to clean up tmp file. If we fail, the background clean up task will take care of it.
|
||||
match tokio::fs::remove_file(&entry_tmp_path).await {
|
||||
Ok(_) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => {
|
||||
tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
|
||||
let entry = match res {
|
||||
Ok(entry) => entry,
|
||||
Err(err) => {
|
||||
tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
|
||||
// Try to clean up tmp file. If we fail, the background clean up task will take care of it.
|
||||
match tokio::fs::remove_file(&entry_tmp_path).await {
|
||||
Ok(_) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => {
|
||||
tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
|
||||
}
|
||||
}
|
||||
return Err(err);
|
||||
}
|
||||
return Err(err);
|
||||
}
|
||||
};
|
||||
|
||||
// Move the tmp file to the final location atomically.
|
||||
let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
|
||||
// The tmp file is fsynced, so it's guaranteed that we will not have a partial file
|
||||
// in the main directory.
|
||||
// It's not necessary to fsync the inode after renaming, because the worst case is that
|
||||
// the rename operation will be rolled back on the disk failure, the entry will disappear
|
||||
// from the main directory, and the entry access will cause a cache miss.
|
||||
let entry_path = self
|
||||
.c
|
||||
.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
|
||||
tokio::fs::rename(&entry_tmp_path, &entry_path).await?;
|
||||
|
||||
let mut entries = self.entries.lock().unwrap();
|
||||
if let Some(old_lsn) = entries.insert(tti, req_lsn) {
|
||||
// Remove the old entry if it exists.
|
||||
self.remove_entry_sender
|
||||
.send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn))
|
||||
.unwrap();
|
||||
}
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
|
||||
self.upsert_entry(tenant_shard_id.tenant_id, timeline_id, entry)
|
||||
.await;
|
||||
|
||||
self.prepare_ok_count.inc();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prepares a basebackup in a temporary file.
|
||||
/// Guarantees that the tmp file is fsynced before returning.
|
||||
async fn prepare_basebackup_tmp(
|
||||
&self,
|
||||
emptry_tmp_path: &Utf8Path,
|
||||
entry_tmp_path: &Utf8Path,
|
||||
timeline: &Arc<Timeline>,
|
||||
req_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<CacheEntry> {
|
||||
let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
|
||||
let ctx = ctx.with_scope_timeline(timeline);
|
||||
|
||||
let file = tokio::fs::File::create(emptry_tmp_path).await?;
|
||||
let file = tokio::fs::File::create(entry_tmp_path).await?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
&mut writer,
|
||||
// Level::Best because compression is not on the hot path of basebackup requests.
|
||||
// The decompression is almost not affected by the compression level.
|
||||
async_compression::Level::Best,
|
||||
);
|
||||
|
||||
// We may receive a request before the WAL record is applied to the timeline.
|
||||
// Wait for the requested LSN to be applied.
|
||||
timeline
|
||||
@@ -499,20 +676,28 @@ impl BasebackupCache {
|
||||
.await?;
|
||||
|
||||
send_basebackup_tarball(
|
||||
&mut encoder,
|
||||
&mut writer,
|
||||
timeline,
|
||||
Some(req_lsn),
|
||||
None,
|
||||
false,
|
||||
false,
|
||||
// Level::Best because compression is not on the hot path of basebackup requests.
|
||||
// The decompression is almost not affected by the compression level.
|
||||
Some(async_compression::Level::Best),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
encoder.shutdown().await?;
|
||||
writer.flush().await?;
|
||||
writer.into_inner().sync_all().await?;
|
||||
|
||||
Ok(())
|
||||
// TODO(diko): we can count it via Writer wrapper instead of a syscall.
|
||||
let size_bytes = tokio::fs::metadata(entry_tmp_path).await?.len();
|
||||
|
||||
Ok(CacheEntry {
|
||||
lsn: req_lsn,
|
||||
size_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use pageserver::deletion_queue::DeletionQueue;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::feature_resolver::FeatureResolver;
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::page_service::GrpcPageServiceHandler;
|
||||
use pageserver::task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
};
|
||||
@@ -581,11 +582,14 @@ fn start_pageserver(
|
||||
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let (basebackup_prepare_sender, basebackup_prepare_receiver) =
|
||||
tokio::sync::mpsc::unbounded_channel();
|
||||
let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new(
|
||||
conf.basebackup_cache_dir(),
|
||||
conf.basebackup_cache_config.clone(),
|
||||
);
|
||||
let deletion_queue_client = deletion_queue.new_client();
|
||||
let background_purges = mgr::BackgroundPurges::default();
|
||||
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
|
||||
let tenant_manager = mgr::init(
|
||||
conf,
|
||||
background_purges.clone(),
|
||||
TenantSharedResources {
|
||||
@@ -593,18 +597,16 @@ fn start_pageserver(
|
||||
remote_storage: remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
feature_resolver,
|
||||
basebackup_cache: Arc::clone(&basebackup_cache),
|
||||
feature_resolver: feature_resolver.clone(),
|
||||
},
|
||||
order,
|
||||
shutdown_pageserver.clone(),
|
||||
))?;
|
||||
);
|
||||
let tenant_manager = Arc::new(tenant_manager);
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?;
|
||||
|
||||
let basebackup_cache = BasebackupCache::spawn(
|
||||
basebackup_cache.spawn_background_task(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
conf.basebackup_cache_dir(),
|
||||
conf.basebackup_cache_config.clone(),
|
||||
basebackup_prepare_receiver,
|
||||
Arc::clone(&tenant_manager),
|
||||
shutdown_pageserver.child_token(),
|
||||
@@ -726,6 +728,7 @@ fn start_pageserver(
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue.new_client(),
|
||||
secondary_controller,
|
||||
feature_resolver,
|
||||
)
|
||||
.context("Failed to initialize router state")?,
|
||||
);
|
||||
@@ -816,7 +819,6 @@ fn start_pageserver(
|
||||
} else {
|
||||
None
|
||||
},
|
||||
basebackup_cache,
|
||||
);
|
||||
|
||||
// Spawn a Pageserver gRPC server task. It will spawn separate tasks for
|
||||
@@ -827,7 +829,7 @@ fn start_pageserver(
|
||||
// necessary?
|
||||
let mut page_service_grpc = None;
|
||||
if let Some(grpc_listener) = grpc_listener {
|
||||
page_service_grpc = Some(page_service::spawn_grpc(
|
||||
page_service_grpc = Some(GrpcPageServiceHandler::spawn(
|
||||
tenant_manager.clone(),
|
||||
grpc_auth,
|
||||
otel_guard.as_ref().map(|g| g.dispatch.clone()),
|
||||
|
||||
@@ -2,7 +2,9 @@ use std::io::{Read, Write, stdin, stdout};
|
||||
use std::time::Duration;
|
||||
|
||||
use clap::Parser;
|
||||
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
|
||||
use pageserver_api::pagestream_api::{
|
||||
PagestreamFeMessage, PagestreamRequest, PagestreamTestRequest,
|
||||
};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -28,17 +30,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
let mut msg = 0;
|
||||
loop {
|
||||
msg += 1;
|
||||
let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
|
||||
PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(23),
|
||||
not_modified_since: Lsn(23),
|
||||
},
|
||||
batch_key: 42,
|
||||
message: format!("message {}", msg),
|
||||
let fut = sender.send(PagestreamFeMessage::Test(PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(23),
|
||||
not_modified_since: Lsn(23),
|
||||
},
|
||||
));
|
||||
batch_key: 42,
|
||||
message: format!("message {}", msg),
|
||||
}));
|
||||
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
|
||||
eprintln!("pipe seems full");
|
||||
break;
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::num::NonZeroUsize;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, bail, ensure};
|
||||
use anyhow::{Context, ensure};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::config::{
|
||||
@@ -22,6 +22,7 @@ use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pem::Pem;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use reqwest::Url;
|
||||
use storage_broker::Uri;
|
||||
@@ -338,20 +339,16 @@ impl PageServerConf {
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
|
||||
pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
|
||||
let path = self.pg_distrib_dir.clone();
|
||||
|
||||
#[allow(clippy::manual_range_patterns)]
|
||||
match pg_version {
|
||||
14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
Ok(path.join(pg_version.v_str()))
|
||||
}
|
||||
|
||||
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
|
||||
pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
|
||||
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
|
||||
}
|
||||
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
|
||||
pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
|
||||
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
|
||||
}
|
||||
|
||||
@@ -765,4 +762,23 @@ mod tests {
|
||||
let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
|
||||
assert_eq!(result.is_ok(), is_valid);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_posthog_config_is_valid() {
|
||||
let input = r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
|
||||
[posthog_config]
|
||||
server_api_key = "phs_AAA"
|
||||
client_api_key = "phc_BBB"
|
||||
project_id = "000"
|
||||
private_api_url = "https://us.posthog.com"
|
||||
public_api_url = "https://us.i.posthog.com"
|
||||
"#;
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("posthogconfig is valid");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,14 +159,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
|
||||
Ok(m) => {
|
||||
// Since we run one time at startup, be generous in our logging and
|
||||
// dump all metadata.
|
||||
tracing::info!(
|
||||
"Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
|
||||
m.postgres_host,
|
||||
m.postgres_port,
|
||||
m.http_host,
|
||||
m.http_port,
|
||||
m.other
|
||||
);
|
||||
tracing::info!("Loaded node metadata: {m}");
|
||||
|
||||
let az_id = {
|
||||
let az_id_from_metadata = m
|
||||
@@ -195,6 +188,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
|
||||
node_id: conf.id,
|
||||
listen_pg_addr: m.postgres_host,
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_grpc_addr: m.grpc_host,
|
||||
listen_grpc_port: m.grpc_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
listen_https_port: m.https_port,
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
use posthog_client_lite::{
|
||||
CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
|
||||
PostHogFlagFilterPropertyValue,
|
||||
@@ -11,10 +13,13 @@ use utils::id::TenantId;
|
||||
|
||||
use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};
|
||||
|
||||
const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600);
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FeatureResolver {
|
||||
inner: Option<Arc<FeatureResolverBackgroundLoop>>,
|
||||
internal_properties: Option<Arc<HashMap<String, PostHogFlagFilterPropertyValue>>>,
|
||||
force_overrides_for_testing: Arc<ArcSwap<HashMap<String, String>>>,
|
||||
}
|
||||
|
||||
impl FeatureResolver {
|
||||
@@ -22,9 +27,17 @@ impl FeatureResolver {
|
||||
Self {
|
||||
inner: None,
|
||||
internal_properties: None,
|
||||
force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update(&self, spec: String) -> anyhow::Result<()> {
|
||||
if let Some(inner) = &self.inner {
|
||||
inner.update(spec)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn spawn(
|
||||
conf: &PageServerConf,
|
||||
shutdown_pageserver: CancellationToken,
|
||||
@@ -86,7 +99,35 @@ impl FeatureResolver {
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: add pageserver URL.
|
||||
// TODO: move this to a background task so that we don't block startup in case of slow disk
|
||||
let metadata_path = conf.metadata_path();
|
||||
match std::fs::read_to_string(&metadata_path) {
|
||||
Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
|
||||
Ok(metadata) => {
|
||||
properties.insert(
|
||||
"hostname".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(metadata.http_host),
|
||||
);
|
||||
if let Some(cplane_region) = metadata.other.get("region_id") {
|
||||
if let Some(cplane_region) = cplane_region.as_str() {
|
||||
// This region contains the cell number
|
||||
properties.insert(
|
||||
"neon_region".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(
|
||||
cplane_region.to_string(),
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to parse metadata.json: {}", e);
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to read metadata.json: {}", e);
|
||||
}
|
||||
}
|
||||
Arc::new(properties)
|
||||
};
|
||||
let fake_tenants = {
|
||||
@@ -110,18 +151,23 @@ impl FeatureResolver {
|
||||
}
|
||||
tenants
|
||||
};
|
||||
// TODO: make refresh period configurable
|
||||
inner
|
||||
.clone()
|
||||
.spawn(handle, Duration::from_secs(60), fake_tenants);
|
||||
inner.clone().spawn(
|
||||
handle,
|
||||
posthog_config
|
||||
.refresh_interval
|
||||
.unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL),
|
||||
fake_tenants,
|
||||
);
|
||||
Ok(FeatureResolver {
|
||||
inner: Some(inner),
|
||||
internal_properties: Some(internal_properties),
|
||||
force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
|
||||
})
|
||||
} else {
|
||||
Ok(FeatureResolver {
|
||||
inner: None,
|
||||
internal_properties: None,
|
||||
force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -161,6 +207,11 @@ impl FeatureResolver {
|
||||
flag_key: &str,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<String, PostHogEvaluationError> {
|
||||
let force_overrides = self.force_overrides_for_testing.load();
|
||||
if let Some(value) = force_overrides.get(flag_key) {
|
||||
return Ok(value.clone());
|
||||
}
|
||||
|
||||
if let Some(inner) = &self.inner {
|
||||
let res = inner.feature_store().evaluate_multivariate(
|
||||
flag_key,
|
||||
@@ -199,6 +250,15 @@ impl FeatureResolver {
|
||||
flag_key: &str,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), PostHogEvaluationError> {
|
||||
let force_overrides = self.force_overrides_for_testing.load();
|
||||
if let Some(value) = force_overrides.get(flag_key) {
|
||||
return if value == "true" {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(PostHogEvaluationError::NoConditionGroupMatched)
|
||||
};
|
||||
}
|
||||
|
||||
if let Some(inner) = &self.inner {
|
||||
let res = inner.feature_store().evaluate_boolean(
|
||||
flag_key,
|
||||
@@ -230,8 +290,22 @@ impl FeatureResolver {
|
||||
inner.feature_store().is_feature_flag_boolean(flag_key)
|
||||
} else {
|
||||
Err(PostHogEvaluationError::NotAvailable(
|
||||
"PostHog integration is not enabled".to_string(),
|
||||
"PostHog integration is not enabled, cannot auto-determine the flag type"
|
||||
.to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Force override a feature flag for testing. This is only for testing purposes. Assume the caller only call it
|
||||
/// from a single thread so it won't race.
|
||||
pub fn force_override_for_testing(&self, flag_key: &str, value: Option<&str>) {
|
||||
let mut force_overrides = self.force_overrides_for_testing.load().as_ref().clone();
|
||||
if let Some(value) = value {
|
||||
force_overrides.insert(flag_key.to_string(), value.to_string());
|
||||
} else {
|
||||
force_overrides.remove(flag_key);
|
||||
}
|
||||
self.force_overrides_for_testing
|
||||
.store(Arc::new(force_overrides));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,6 +41,7 @@ use pageserver_api::models::{
|
||||
TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, TenantShardId};
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
|
||||
use scopeguard::defer;
|
||||
use serde_json::json;
|
||||
@@ -59,6 +60,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::feature_resolver::FeatureResolver;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationConf;
|
||||
@@ -73,6 +75,7 @@ use crate::tenant::remote_timeline_client::{
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
|
||||
use crate::tenant::timeline::{
|
||||
CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
|
||||
@@ -106,6 +109,7 @@ pub struct State {
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
latest_utilization: tokio::sync::Mutex<Option<(std::time::Instant, bytes::Bytes)>>,
|
||||
feature_resolver: FeatureResolver,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -119,6 +123,7 @@ impl State {
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
feature_resolver: FeatureResolver,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = &[
|
||||
"/v1/status",
|
||||
@@ -139,6 +144,7 @@ impl State {
|
||||
deletion_queue_client,
|
||||
secondary_controller,
|
||||
latest_utilization: Default::default(),
|
||||
feature_resolver,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -282,11 +288,11 @@ impl From<GetActiveTenantError> for ApiError {
|
||||
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
|
||||
ApiError::ShuttingDown
|
||||
}
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{e}")),
|
||||
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
|
||||
GetActiveTenantError::NotFound(gte) => gte.into(),
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => {
|
||||
ApiError::ResourceUnavailable(format!("{}", e).into())
|
||||
ApiError::ResourceUnavailable(format!("{e}").into())
|
||||
}
|
||||
GetActiveTenantError::SwitchedTenant => {
|
||||
// in our HTTP handlers, this error doesn't happen
|
||||
@@ -1010,7 +1016,7 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let timestamp_raw = must_get_query_param(&request, "timestamp")?;
|
||||
let timestamp = humantime::parse_rfc3339(×tamp_raw)
|
||||
.with_context(|| format!("Invalid time: {:?}", timestamp_raw))
|
||||
.with_context(|| format!("Invalid time: {timestamp_raw:?}"))
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
@@ -1105,7 +1111,7 @@ async fn get_timestamp_of_lsn_handler(
|
||||
json_response(StatusCode::OK, time)
|
||||
}
|
||||
None => Err(ApiError::PreconditionFailed(
|
||||
format!("Timestamp for lsn {} not found", lsn).into(),
|
||||
format!("Timestamp for lsn {lsn} not found").into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
@@ -1451,7 +1457,10 @@ async fn timeline_layer_scan_disposable_keys(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
|
||||
@@ -2413,7 +2422,7 @@ async fn timeline_offload_handler(
|
||||
}
|
||||
if let (false, reason) = timeline.can_offload() {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Timeline::can_offload() check failed: {}", reason) .into(),
|
||||
format!("Timeline::can_offload() check failed: {reason}") .into(),
|
||||
));
|
||||
}
|
||||
offload_timeline(&tenant, &timeline)
|
||||
@@ -3377,7 +3386,7 @@ async fn put_tenant_timeline_import_basebackup(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
|
||||
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
|
||||
let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
|
||||
let pg_version: PgMajorVersion = must_parse_query_param(&request, "pg_version")?;
|
||||
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
@@ -3671,8 +3680,8 @@ async fn tenant_evaluate_feature_flag(
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let flag: String = must_parse_query_param(&request, "flag")?;
|
||||
let as_type: String = must_parse_query_param(&request, "as")?;
|
||||
let flag: String = parse_request_param(&request, "flag_key")?;
|
||||
let as_type: Option<String> = parse_query_param(&request, "as")?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
@@ -3681,11 +3690,11 @@ async fn tenant_evaluate_feature_flag(
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
|
||||
if as_type == "boolean" {
|
||||
if as_type.as_deref() == Some("boolean") {
|
||||
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
|
||||
let result = result.map(|_| true).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
} else if as_type == "multivariate" {
|
||||
} else if as_type.as_deref() == Some("multivariate") {
|
||||
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
} else {
|
||||
@@ -3705,6 +3714,49 @@ async fn tenant_evaluate_feature_flag(
|
||||
.await
|
||||
}
|
||||
|
||||
async fn force_override_feature_flag_for_testing_put(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let flag: String = parse_request_param(&request, "flag_key")?;
|
||||
let value: String = must_parse_query_param(&request, "value")?;
|
||||
let state = get_state(&request);
|
||||
state
|
||||
.feature_resolver
|
||||
.force_override_for_testing(&flag, Some(&value));
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn force_override_feature_flag_for_testing_delete(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let flag: String = parse_request_param(&request, "flag_key")?;
|
||||
let state = get_state(&request);
|
||||
state
|
||||
.feature_resolver
|
||||
.force_override_for_testing(&flag, None);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn update_feature_flag_spec(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let body = json_request(&mut request).await?;
|
||||
let state = get_state(&request);
|
||||
state
|
||||
.feature_resolver
|
||||
.update(body)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -4081,8 +4133,17 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import",
|
||||
|r| api_handler(r, activate_post_import_handler),
|
||||
)
|
||||
.get("/v1/tenant/:tenant_shard_id/feature_flag", |r| {
|
||||
.get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
|
||||
api_handler(r, tenant_evaluate_feature_flag)
|
||||
})
|
||||
.put("/v1/feature_flag/:flag_key", |r| {
|
||||
testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
|
||||
})
|
||||
.delete("/v1/feature_flag/:flag_key", |r| {
|
||||
testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete)
|
||||
})
|
||||
.post("/v1/feature_flag_spec", |r| {
|
||||
api_handler(r, update_feature_flag_spec)
|
||||
})
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -520,7 +520,7 @@ async fn import_file(
|
||||
}
|
||||
|
||||
if file_path.starts_with("global") {
|
||||
let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
|
||||
let spcnode = postgres_ffi_types::constants::GLOBALTABLESPACE_OID;
|
||||
let dbnode = 0;
|
||||
|
||||
match file_name.as_ref() {
|
||||
@@ -553,7 +553,7 @@ async fn import_file(
|
||||
}
|
||||
}
|
||||
} else if file_path.starts_with("base") {
|
||||
let spcnode = pg_constants::DEFAULTTABLESPACE_OID;
|
||||
let spcnode = postgres_ffi_types::constants::DEFAULTTABLESPACE_OID;
|
||||
let dbnode: u32 = file_path
|
||||
.iter()
|
||||
.nth(1)
|
||||
|
||||
@@ -38,6 +38,7 @@ pub mod walredo;
|
||||
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use tenant::mgr::{BackgroundPurges, TenantManager};
|
||||
use tenant::secondary;
|
||||
use tracing::{info, info_span};
|
||||
@@ -51,7 +52,7 @@ use tracing::{info, info_span};
|
||||
/// backwards-compatible changes to the metadata format.
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 17;
|
||||
pub const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
|
||||
@@ -1053,6 +1053,15 @@ pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
pub(crate) static TIMELINE_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_timeline_states_count",
|
||||
"Count of timelines per state",
|
||||
&["state"]
|
||||
)
|
||||
.expect("Failed to register pageserver_timeline_states_count metric")
|
||||
});
|
||||
|
||||
/// A set of broken tenants.
|
||||
///
|
||||
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
|
||||
@@ -1718,12 +1727,7 @@ impl Drop for SmgrOpTimer {
|
||||
|
||||
impl SmgrOpFlushInProgress {
|
||||
/// The caller must guarantee that `socket_fd`` outlives this function.
|
||||
pub(crate) async fn measure<Fut, O>(
|
||||
self,
|
||||
started_at: Instant,
|
||||
mut fut: Fut,
|
||||
socket_fd: RawFd,
|
||||
) -> O
|
||||
pub(crate) async fn measure<Fut, O>(self, started_at: Instant, fut: Fut, socket_fd: RawFd) -> O
|
||||
where
|
||||
Fut: std::future::Future<Output = O>,
|
||||
{
|
||||
@@ -3325,6 +3329,8 @@ impl TimelineMetrics {
|
||||
&timeline_id,
|
||||
);
|
||||
|
||||
TIMELINE_STATE_METRIC.with_label_values(&["active"]).inc();
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
shard_id,
|
||||
@@ -3415,7 +3421,7 @@ impl TimelineMetrics {
|
||||
pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
|
||||
assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
|
||||
let labels = self.make_frozen_layer_labels(layer);
|
||||
let size = layer.try_len().expect("frozen layer should have no writer");
|
||||
let size = layer.len();
|
||||
TIMELINE_LAYER_COUNT
|
||||
.get_metric_with_label_values(&labels)
|
||||
.unwrap()
|
||||
@@ -3430,7 +3436,7 @@ impl TimelineMetrics {
|
||||
pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
|
||||
assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
|
||||
let labels = self.make_frozen_layer_labels(layer);
|
||||
let size = layer.try_len().expect("frozen layer should have no writer");
|
||||
let size = layer.len();
|
||||
TIMELINE_LAYER_COUNT
|
||||
.get_metric_with_label_values(&labels)
|
||||
.unwrap()
|
||||
@@ -3479,6 +3485,8 @@ impl TimelineMetrics {
|
||||
return;
|
||||
}
|
||||
|
||||
TIMELINE_STATE_METRIC.with_label_values(&["active"]).dec();
|
||||
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
@@ -4415,24 +4423,30 @@ pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_basebackup_cache_entries_total",
|
||||
"Number of entries in the basebackup cache"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// FIXME: Support basebackup cache size metrics.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_basebackup_cache_size_bytes",
|
||||
"Total size of all basebackup cache entries on disk in bytes"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_basebackup_cache_prepare_queue_size",
|
||||
"Number of requests in the basebackup prepare channel"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_config_ignored_items",
|
||||
|
||||
@@ -13,8 +13,7 @@ use std::time::{Duration, Instant, SystemTime};
|
||||
use std::{io, str};
|
||||
|
||||
use anyhow::{Context as _, anyhow, bail};
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::{Buf, BytesMut};
|
||||
use bytes::{Buf as _, BufMut as _, BytesMut};
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, Stream};
|
||||
use itertools::Itertools;
|
||||
@@ -25,12 +24,13 @@ use pageserver_api::config::{
|
||||
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
|
||||
};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::models::{
|
||||
self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
use pageserver_api::models::{PageTraceEvent, TenantState};
|
||||
use pageserver_api::pagestream_api::{
|
||||
self, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
|
||||
PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
PagestreamProtocolVersion, PagestreamRequest, TenantState,
|
||||
PagestreamProtocolVersion, PagestreamRequest,
|
||||
};
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -40,7 +40,7 @@ use postgres_backend::{
|
||||
AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error,
|
||||
};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi_types::constants::DEFAULTTABLESPACE_OID;
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
|
||||
use smallvec::{SmallVec, smallvec};
|
||||
@@ -62,7 +62,6 @@ use utils::{failpoint_support, span_record};
|
||||
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup::{self, BasebackupError};
|
||||
use crate::basebackup_cache::BasebackupCache;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
@@ -137,7 +136,6 @@ pub fn spawn(
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
@@ -159,7 +157,6 @@ pub fn spawn(
|
||||
conf.pg_auth_type,
|
||||
tls_config,
|
||||
conf.page_service_pipelining.clone(),
|
||||
basebackup_cache,
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
)
|
||||
@@ -169,99 +166,6 @@ pub fn spawn(
|
||||
Listener { cancel, task }
|
||||
}
|
||||
|
||||
/// Spawns a gRPC server for the page service.
|
||||
///
|
||||
/// TODO: move this onto GrpcPageServiceHandler::spawn().
|
||||
/// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we
|
||||
/// need to reimplement the TCP+TLS accept loop ourselves.
|
||||
pub fn spawn_grpc(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
listener: std::net::TcpListener,
|
||||
) -> anyhow::Result<CancellableTask> {
|
||||
let cancel = CancellationToken::new();
|
||||
let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.perf_span_dispatch(perf_trace_dispatch)
|
||||
.detached_child();
|
||||
let gate = Gate::default();
|
||||
|
||||
// Set up the TCP socket. We take a preconfigured TcpListener to bind the
|
||||
// port early during startup.
|
||||
let incoming = {
|
||||
let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std
|
||||
listener.set_nonblocking(true)?;
|
||||
tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std(listener)?)
|
||||
.with_nodelay(Some(GRPC_TCP_NODELAY))
|
||||
.with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME))
|
||||
};
|
||||
|
||||
// Set up the gRPC server.
|
||||
//
|
||||
// TODO: consider tuning window sizes.
|
||||
let mut server = tonic::transport::Server::builder()
|
||||
.http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL))
|
||||
.http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT))
|
||||
.max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS));
|
||||
|
||||
// Main page service stack. Uses a mix of Tonic interceptors and Tower layers:
|
||||
//
|
||||
// * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service.
|
||||
//
|
||||
// * Layers: allow async code, can run code after the service response. However, only has access
|
||||
// to the raw HTTP request/response, not the gRPC types.
|
||||
let page_service_handler = GrpcPageServiceHandler {
|
||||
tenant_manager,
|
||||
ctx,
|
||||
gate_guard: gate.enter().expect("gate was just created"),
|
||||
get_vectored_concurrent_io,
|
||||
};
|
||||
|
||||
let observability_layer = ObservabilityLayer;
|
||||
let mut tenant_interceptor = TenantMetadataInterceptor;
|
||||
let mut auth_interceptor = TenantAuthInterceptor::new(auth);
|
||||
|
||||
let page_service = tower::ServiceBuilder::new()
|
||||
// Create tracing span and record request start time.
|
||||
.layer(observability_layer)
|
||||
// Intercept gRPC requests.
|
||||
.layer(tonic::service::InterceptorLayer::new(move |mut req| {
|
||||
// Extract tenant metadata.
|
||||
req = tenant_interceptor.call(req)?;
|
||||
// Authenticate tenant JWT token.
|
||||
req = auth_interceptor.call(req)?;
|
||||
Ok(req)
|
||||
}))
|
||||
.service(proto::PageServiceServer::new(page_service_handler));
|
||||
let server = server.add_service(page_service);
|
||||
|
||||
// Reflection service for use with e.g. grpcurl.
|
||||
let reflection_service = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET)
|
||||
.build_v1()?;
|
||||
let server = server.add_service(reflection_service);
|
||||
|
||||
// Spawn server task.
|
||||
let task_cancel = cancel.clone();
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"grpc listener",
|
||||
async move {
|
||||
let result = server
|
||||
.serve_with_incoming_shutdown(incoming, task_cancel.cancelled())
|
||||
.await;
|
||||
if result.is_ok() {
|
||||
// TODO: revisit shutdown logic once page service is implemented.
|
||||
gate.close().await;
|
||||
}
|
||||
result
|
||||
},
|
||||
));
|
||||
|
||||
Ok(CancellableTask { task, cancel })
|
||||
}
|
||||
|
||||
impl Listener {
|
||||
pub async fn stop_accepting(self) -> Connections {
|
||||
self.cancel.cancel();
|
||||
@@ -311,7 +215,6 @@ pub async fn libpq_listener_main(
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
) -> Connections {
|
||||
@@ -355,7 +258,6 @@ pub async fn libpq_listener_main(
|
||||
auth_type,
|
||||
tls_config.clone(),
|
||||
pipelining_config.clone(),
|
||||
Arc::clone(&basebackup_cache),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
gate_guard,
|
||||
@@ -398,7 +300,6 @@ async fn page_service_conn_main(
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
@@ -464,7 +365,6 @@ async fn page_service_conn_main(
|
||||
pipelining_config,
|
||||
conf.get_vectored_concurrent_io,
|
||||
perf_span_fields,
|
||||
basebackup_cache,
|
||||
connection_ctx,
|
||||
cancel.clone(),
|
||||
gate_guard,
|
||||
@@ -484,16 +384,14 @@ async fn page_service_conn_main(
|
||||
} else {
|
||||
let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id();
|
||||
Err(io_error).context(format!(
|
||||
"Postgres connection error for tenant_id={:?} client at peer_addr={}",
|
||||
tenant_id, peer_addr
|
||||
"Postgres connection error for tenant_id={tenant_id:?} client at peer_addr={peer_addr}"
|
||||
))
|
||||
}
|
||||
}
|
||||
other => {
|
||||
let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id();
|
||||
other.context(format!(
|
||||
"Postgres query error for tenant_id={:?} client peer_addr={}",
|
||||
tenant_id, peer_addr
|
||||
"Postgres query error for tenant_id={tenant_id:?} client peer_addr={peer_addr}"
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -520,8 +418,6 @@ struct PageServerHandler {
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
gate_guard: GateGuard,
|
||||
}
|
||||
|
||||
@@ -716,60 +612,6 @@ enum PageStreamError {
|
||||
BadRequest(Cow<'static, str>),
|
||||
}
|
||||
|
||||
impl PageStreamError {
|
||||
/// Converts a PageStreamError into a proto::GetPageResponse with the appropriate status
|
||||
/// code, or a gRPC status if it should terminate the stream (e.g. shutdown). This is a
|
||||
/// convenience method for use from a get_pages gRPC stream.
|
||||
#[allow(clippy::result_large_err)]
|
||||
fn into_get_page_response(
|
||||
self,
|
||||
request_id: page_api::RequestID,
|
||||
) -> Result<proto::GetPageResponse, tonic::Status> {
|
||||
use page_api::GetPageStatusCode;
|
||||
use tonic::Code;
|
||||
|
||||
// We dispatch to Into<tonic::Status> first, and then map it to a GetPageResponse.
|
||||
let status: tonic::Status = self.into();
|
||||
let status_code = match status.code() {
|
||||
// We shouldn't see an OK status here, because we're emitting an error.
|
||||
Code::Ok => {
|
||||
debug_assert_ne!(status.code(), Code::Ok);
|
||||
return Err(tonic::Status::internal(format!(
|
||||
"unexpected OK status: {status:?}",
|
||||
)));
|
||||
}
|
||||
|
||||
// These are per-request errors, returned as GetPageResponses.
|
||||
Code::AlreadyExists => GetPageStatusCode::InvalidRequest,
|
||||
Code::DataLoss => GetPageStatusCode::InternalError,
|
||||
Code::FailedPrecondition => GetPageStatusCode::InvalidRequest,
|
||||
Code::InvalidArgument => GetPageStatusCode::InvalidRequest,
|
||||
Code::Internal => GetPageStatusCode::InternalError,
|
||||
Code::NotFound => GetPageStatusCode::NotFound,
|
||||
Code::OutOfRange => GetPageStatusCode::InvalidRequest,
|
||||
Code::ResourceExhausted => GetPageStatusCode::SlowDown,
|
||||
|
||||
// These should terminate the stream.
|
||||
Code::Aborted => return Err(status),
|
||||
Code::Cancelled => return Err(status),
|
||||
Code::DeadlineExceeded => return Err(status),
|
||||
Code::PermissionDenied => return Err(status),
|
||||
Code::Unauthenticated => return Err(status),
|
||||
Code::Unavailable => return Err(status),
|
||||
Code::Unimplemented => return Err(status),
|
||||
Code::Unknown => return Err(status),
|
||||
};
|
||||
|
||||
Ok(page_api::GetPageResponse {
|
||||
request_id,
|
||||
status_code,
|
||||
reason: Some(status.message().to_string()),
|
||||
page_images: Vec::new(),
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PageStreamError> for tonic::Status {
|
||||
fn from(err: PageStreamError) -> Self {
|
||||
use tonic::Code;
|
||||
@@ -859,7 +701,7 @@ struct BatchedGetPageRequest {
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
struct BatchedTestRequest {
|
||||
req: models::PagestreamTestRequest,
|
||||
req: pagestream_api::PagestreamTestRequest,
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
@@ -873,13 +715,13 @@ enum BatchedFeMessage {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: WeakHandle<TenantManagerTypes>,
|
||||
req: models::PagestreamExistsRequest,
|
||||
req: PagestreamExistsRequest,
|
||||
},
|
||||
Nblocks {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: WeakHandle<TenantManagerTypes>,
|
||||
req: models::PagestreamNblocksRequest,
|
||||
req: PagestreamNblocksRequest,
|
||||
},
|
||||
GetPage {
|
||||
span: Span,
|
||||
@@ -891,13 +733,13 @@ enum BatchedFeMessage {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: WeakHandle<TenantManagerTypes>,
|
||||
req: models::PagestreamDbSizeRequest,
|
||||
req: PagestreamDbSizeRequest,
|
||||
},
|
||||
GetSlruSegment {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: WeakHandle<TenantManagerTypes>,
|
||||
req: models::PagestreamGetSlruSegmentRequest,
|
||||
req: PagestreamGetSlruSegmentRequest,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
Test {
|
||||
@@ -1061,7 +903,6 @@ impl PageServerHandler {
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
perf_span_fields: ConnectionPerfSpanFields,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
@@ -1075,7 +916,6 @@ impl PageServerHandler {
|
||||
cancel,
|
||||
pipelining_config,
|
||||
get_vectored_concurrent_io,
|
||||
basebackup_cache,
|
||||
gate_guard,
|
||||
}
|
||||
}
|
||||
@@ -2286,8 +2126,7 @@ impl PageServerHandler {
|
||||
if request_lsn < not_modified_since {
|
||||
return Err(PageStreamError::BadRequest(
|
||||
format!(
|
||||
"invalid request with request LSN {} and not_modified_since {}",
|
||||
request_lsn, not_modified_since,
|
||||
"invalid request with request LSN {request_lsn} and not_modified_since {not_modified_since}",
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
@@ -2590,10 +2429,9 @@ impl PageServerHandler {
|
||||
.map(|(req, res)| {
|
||||
res.map(|page| {
|
||||
(
|
||||
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
|
||||
req: req.req,
|
||||
page,
|
||||
}),
|
||||
PagestreamBeMessage::GetPage(
|
||||
pagestream_api::PagestreamGetPageResponse { req: req.req, page },
|
||||
),
|
||||
req.timer,
|
||||
req.ctx,
|
||||
)
|
||||
@@ -2660,7 +2498,7 @@ impl PageServerHandler {
|
||||
.map(|(req, res)| {
|
||||
res.map(|()| {
|
||||
(
|
||||
PagestreamBeMessage::Test(models::PagestreamTestResponse {
|
||||
PagestreamBeMessage::Test(pagestream_api::PagestreamTestResponse {
|
||||
req: req.req.clone(),
|
||||
}),
|
||||
req.timer,
|
||||
@@ -2763,6 +2601,7 @@ impl PageServerHandler {
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
None,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -2776,9 +2615,7 @@ impl PageServerHandler {
|
||||
&& lsn.is_some()
|
||||
&& prev_lsn.is_none()
|
||||
{
|
||||
self.basebackup_cache
|
||||
.get(tenant_id, timeline_id, lsn.unwrap())
|
||||
.await
|
||||
timeline.get_cached_basebackup(lsn.unwrap()).await
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -2791,31 +2628,6 @@ impl PageServerHandler {
|
||||
.map_err(|err| {
|
||||
BasebackupError::Client(err, "handle_basebackup_request,cached,copy")
|
||||
})?;
|
||||
} else if gzip {
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
&mut writer,
|
||||
// NOTE using fast compression because it's on the critical path
|
||||
// for compute startup. For an empty database, we get
|
||||
// <100KB with this method. The Level::Best compression method
|
||||
// gives us <20KB, but maybe we should add basebackup caching
|
||||
// on compute shutdown first.
|
||||
async_compression::Level::Fastest,
|
||||
);
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut encoder,
|
||||
&timeline,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
encoder
|
||||
.shutdown()
|
||||
.await
|
||||
.map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
|
||||
} else {
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut writer,
|
||||
@@ -2824,6 +2636,11 @@ impl PageServerHandler {
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
// NB: using fast compression because it's on the critical path for compute
|
||||
// startup. For an empty database, we get <100KB with this method. The
|
||||
// Level::Best compression method gives us <20KB, but maybe we should add
|
||||
// basebackup caching on compute shutdown first.
|
||||
gzip.then_some(async_compression::Level::Fastest),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -3366,6 +3183,108 @@ pub struct GrpcPageServiceHandler {
|
||||
}
|
||||
|
||||
impl GrpcPageServiceHandler {
|
||||
/// Spawns a gRPC server for the page service.
|
||||
///
|
||||
/// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we
|
||||
/// need to reimplement the TCP+TLS accept loop ourselves.
|
||||
pub fn spawn(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
listener: std::net::TcpListener,
|
||||
) -> anyhow::Result<CancellableTask> {
|
||||
let cancel = CancellationToken::new();
|
||||
let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.perf_span_dispatch(perf_trace_dispatch)
|
||||
.detached_child();
|
||||
let gate = Gate::default();
|
||||
|
||||
// Set up the TCP socket. We take a preconfigured TcpListener to bind the
|
||||
// port early during startup.
|
||||
let incoming = {
|
||||
let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std
|
||||
listener.set_nonblocking(true)?;
|
||||
tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std(
|
||||
listener,
|
||||
)?)
|
||||
.with_nodelay(Some(GRPC_TCP_NODELAY))
|
||||
.with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME))
|
||||
};
|
||||
|
||||
// Set up the gRPC server.
|
||||
//
|
||||
// TODO: consider tuning window sizes.
|
||||
let mut server = tonic::transport::Server::builder()
|
||||
.http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL))
|
||||
.http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT))
|
||||
.max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS));
|
||||
|
||||
// Main page service stack. Uses a mix of Tonic interceptors and Tower layers:
|
||||
//
|
||||
// * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service.
|
||||
//
|
||||
// * Layers: allow async code, can run code after the service response. However, only has access
|
||||
// to the raw HTTP request/response, not the gRPC types.
|
||||
let page_service_handler = GrpcPageServiceHandler {
|
||||
tenant_manager,
|
||||
ctx,
|
||||
gate_guard: gate.enter().expect("gate was just created"),
|
||||
get_vectored_concurrent_io,
|
||||
};
|
||||
|
||||
let observability_layer = ObservabilityLayer;
|
||||
let mut tenant_interceptor = TenantMetadataInterceptor;
|
||||
let mut auth_interceptor = TenantAuthInterceptor::new(auth);
|
||||
|
||||
let page_service = tower::ServiceBuilder::new()
|
||||
// Create tracing span and record request start time.
|
||||
.layer(observability_layer)
|
||||
// Intercept gRPC requests.
|
||||
.layer(tonic::service::InterceptorLayer::new(move |mut req| {
|
||||
// Extract tenant metadata.
|
||||
req = tenant_interceptor.call(req)?;
|
||||
// Authenticate tenant JWT token.
|
||||
req = auth_interceptor.call(req)?;
|
||||
Ok(req)
|
||||
}))
|
||||
// Run the page service.
|
||||
.service(
|
||||
proto::PageServiceServer::new(page_service_handler)
|
||||
// Support both gzip and zstd compression. The client decides what to use.
|
||||
.accept_compressed(tonic::codec::CompressionEncoding::Gzip)
|
||||
.accept_compressed(tonic::codec::CompressionEncoding::Zstd)
|
||||
.send_compressed(tonic::codec::CompressionEncoding::Gzip)
|
||||
.send_compressed(tonic::codec::CompressionEncoding::Zstd),
|
||||
);
|
||||
let server = server.add_service(page_service);
|
||||
|
||||
// Reflection service for use with e.g. grpcurl.
|
||||
let reflection_service = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET)
|
||||
.build_v1()?;
|
||||
let server = server.add_service(reflection_service);
|
||||
|
||||
// Spawn server task.
|
||||
let task_cancel = cancel.clone();
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"grpc listener",
|
||||
async move {
|
||||
let result = server
|
||||
.serve_with_incoming_shutdown(incoming, task_cancel.cancelled())
|
||||
.await;
|
||||
if result.is_ok() {
|
||||
// TODO: revisit shutdown logic once page service is implemented.
|
||||
gate.close().await;
|
||||
}
|
||||
result
|
||||
},
|
||||
));
|
||||
|
||||
Ok(CancellableTask { task, cancel })
|
||||
}
|
||||
|
||||
/// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of
|
||||
/// relations and their sizes, as well as SLRU segments and similar data.
|
||||
#[allow(clippy::result_large_err)]
|
||||
@@ -3436,8 +3355,8 @@ impl GrpcPageServiceHandler {
|
||||
|
||||
/// Processes a GetPage batch request, via the GetPages bidirectional streaming RPC.
|
||||
///
|
||||
/// NB: errors will terminate the stream. Per-request errors should return a GetPageResponse
|
||||
/// with an appropriate status code instead.
|
||||
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
|
||||
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
|
||||
///
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
@@ -3454,7 +3373,7 @@ impl GrpcPageServiceHandler {
|
||||
let ctx = ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
let req: page_api::GetPageRequest = req.try_into()?;
|
||||
let req = page_api::GetPageRequest::try_from(req)?;
|
||||
|
||||
span_record!(
|
||||
req_id = %req.request_id,
|
||||
@@ -3465,7 +3384,7 @@ impl GrpcPageServiceHandler {
|
||||
);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
|
||||
let effective_lsn = match PageServerHandler::effective_request_lsn(
|
||||
let effective_lsn = PageServerHandler::effective_request_lsn(
|
||||
&timeline,
|
||||
timeline.get_last_record_lsn(),
|
||||
req.read_lsn.request_lsn,
|
||||
@@ -3473,10 +3392,7 @@ impl GrpcPageServiceHandler {
|
||||
.not_modified_since_lsn
|
||||
.unwrap_or(req.read_lsn.request_lsn),
|
||||
&latest_gc_cutoff_lsn,
|
||||
) {
|
||||
Ok(lsn) => lsn,
|
||||
Err(err) => return err.into_get_page_response(req.request_id),
|
||||
};
|
||||
)?;
|
||||
|
||||
let mut batch = SmallVec::with_capacity(req.block_numbers.len());
|
||||
for blkno in req.block_numbers {
|
||||
@@ -3533,7 +3449,7 @@ impl GrpcPageServiceHandler {
|
||||
"unexpected response: {resp:?}"
|
||||
)));
|
||||
}
|
||||
Err(err) => return err.err.into_get_page_response(req.request_id),
|
||||
Err(err) => return Err(err.err.into()),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -3587,57 +3503,66 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
Ok(tonic::Response::new(resp.into()))
|
||||
}
|
||||
|
||||
// TODO: ensure clients use gzip compression for the stream.
|
||||
#[instrument(skip_all, fields(lsn))]
|
||||
async fn get_base_backup(
|
||||
&self,
|
||||
req: tonic::Request<proto::GetBaseBackupRequest>,
|
||||
) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
|
||||
// Send 64 KB chunks to avoid large memory allocations.
|
||||
const CHUNK_SIZE: usize = 64 * 1024;
|
||||
// Send chunks of 256 KB to avoid large memory allocations. pagebench basebackup shows this
|
||||
// to be the sweet spot where throughput is saturated.
|
||||
const CHUNK_SIZE: usize = 256 * 1024;
|
||||
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and wait for the LSN to arrive.
|
||||
//
|
||||
// TODO: this requires a read LSN, is that ok?
|
||||
// Validate the request and decorate the span.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
if timeline.is_archived() == Some(true) {
|
||||
return Err(tonic::Status::failed_precondition("timeline is archived"));
|
||||
}
|
||||
let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;
|
||||
|
||||
span_record!(lsn=%req.read_lsn);
|
||||
span_record!(lsn=?req.lsn);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
timeline
|
||||
.wait_lsn(
|
||||
req.read_lsn.request_lsn,
|
||||
WaitLsnWaiter::PageService,
|
||||
WaitLsnTimeout::Default,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(req.read_lsn.request_lsn, &latest_gc_cutoff_lsn)
|
||||
.map_err(|err| {
|
||||
tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
|
||||
})?;
|
||||
// Wait for the LSN to arrive, if given.
|
||||
if let Some(lsn) = req.lsn {
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
timeline
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
WaitLsnWaiter::PageService,
|
||||
WaitLsnTimeout::Default,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.map_err(|err| {
|
||||
tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
|
||||
})?;
|
||||
}
|
||||
|
||||
// Spawn a task to run the basebackup.
|
||||
//
|
||||
// TODO: do we need to support full base backups, for debugging?
|
||||
let span = Span::current();
|
||||
let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
|
||||
let jh = tokio::spawn(async move {
|
||||
let gzip_level = match req.compression {
|
||||
page_api::BaseBackupCompression::None => None,
|
||||
// NB: using fast compression because it's on the critical path for compute
|
||||
// startup. For an empty database, we get <100KB with this method. The
|
||||
// Level::Best compression method gives us <20KB, but maybe we should add
|
||||
// basebackup caching on compute shutdown first.
|
||||
page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest),
|
||||
};
|
||||
|
||||
let result = basebackup::send_basebackup_tarball(
|
||||
&mut simplex_write,
|
||||
&timeline,
|
||||
Some(req.read_lsn.request_lsn),
|
||||
req.lsn,
|
||||
None,
|
||||
false,
|
||||
req.full,
|
||||
req.replica,
|
||||
gzip_level,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(span) // propagate request span
|
||||
@@ -3650,20 +3575,21 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
|
||||
// Emit chunks of size CHUNK_SIZE.
|
||||
let chunks = async_stream::try_stream! {
|
||||
let mut chunk = BytesMut::with_capacity(CHUNK_SIZE);
|
||||
loop {
|
||||
let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
|
||||
tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
|
||||
})?;
|
||||
|
||||
// If we read 0 bytes, either the chunk is full or the stream is closed.
|
||||
if n == 0 {
|
||||
if chunk.is_empty() {
|
||||
break;
|
||||
let mut chunk = BytesMut::with_capacity(CHUNK_SIZE).limit(CHUNK_SIZE);
|
||||
loop {
|
||||
let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
|
||||
tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
|
||||
})?;
|
||||
if n == 0 {
|
||||
break; // full chunk or closed stream
|
||||
}
|
||||
yield proto::GetBaseBackupResponseChunk::from(chunk.clone().freeze());
|
||||
chunk.clear();
|
||||
}
|
||||
let chunk = chunk.into_inner().freeze();
|
||||
if chunk.is_empty() {
|
||||
break;
|
||||
}
|
||||
yield proto::GetBaseBackupResponseChunk::from(chunk);
|
||||
}
|
||||
// Wait for the basebackup task to exit and check for errors.
|
||||
jh.await.map_err(|err| {
|
||||
@@ -3740,9 +3666,16 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
.await?
|
||||
.downgrade();
|
||||
while let Some(req) = reqs.message().await? {
|
||||
yield Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
|
||||
let req_id = req.request_id;
|
||||
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await?
|
||||
.await;
|
||||
yield match result {
|
||||
Ok(resp) => resp,
|
||||
// Convert per-request errors to GetPageResponses as appropriate, or terminate
|
||||
// the stream with a tonic::Status.
|
||||
Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -23,12 +23,11 @@ use pageserver_api::key::{
|
||||
};
|
||||
use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::value::Value;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId};
|
||||
use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId};
|
||||
use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi_types::{Oid, RepOriginId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -36,6 +35,8 @@ use tracing::{debug, info, info_span, trace, warn};
|
||||
use utils::bin_ser::{BeSer, DeserializeError};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pausable_failpoint;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
use wal_decoder::models::value::Value;
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
@@ -720,6 +721,7 @@ impl Timeline {
|
||||
let batches = keyspace.partition(
|
||||
self.get_shard_identity(),
|
||||
self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
|
||||
BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
@@ -960,6 +962,7 @@ impl Timeline {
|
||||
let batches = keyspace.partition(
|
||||
self.get_shard_identity(),
|
||||
self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
|
||||
BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
@@ -1078,7 +1081,7 @@ impl Timeline {
|
||||
// fetch directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||
|
||||
if self.pg_version >= 17 {
|
||||
if self.pg_version >= PgMajorVersion::PG17 {
|
||||
Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
|
||||
} else {
|
||||
Ok(TwoPhaseDirectory::des(&buf)?
|
||||
@@ -1182,7 +1185,7 @@ impl Timeline {
|
||||
}
|
||||
let origin_id = k.field6 as RepOriginId;
|
||||
let origin_lsn = Lsn::des(&v)
|
||||
.with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?;
|
||||
.with_context(|| format!("decode replorigin value for {origin_id}: {v:?}"))?;
|
||||
if origin_lsn != Lsn::INVALID {
|
||||
result.insert(origin_id, origin_lsn);
|
||||
}
|
||||
@@ -1610,7 +1613,7 @@ impl DatadirModification<'_> {
|
||||
.push((DirectoryKind::Db, MetricsUpdate::Set(0)));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
let buf = if self.tline.pg_version >= 17 {
|
||||
let buf = if self.tline.pg_version >= PgMajorVersion::PG17 {
|
||||
TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
|
||||
xids: HashSet::new(),
|
||||
})
|
||||
@@ -1964,7 +1967,7 @@ impl DatadirModification<'_> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Add it to the directory entry
|
||||
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let newdirbuf = if self.tline.pg_version >= 17 {
|
||||
let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 {
|
||||
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
|
||||
@@ -2380,7 +2383,7 @@ impl DatadirModification<'_> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Remove it from the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let newdirbuf = if self.tline.pg_version >= 17 {
|
||||
let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 {
|
||||
let mut dir = TwoPhaseDirectoryV17::des(&buf)?;
|
||||
|
||||
if !dir.xids.remove(&xid) {
|
||||
@@ -2437,8 +2440,7 @@ impl DatadirModification<'_> {
|
||||
if path == p {
|
||||
assert!(
|
||||
modifying_file.is_none(),
|
||||
"duplicated entries found for {}",
|
||||
path
|
||||
"duplicated entries found for {path}"
|
||||
);
|
||||
modifying_file = Some(content);
|
||||
} else {
|
||||
|
||||
@@ -38,6 +38,7 @@ use pageserver_api::models::{
|
||||
WalRedoManagerStatus,
|
||||
};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel};
|
||||
use remote_timeline_client::index::GcCompactionState;
|
||||
use remote_timeline_client::manifest::{
|
||||
@@ -51,6 +52,7 @@ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
|
||||
use timeline::import_pgdata::ImportingTimeline;
|
||||
use timeline::layer_manager::LayerManagerLockHolder;
|
||||
use timeline::offload::{OffloadError, offload_timeline};
|
||||
use timeline::{
|
||||
CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata,
|
||||
@@ -78,7 +80,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
|
||||
use self::timeline::{
|
||||
EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
|
||||
};
|
||||
use crate::basebackup_cache::BasebackupPrepareSender;
|
||||
use crate::basebackup_cache::BasebackupCache;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context;
|
||||
use crate::context::RequestContextBuilder;
|
||||
@@ -89,7 +91,8 @@ use crate::l0_flush::L0FlushGlobalState;
|
||||
use crate::metrics::{
|
||||
BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
|
||||
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, TIMELINE_STATE_METRIC,
|
||||
remove_tenant_metrics,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationMode;
|
||||
@@ -159,7 +162,7 @@ pub struct TenantSharedResources {
|
||||
pub remote_storage: GenericRemoteStorage,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
pub l0_flush_global_state: L0FlushGlobalState,
|
||||
pub basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
pub basebackup_cache: Arc<BasebackupCache>,
|
||||
pub feature_resolver: FeatureResolver,
|
||||
}
|
||||
|
||||
@@ -328,7 +331,7 @@ pub struct TenantShard {
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
/// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
@@ -494,8 +497,8 @@ impl WalRedoManager {
|
||||
key: pageserver_api::key::Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
records: Vec<(Lsn, wal_decoder::models::record::NeonWalRecord)>,
|
||||
pg_version: PgMajorVersion,
|
||||
redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
@@ -544,6 +547,28 @@ pub struct OffloadedTimeline {
|
||||
|
||||
/// Part of the `OffloadedTimeline` object's lifecycle: this needs to be set before we drop it
|
||||
pub deleted_from_ancestor: AtomicBool,
|
||||
|
||||
_metrics_guard: OffloadedTimelineMetricsGuard,
|
||||
}
|
||||
|
||||
/// Increases the offloaded timeline count metric when created, and decreases when dropped.
|
||||
struct OffloadedTimelineMetricsGuard;
|
||||
|
||||
impl OffloadedTimelineMetricsGuard {
|
||||
fn new() -> Self {
|
||||
TIMELINE_STATE_METRIC
|
||||
.with_label_values(&["offloaded"])
|
||||
.inc();
|
||||
Self
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for OffloadedTimelineMetricsGuard {
|
||||
fn drop(&mut self) {
|
||||
TIMELINE_STATE_METRIC
|
||||
.with_label_values(&["offloaded"])
|
||||
.dec();
|
||||
}
|
||||
}
|
||||
|
||||
impl OffloadedTimeline {
|
||||
@@ -576,6 +601,8 @@ impl OffloadedTimeline {
|
||||
|
||||
delete_progress: timeline.delete_progress.clone(),
|
||||
deleted_from_ancestor: AtomicBool::new(false),
|
||||
|
||||
_metrics_guard: OffloadedTimelineMetricsGuard::new(),
|
||||
})
|
||||
}
|
||||
fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self {
|
||||
@@ -595,6 +622,7 @@ impl OffloadedTimeline {
|
||||
archived_at,
|
||||
delete_progress: TimelineDeleteProgress::default(),
|
||||
deleted_from_ancestor: AtomicBool::new(false),
|
||||
_metrics_guard: OffloadedTimelineMetricsGuard::new(),
|
||||
}
|
||||
}
|
||||
fn manifest(&self) -> OffloadedTimelineManifest {
|
||||
@@ -906,7 +934,7 @@ pub(crate) enum CreateTimelineParams {
|
||||
pub(crate) struct CreateTimelineParamsBootstrap {
|
||||
pub(crate) new_timeline_id: TimelineId,
|
||||
pub(crate) existing_initdb_timeline_id: Option<TimelineId>,
|
||||
pub(crate) pg_version: u32,
|
||||
pub(crate) pg_version: PgMajorVersion,
|
||||
}
|
||||
|
||||
/// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here.
|
||||
@@ -944,7 +972,7 @@ pub(crate) enum CreateTimelineIdempotency {
|
||||
/// NB: special treatment, see comment in [`Self`].
|
||||
FailWithConflict,
|
||||
Bootstrap {
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
},
|
||||
/// NB: branches always have the same `pg_version` as their ancestor.
|
||||
/// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
|
||||
@@ -1289,7 +1317,7 @@ impl TenantShard {
|
||||
ancestor.is_some()
|
||||
|| timeline
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::LoadLayerMap)
|
||||
.await
|
||||
.layer_map()
|
||||
.expect(
|
||||
@@ -1335,7 +1363,7 @@ impl TenantShard {
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
basebackup_cache,
|
||||
feature_resolver,
|
||||
} = resources;
|
||||
|
||||
@@ -1352,7 +1380,7 @@ impl TenantShard {
|
||||
remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
basebackup_cache,
|
||||
feature_resolver,
|
||||
));
|
||||
|
||||
@@ -1832,6 +1860,29 @@ impl TenantShard {
|
||||
}
|
||||
}
|
||||
|
||||
// At this point we've initialized all timelines and are tracking them.
|
||||
// Now compute the layer visibility for all (not offloaded) timelines.
|
||||
let compute_visiblity_for = {
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let mut timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap();
|
||||
|
||||
timelines_offloaded_accessor.extend(offloaded_timelines_list.into_iter());
|
||||
|
||||
// Before activation, populate each Timeline's GcInfo with information about its children
|
||||
self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None);
|
||||
|
||||
timelines_accessor.values().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
for tl in compute_visiblity_for {
|
||||
tl.update_layer_visibility().await.with_context(|| {
|
||||
format!(
|
||||
"failed initial timeline visibility computation {} for tenant {}",
|
||||
tl.timeline_id, self.tenant_shard_id
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
// Walk through deleted timelines, resume deletion
|
||||
for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
|
||||
remote_timeline_client
|
||||
@@ -1851,10 +1902,6 @@ impl TenantShard {
|
||||
.context("resume_deletion")
|
||||
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
|
||||
}
|
||||
{
|
||||
let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
|
||||
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
|
||||
}
|
||||
|
||||
// Stash the preloaded tenant manifest, and upload a new manifest if changed.
|
||||
//
|
||||
@@ -2495,7 +2542,7 @@ impl TenantShard {
|
||||
self: &Arc<Self>,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(UninitializedTimeline, RequestContext)> {
|
||||
anyhow::ensure!(
|
||||
@@ -2547,7 +2594,7 @@ impl TenantShard {
|
||||
self: &Arc<Self>,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let (uninit_tl, ctx) = self
|
||||
@@ -2586,7 +2633,7 @@ impl TenantShard {
|
||||
self: &Arc<Self>,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
ctx: &RequestContext,
|
||||
in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
|
||||
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
|
||||
@@ -2617,7 +2664,7 @@ impl TenantShard {
|
||||
}
|
||||
let layer_names = tline
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.layer_map()
|
||||
.unwrap()
|
||||
@@ -2852,7 +2899,7 @@ impl TenantShard {
|
||||
Lsn(0),
|
||||
initdb_lsn,
|
||||
initdb_lsn,
|
||||
15,
|
||||
PgMajorVersion::PG15,
|
||||
);
|
||||
this.prepare_new_timeline(
|
||||
new_timeline_id,
|
||||
@@ -3132,7 +3179,12 @@ impl TenantShard {
|
||||
|
||||
for timeline in &compact {
|
||||
// Collect L0 counts. Can't await while holding lock above.
|
||||
if let Ok(lm) = timeline.layers.read().await.layer_map() {
|
||||
if let Ok(lm) = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Compaction)
|
||||
.await
|
||||
.layer_map()
|
||||
{
|
||||
l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
|
||||
}
|
||||
}
|
||||
@@ -3398,7 +3450,7 @@ impl TenantShard {
|
||||
use pageserver_api::models::ActivatingFrom;
|
||||
match &*current_state {
|
||||
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
|
||||
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {current_state:?}");
|
||||
}
|
||||
TenantState::Attaching => {
|
||||
*current_state = TenantState::Activating(ActivatingFrom::Attaching);
|
||||
@@ -3417,9 +3469,6 @@ impl TenantShard {
|
||||
.values()
|
||||
.filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));
|
||||
|
||||
// Before activation, populate each Timeline's GcInfo with information about its children
|
||||
self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None);
|
||||
|
||||
// Spawn gc and compaction loops. The loops will shut themselves
|
||||
// down when they notice that the tenant is inactive.
|
||||
tasks::start_background_loops(self, background_jobs_can_start);
|
||||
@@ -4331,7 +4380,7 @@ impl TenantShard {
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
feature_resolver: FeatureResolver,
|
||||
) -> TenantShard {
|
||||
assert!(!attached_conf.location.generation.is_none());
|
||||
@@ -4436,7 +4485,7 @@ impl TenantShard {
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
gc_block: Default::default(),
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
basebackup_cache,
|
||||
feature_resolver,
|
||||
}
|
||||
}
|
||||
@@ -4874,7 +4923,7 @@ impl TenantShard {
|
||||
}
|
||||
let layer_names = tline
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.layer_map()
|
||||
.unwrap()
|
||||
@@ -5042,7 +5091,7 @@ impl TenantShard {
|
||||
pub(crate) async fn bootstrap_timeline_test(
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
@@ -5184,7 +5233,7 @@ impl TenantShard {
|
||||
async fn bootstrap_timeline(
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CreateTimelineResult, CreateTimelineError> {
|
||||
@@ -5365,7 +5414,7 @@ impl TenantShard {
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
|
||||
basebackup_cache: self.basebackup_cache.clone(),
|
||||
feature_resolver: self.feature_resolver.clone(),
|
||||
}
|
||||
}
|
||||
@@ -5722,7 +5771,7 @@ impl TenantShard {
|
||||
async fn run_initdb(
|
||||
conf: &'static PageServerConf,
|
||||
initdb_target_dir: &Utf8Path,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), InitdbError> {
|
||||
let initdb_bin_path = conf
|
||||
@@ -5804,10 +5853,10 @@ pub(crate) mod harness {
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use utils::id::TenantId;
|
||||
use utils::logging;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use super::*;
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
@@ -5951,7 +6000,7 @@ pub(crate) mod harness {
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
|
||||
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
|
||||
|
||||
let tenant = Arc::new(TenantShard::new(
|
||||
TenantState::Attaching,
|
||||
@@ -5969,7 +6018,7 @@ pub(crate) mod harness {
|
||||
self.deletion_queue.new_client(),
|
||||
// TODO: ideally we should run all unit tests with both configs
|
||||
L0FlushGlobalState::new(L0FlushConfig::default()),
|
||||
basebackup_requst_sender,
|
||||
basebackup_cache,
|
||||
FeatureResolver::new_disabled(),
|
||||
));
|
||||
|
||||
@@ -6003,7 +6052,7 @@ pub(crate) mod harness {
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
_pg_version: PgMajorVersion,
|
||||
_redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<Bytes, walredo::Error> {
|
||||
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
|
||||
@@ -6062,9 +6111,6 @@ mod tests {
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::keyspace::KeySpaceRandomAccum;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
#[cfg(feature = "testing")]
|
||||
use rand::SeedableRng;
|
||||
@@ -6085,6 +6131,9 @@ mod tests {
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::{ShardCount, ShardNumber};
|
||||
#[cfg(feature = "testing")]
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use super::*;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
@@ -6175,7 +6224,7 @@ mod tests {
|
||||
async fn randomize_timeline(
|
||||
tenant: &Arc<TenantShard>,
|
||||
new_timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
spec: TestTimelineSpecification,
|
||||
random: &mut rand::rngs::StdRng,
|
||||
ctx: &RequestContext,
|
||||
@@ -6568,7 +6617,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {lsn}"))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -6578,7 +6627,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {lsn}"))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -6592,7 +6641,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {lsn}"))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -6602,7 +6651,7 @@ mod tests {
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("foo at {}", lsn))),
|
||||
&Value::Image(test_img(&format!("foo at {lsn}"))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -6944,7 +6993,7 @@ mod tests {
|
||||
.await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
|
||||
|
||||
let layer_map = tline.layers.read().await;
|
||||
let layer_map = tline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
let level0_deltas = layer_map
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
@@ -7101,7 +7150,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -7180,7 +7229,7 @@ mod tests {
|
||||
let lsn = Lsn(0x10);
|
||||
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
let guard = tline.layers.read().await;
|
||||
let guard = tline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
let lm = guard.layer_map()?;
|
||||
|
||||
lm.dump(true, &ctx).await?;
|
||||
@@ -7389,7 +7438,7 @@ mod tests {
|
||||
.put(
|
||||
gap_at_key,
|
||||
current_lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))),
|
||||
&Value::Image(test_img(&format!("{gap_at_key} at {current_lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -7428,7 +7477,7 @@ mod tests {
|
||||
.put(
|
||||
current_key,
|
||||
current_lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))),
|
||||
&Value::Image(test_img(&format!("{current_key} at {current_lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -7536,7 +7585,7 @@ mod tests {
|
||||
while key < end_key {
|
||||
current_lsn += 0x10;
|
||||
|
||||
let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
|
||||
let image_value = format!("{child_gap_at_key} at {current_lsn}");
|
||||
|
||||
let mut writer = parent_timeline.writer().await;
|
||||
writer
|
||||
@@ -7579,7 +7628,7 @@ mod tests {
|
||||
.put(
|
||||
key,
|
||||
current_lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
|
||||
&Value::Image(test_img(&format!("{key} at {current_lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -7700,7 +7749,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -7721,7 +7770,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -7735,7 +7784,7 @@ mod tests {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
test_img(&format!("{} at {}", blknum, last_lsn))
|
||||
test_img(&format!("{blknum} at {last_lsn}"))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -7781,7 +7830,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -7810,11 +7859,11 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
println!("updating {} at {}", blknum, lsn);
|
||||
println!("updating {blknum} at {lsn}");
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
updated[blknum] = lsn;
|
||||
@@ -7825,7 +7874,7 @@ mod tests {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
test_img(&format!("{} at {}", blknum, last_lsn))
|
||||
test_img(&format!("{blknum} at {last_lsn}"))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -7878,11 +7927,11 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{idx} {blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
println!("updating [{}][{}] at {}", idx, blknum, lsn);
|
||||
println!("updating [{idx}][{blknum}] at {lsn}");
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
updated[idx][blknum] = lsn;
|
||||
@@ -8088,7 +8137,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -8105,7 +8154,7 @@ mod tests {
|
||||
test_key.field6 = (blknum * STEP) as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn, &ctx).await?,
|
||||
test_img(&format!("{} at {}", blknum, last_lsn))
|
||||
test_img(&format!("{blknum} at {last_lsn}"))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -8142,7 +8191,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -8208,12 +8257,23 @@ mod tests {
|
||||
tline.freeze_and_flush().await?; // force create a delta layer
|
||||
}
|
||||
|
||||
let before_num_l0_delta_files =
|
||||
tline.layers.read().await.layer_map()?.level0_deltas().len();
|
||||
let before_num_l0_delta_files = tline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
.len();
|
||||
|
||||
tline.compact(&cancel, EnumSet::default(), &ctx).await?;
|
||||
|
||||
let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
|
||||
let after_num_l0_delta_files = tline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
.len();
|
||||
|
||||
assert!(
|
||||
after_num_l0_delta_files < before_num_l0_delta_files,
|
||||
@@ -8384,7 +8444,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -8404,7 +8464,7 @@ mod tests {
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
|
||||
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -9325,12 +9385,7 @@ mod tests {
|
||||
let end_lsn = Lsn(0x100);
|
||||
let image_layers = (0x20..=0x90)
|
||||
.step_by(0x10)
|
||||
.map(|n| {
|
||||
(
|
||||
Lsn(n),
|
||||
vec![(key, test_img(&format!("data key at {:x}", n)))],
|
||||
)
|
||||
})
|
||||
.map(|n| (Lsn(n), vec![(key, test_img(&format!("data key at {n:x}")))]))
|
||||
.collect();
|
||||
|
||||
let timeline = tenant
|
||||
|
||||
@@ -63,8 +63,7 @@ pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
|
||||
&& overlaps_with(&layer.key_range, &other_layer.key_range)
|
||||
{
|
||||
let err = format!(
|
||||
"layer violates the layer map LSN split assumption: layer {} intersects with layer {}",
|
||||
layer, other_layer
|
||||
"layer violates the layer map LSN split assumption: layer {layer} intersects with layer {other_layer}"
|
||||
);
|
||||
return Some(err);
|
||||
}
|
||||
|
||||
@@ -61,8 +61,10 @@ pub(crate) struct LocationConf {
|
||||
/// The detailed shard identity. This structure is already scoped within
|
||||
/// a TenantShardId, but we need the full ShardIdentity to enable calculating
|
||||
/// key->shard mappings.
|
||||
#[serde(default = "ShardIdentity::unsharded")]
|
||||
#[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
|
||||
///
|
||||
/// NB: we store this even for unsharded tenants, so that we agree with storcon on the intended
|
||||
/// stripe size. Otherwise, a split request that does not specify a stripe size may use a
|
||||
/// different default than storcon, which can lead to incorrect stripe sizes and corruption.
|
||||
pub(crate) shard: ShardIdentity,
|
||||
|
||||
/// The pan-cluster tenant configuration, the same on all locations
|
||||
@@ -149,7 +151,12 @@ impl LocationConf {
|
||||
/// For use when attaching/re-attaching: update the generation stored in this
|
||||
/// structure. If we were in a secondary state, promote to attached (posession
|
||||
/// of a fresh generation implies this).
|
||||
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
|
||||
pub(crate) fn attach_in_generation(
|
||||
&mut self,
|
||||
mode: AttachmentMode,
|
||||
generation: Generation,
|
||||
stripe_size: ShardStripeSize,
|
||||
) {
|
||||
match &mut self.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
attach_conf.generation = generation;
|
||||
@@ -163,6 +170,8 @@ impl LocationConf {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
self.shard.stripe_size = stripe_size;
|
||||
}
|
||||
|
||||
pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
use std::io;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use num_traits::Num;
|
||||
@@ -18,6 +18,7 @@ use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache;
|
||||
use crate::tenant::storage_layer::inmemory_layer::GlobalResourceUnits;
|
||||
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
|
||||
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
|
||||
@@ -30,9 +31,13 @@ pub struct EphemeralFile {
|
||||
_tenant_shard_id: TenantShardId,
|
||||
_timeline_id: TimelineId,
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
bytes_written: u64,
|
||||
file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter,
|
||||
buffered_writer: BufferedWriter,
|
||||
|
||||
buffered_writer: tokio::sync::RwLock<BufferedWriter>,
|
||||
|
||||
bytes_written: AtomicU64,
|
||||
|
||||
resource_units: std::sync::Mutex<GlobalResourceUnits>,
|
||||
}
|
||||
|
||||
type BufferedWriter = owned_buffers_io::write::BufferedWriter<
|
||||
@@ -94,9 +99,8 @@ impl EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
page_cache_file_id,
|
||||
bytes_written: 0,
|
||||
file: file.clone(),
|
||||
buffered_writer: BufferedWriter::new(
|
||||
buffered_writer: tokio::sync::RwLock::new(BufferedWriter::new(
|
||||
file,
|
||||
0,
|
||||
|| IoBufferMut::with_capacity(TAIL_SZ),
|
||||
@@ -104,7 +108,9 @@ impl EphemeralFile {
|
||||
cancel.child_token(),
|
||||
ctx,
|
||||
info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename),
|
||||
),
|
||||
)),
|
||||
bytes_written: AtomicU64::new(0),
|
||||
resource_units: std::sync::Mutex::new(GlobalResourceUnits::new()),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -151,15 +157,17 @@ impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum EphemeralFileWriteError {
|
||||
#[error("{0}")]
|
||||
TooLong(String),
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
pub(crate) fn len(&self) -> u64 {
|
||||
self.bytes_written
|
||||
// TODO(vlad): The value returned here is not always correct if
|
||||
// we have more than one concurrent writer. Writes are always
|
||||
// sequenced, but we could grab the buffered writer lock if we wanted
|
||||
// to.
|
||||
self.bytes_written.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
|
||||
@@ -186,7 +194,7 @@ impl EphemeralFile {
|
||||
/// Panics if the write is short because there's no way we can recover from that.
|
||||
/// TODO: make upstack handle this as an error.
|
||||
pub(crate) async fn write_raw(
|
||||
&mut self,
|
||||
&self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, EphemeralFileWriteError> {
|
||||
@@ -198,22 +206,13 @@ impl EphemeralFile {
|
||||
}
|
||||
|
||||
async fn write_raw_controlled(
|
||||
&mut self,
|
||||
&self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(u64, Option<owned_buffers_io::write::FlushControl>), EphemeralFileWriteError> {
|
||||
let pos = self.bytes_written;
|
||||
let mut writer = self.buffered_writer.write().await;
|
||||
|
||||
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
|
||||
EphemeralFileWriteError::TooLong(format!(
|
||||
"write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
|
||||
srcbuf_len = srcbuf.len(),
|
||||
))
|
||||
})?;
|
||||
|
||||
// Write the payload
|
||||
let (nwritten, control) = self
|
||||
.buffered_writer
|
||||
let (nwritten, control) = writer
|
||||
.write_buffered_borrowed_controlled(srcbuf, ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
@@ -225,43 +224,69 @@ impl EphemeralFile {
|
||||
"buffered writer has no short writes"
|
||||
);
|
||||
|
||||
self.bytes_written = new_bytes_written;
|
||||
// There's no realistic risk of overflow here. We won't have exabytes sized files on disk.
|
||||
let pos = self
|
||||
.bytes_written
|
||||
.fetch_add(srcbuf.len().into_u64(), Ordering::AcqRel);
|
||||
|
||||
let mut resource_units = self.resource_units.lock().unwrap();
|
||||
resource_units.maybe_publish_size(self.bytes_written.load(Ordering::Relaxed));
|
||||
|
||||
Ok((pos, control))
|
||||
}
|
||||
|
||||
pub(crate) fn tick(&self) -> Option<u64> {
|
||||
let mut resource_units = self.resource_units.lock().unwrap();
|
||||
let len = self.bytes_written.load(Ordering::Relaxed);
|
||||
resource_units.publish_size(len)
|
||||
}
|
||||
}
|
||||
|
||||
impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
|
||||
async fn read_exact_at_eof_ok<B: IoBufAlignedMut + Send>(
|
||||
&self,
|
||||
start: u64,
|
||||
dst: tokio_epoll_uring::Slice<B>,
|
||||
mut dst: tokio_epoll_uring::Slice<B>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
|
||||
let submitted_offset = self.buffered_writer.bytes_submitted();
|
||||
// We will fill the slice in back to front. Hence, we need
|
||||
// the slice to be fully initialized.
|
||||
// TODO(vlad): Is there a nicer way of doing this?
|
||||
dst.as_mut_rust_slice_full_zeroed();
|
||||
|
||||
let mutable = match self.buffered_writer.inspect_mutable() {
|
||||
Some(mutable) => &mutable[0..mutable.pending()],
|
||||
None => {
|
||||
// Timeline::cancel and hence buffered writer flush was cancelled.
|
||||
// Remain read-available while timeline is shutting down.
|
||||
&[]
|
||||
}
|
||||
};
|
||||
let writer = self.buffered_writer.read().await;
|
||||
|
||||
let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();
|
||||
// Read bytes written while under lock. This is a hack to deal with concurrent
|
||||
// writes updating the number of bytes written. `bytes_written` is not DIO alligned
|
||||
// but we may end the read there.
|
||||
//
|
||||
// TODO(vlad): Feels like there's a nicer path where we align the end if it
|
||||
// shoots over the end of the file.
|
||||
let bytes_written = self.bytes_written.load(Ordering::Acquire);
|
||||
|
||||
let dst_cap = dst.bytes_total().into_u64();
|
||||
let end = {
|
||||
// saturating_add is correct here because the max file size is u64::MAX, so,
|
||||
// if start + dst.len() > u64::MAX, then we know it will be a short read
|
||||
let mut end: u64 = start.saturating_add(dst_cap);
|
||||
if end > self.bytes_written {
|
||||
end = self.bytes_written;
|
||||
if end > bytes_written {
|
||||
end = bytes_written;
|
||||
}
|
||||
end
|
||||
};
|
||||
|
||||
let submitted_offset = writer.bytes_submitted();
|
||||
let maybe_flushed = writer.inspect_maybe_flushed();
|
||||
|
||||
let mutable = match writer.inspect_mutable() {
|
||||
Some(mutable) => &mutable[0..mutable.pending()],
|
||||
None => {
|
||||
// Timeline::cancel and hence buffered writer flush was cancelled.
|
||||
// Remain read-available while timeline is shutting down.
|
||||
&[]
|
||||
}
|
||||
};
|
||||
|
||||
// inclusive, exclusive
|
||||
#[derive(Debug)]
|
||||
struct Range<N>(N, N);
|
||||
@@ -306,13 +331,33 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
|
||||
|
||||
let mutable_range = Range(std::cmp::max(start, submitted_offset), end);
|
||||
|
||||
let dst = if written_range.len() > 0 {
|
||||
// There are three sources from which we might have to read data:
|
||||
// 1. The file itself
|
||||
// 2. The buffer which contains changes currently being flushed
|
||||
// 3. The buffer which contains chnages yet to be flushed
|
||||
//
|
||||
// For better concurrency, we do them in reverse order: perform the in-memory
|
||||
// reads while holding the writer lock, drop the writer lock and read from the
|
||||
// file if required.
|
||||
|
||||
let dst = if mutable_range.len() > 0 {
|
||||
let offset_in_buffer = mutable_range
|
||||
.0
|
||||
.checked_sub(submitted_offset)
|
||||
.unwrap()
|
||||
.into_usize();
|
||||
let to_copy =
|
||||
&mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
|
||||
let bounds = dst.bounds();
|
||||
let slice = self
|
||||
.file
|
||||
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
|
||||
.await?;
|
||||
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
|
||||
let mut view = dst.slice({
|
||||
let start =
|
||||
written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
|
||||
let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
|
||||
start..end
|
||||
});
|
||||
view.as_mut_rust_slice_full_zeroed()
|
||||
.copy_from_slice(to_copy);
|
||||
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
|
||||
} else {
|
||||
dst
|
||||
};
|
||||
@@ -342,24 +387,15 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
|
||||
dst
|
||||
};
|
||||
|
||||
let dst = if mutable_range.len() > 0 {
|
||||
let offset_in_buffer = mutable_range
|
||||
.0
|
||||
.checked_sub(submitted_offset)
|
||||
.unwrap()
|
||||
.into_usize();
|
||||
let to_copy =
|
||||
&mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
|
||||
drop(writer);
|
||||
|
||||
let dst = if written_range.len() > 0 {
|
||||
let bounds = dst.bounds();
|
||||
let mut view = dst.slice({
|
||||
let start =
|
||||
written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
|
||||
let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
|
||||
start..end
|
||||
});
|
||||
view.as_mut_rust_slice_full_zeroed()
|
||||
.copy_from_slice(to_copy);
|
||||
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
|
||||
let slice = self
|
||||
.file
|
||||
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
|
||||
.await?;
|
||||
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
|
||||
} else {
|
||||
dst
|
||||
};
|
||||
@@ -460,13 +496,15 @@ mod tests {
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mutable = file.buffered_writer.mutable();
|
||||
let writer = file.buffered_writer.read().await;
|
||||
let mutable = writer.mutable();
|
||||
let cap = mutable.capacity();
|
||||
let align = mutable.align();
|
||||
drop(writer);
|
||||
|
||||
let write_nbytes = cap * 2 + cap / 2;
|
||||
|
||||
@@ -504,10 +542,11 @@ mod tests {
|
||||
let file_contents = std::fs::read(file.file.path()).unwrap();
|
||||
assert!(file_contents == content[0..cap * 2]);
|
||||
|
||||
let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
|
||||
let writer = file.buffered_writer.read().await;
|
||||
let maybe_flushed_buffer_contents = writer.inspect_maybe_flushed().unwrap();
|
||||
assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);
|
||||
|
||||
let mutable_buffer_contents = file.buffered_writer.mutable();
|
||||
let mutable_buffer_contents = writer.mutable();
|
||||
assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
|
||||
}
|
||||
|
||||
@@ -517,12 +556,14 @@ mod tests {
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let cancel = CancellationToken::new();
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// mutable buffer and maybe_flushed buffer each has `cap` bytes.
|
||||
let cap = file.buffered_writer.mutable().capacity();
|
||||
let writer = file.buffered_writer.read().await;
|
||||
let cap = writer.mutable().capacity();
|
||||
drop(writer);
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
@@ -540,12 +581,13 @@ mod tests {
|
||||
2 * cap.into_u64(),
|
||||
"buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
|
||||
);
|
||||
let writer = file.buffered_writer.read().await;
|
||||
assert_eq!(
|
||||
&file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
|
||||
&writer.inspect_maybe_flushed().unwrap()[0..cap],
|
||||
&content[cap..cap * 2]
|
||||
);
|
||||
assert_eq!(
|
||||
&file.buffered_writer.mutable()[0..cap / 2],
|
||||
&writer.mutable()[0..cap / 2],
|
||||
&content[cap * 2..cap * 2 + cap / 2]
|
||||
);
|
||||
}
|
||||
@@ -563,13 +605,15 @@ mod tests {
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mutable = file.buffered_writer.mutable();
|
||||
let writer = file.buffered_writer.read().await;
|
||||
let mutable = writer.mutable();
|
||||
let cap = mutable.capacity();
|
||||
let align = mutable.align();
|
||||
drop(writer);
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(cap * 2 + cap / 2)
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart
|
||||
|
||||
use anyhow::ensure;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::{BeSer, SerializeError};
|
||||
use utils::id::TimelineId;
|
||||
@@ -136,7 +137,7 @@ struct TimelineMetadataBodyV2 {
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -167,7 +168,7 @@ impl TimelineMetadata {
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
) -> Self {
|
||||
Self {
|
||||
hdr: TimelineMetadataHeader {
|
||||
@@ -215,7 +216,7 @@ impl TimelineMetadata {
|
||||
ancestor_lsn: body.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: body.initdb_lsn,
|
||||
pg_version: 14, // All timelines created before this version had pg_version 14
|
||||
pg_version: PgMajorVersion::PG14, // All timelines created before this version had pg_version 14
|
||||
};
|
||||
|
||||
hdr.format_version = METADATA_FORMAT_VERSION;
|
||||
@@ -317,7 +318,7 @@ impl TimelineMetadata {
|
||||
self.body.initdb_lsn
|
||||
}
|
||||
|
||||
pub fn pg_version(&self) -> u32 {
|
||||
pub fn pg_version(&self) -> PgMajorVersion {
|
||||
self.body.pg_version
|
||||
}
|
||||
|
||||
@@ -331,7 +332,7 @@ impl TimelineMetadata {
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
0,
|
||||
PgMajorVersion::PG14,
|
||||
);
|
||||
let bytes = instance.to_bytes().unwrap();
|
||||
Self::from_bytes(&bytes).unwrap()
|
||||
@@ -545,13 +546,12 @@ mod tests {
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
14, // All timelines created before this version had pg_version 14
|
||||
PgMajorVersion::PG14, // All timelines created before this version had pg_version 14
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata.body, expected_metadata.body,
|
||||
"Metadata of the old version {} should be upgraded to the latest version {}",
|
||||
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
|
||||
"Metadata of the old version {METADATA_OLD_FORMAT_VERSION} should be upgraded to the latest version {METADATA_FORMAT_VERSION}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -566,7 +566,7 @@ mod tests {
|
||||
Lsn(0),
|
||||
// Updating this version to 17 will cause the test to fail at the
|
||||
// next assert_eq!().
|
||||
16,
|
||||
PgMajorVersion::PG16,
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* TimelineMetadataHeader */
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -427,8 +427,8 @@ impl GcBlocking {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use std::str::FromStr;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
@@ -831,7 +831,7 @@ mod tests {
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
@@ -893,7 +893,7 @@ mod tests {
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
|
||||
@@ -957,7 +957,7 @@ mod tests {
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
@@ -1033,7 +1033,7 @@ mod tests {
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
@@ -1114,7 +1114,7 @@ mod tests {
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
@@ -1199,7 +1199,7 @@ mod tests {
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
@@ -1287,7 +1287,7 @@ mod tests {
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Helper functions to upload files to remote storage with a RemoteStorage
|
||||
|
||||
use std::io::{ErrorKind, SeekFrom};
|
||||
use std::num::NonZeroU32;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
@@ -228,11 +229,25 @@ pub(crate) async fn time_travel_recover_tenant(
|
||||
let timelines_path = super::remote_timelines_path(tenant_shard_id);
|
||||
prefixes.push(timelines_path);
|
||||
}
|
||||
|
||||
// Limit the number of versions deletions, mostly so that we don't
|
||||
// keep requesting forever if the list is too long, as we'd put the
|
||||
// list in RAM.
|
||||
// Building a list of 100k entries that reaches the limit roughly takes
|
||||
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
|
||||
const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
|
||||
|
||||
for prefix in &prefixes {
|
||||
backoff::retry(
|
||||
|| async {
|
||||
storage
|
||||
.time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
|
||||
.time_travel_recover(
|
||||
Some(prefix),
|
||||
timestamp,
|
||||
done_if_after,
|
||||
cancel,
|
||||
COMPLEXITY_LIMIT,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|e| !matches!(e, TimeTravelError::Other(_)),
|
||||
|
||||
@@ -1427,7 +1427,7 @@ async fn init_timeline_state(
|
||||
let local_meta = dentry
|
||||
.metadata()
|
||||
.await
|
||||
.fatal_err(&format!("Read metadata on {}", file_path));
|
||||
.fatal_err(&format!("Read metadata on {file_path}"));
|
||||
|
||||
let file_name = file_path.file_name().expect("created it from the dentry");
|
||||
if crate::is_temporary(&file_path)
|
||||
|
||||
@@ -34,11 +34,11 @@ pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
|
||||
use pageserver_api::config::GetVectoredConcurrentIo;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use tracing::{Instrument, info_span, trace};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateGuard;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
use super::PageReconstructError;
|
||||
@@ -109,7 +109,7 @@ pub(crate) enum OnDiskValue {
|
||||
|
||||
/// Reconstruct data accumulated for a single key during a vectored get
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct VectoredValueReconstructState {
|
||||
pub struct VectoredValueReconstructState {
|
||||
pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>,
|
||||
|
||||
pub(crate) situation: ValueReconstructSituation,
|
||||
@@ -244,13 +244,60 @@ impl VectoredValueReconstructState {
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Benchmarking utility to await for the completion of all pending ios
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Technically fine to stop polling this future, but, the IOs will still
|
||||
/// be executed to completion by the sidecar task and hold on to / consume resources.
|
||||
/// Better not do it to make reasonsing about the system easier.
|
||||
#[cfg(feature = "benchmarking")]
|
||||
pub async fn sink_pending_ios(self) -> Result<(), std::io::Error> {
|
||||
let mut res = Ok(());
|
||||
|
||||
// We should try hard not to bail early, so that by the time we return from this
|
||||
// function, all IO for this value is done. It's not required -- we could totally
|
||||
// stop polling the IO futures in the sidecar task, they need to support that,
|
||||
// but just stopping to poll doesn't reduce the IO load on the disk. It's easier
|
||||
// to reason about the system if we just wait for all IO to complete, even if
|
||||
// we're no longer interested in the result.
|
||||
//
|
||||
// Revisit this when IO futures are replaced with a more sophisticated IO system
|
||||
// and an IO scheduler, where we know which IOs were submitted and which ones
|
||||
// just queued. Cf the comment on IoConcurrency::spawn_io.
|
||||
for (_lsn, waiter) in self.on_disk_values {
|
||||
let value_recv_res = waiter
|
||||
.wait_completion()
|
||||
// we rely on the caller to poll us to completion, so this is not a bail point
|
||||
.await;
|
||||
|
||||
match (&mut res, value_recv_res) {
|
||||
(Err(_), _) => {
|
||||
// We've already failed, no need to process more.
|
||||
}
|
||||
(Ok(_), Err(_wait_err)) => {
|
||||
// This shouldn't happen - likely the sidecar task panicked.
|
||||
unreachable!();
|
||||
}
|
||||
(Ok(_), Ok(Err(err))) => {
|
||||
let err: std::io::Error = err;
|
||||
res = Err(err);
|
||||
}
|
||||
(Ok(_ok), Ok(Ok(OnDiskValue::RawImage(_img)))) => {}
|
||||
(Ok(_ok), Ok(Ok(OnDiskValue::WalRecordOrImage(_buf)))) => {}
|
||||
}
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
/// Bag of data accumulated during a vectored get..
|
||||
pub(crate) struct ValuesReconstructState {
|
||||
pub struct ValuesReconstructState {
|
||||
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
|
||||
/// should not expect to get anything from this hashmap.
|
||||
pub(crate) keys: HashMap<Key, VectoredValueReconstructState>,
|
||||
pub keys: HashMap<Key, VectoredValueReconstructState>,
|
||||
/// The keys which are already retrieved
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
|
||||
@@ -272,7 +319,7 @@ pub(crate) struct ValuesReconstructState {
|
||||
/// The desired end state is that we always do parallel IO.
|
||||
/// This struct and the dispatching in the impl will be removed once
|
||||
/// we've built enough confidence.
|
||||
pub(crate) enum IoConcurrency {
|
||||
pub enum IoConcurrency {
|
||||
Sequential,
|
||||
SidecarTask {
|
||||
task_id: usize,
|
||||
@@ -317,10 +364,7 @@ impl IoConcurrency {
|
||||
Self::spawn(SelectedIoConcurrency::Sequential)
|
||||
}
|
||||
|
||||
pub(crate) fn spawn_from_conf(
|
||||
conf: GetVectoredConcurrentIo,
|
||||
gate_guard: GateGuard,
|
||||
) -> IoConcurrency {
|
||||
pub fn spawn_from_conf(conf: GetVectoredConcurrentIo, gate_guard: GateGuard) -> IoConcurrency {
|
||||
let selected = match conf {
|
||||
GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
|
||||
GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
|
||||
@@ -425,16 +469,6 @@ impl IoConcurrency {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clone(&self) -> Self {
|
||||
match self {
|
||||
IoConcurrency::Sequential => IoConcurrency::Sequential,
|
||||
IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
|
||||
task_id: *task_id,
|
||||
ios_tx: ios_tx.clone(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string.
|
||||
///
|
||||
/// The IO is represented as an opaque future.
|
||||
@@ -573,6 +607,18 @@ impl IoConcurrency {
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for IoConcurrency {
|
||||
fn clone(&self) -> Self {
|
||||
match self {
|
||||
IoConcurrency::Sequential => IoConcurrency::Sequential,
|
||||
IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
|
||||
task_id: *task_id,
|
||||
ios_tx: ios_tx.clone(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Make noise in case the [`ValuesReconstructState`] gets dropped while
|
||||
/// there are still IOs in flight.
|
||||
/// Refer to `collect_pending_ios` for why we prefer not to do that.
|
||||
@@ -603,7 +649,7 @@ impl Drop for ValuesReconstructState {
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
pub(crate) fn new(io_concurrency: IoConcurrency) -> Self {
|
||||
pub fn new(io_concurrency: IoConcurrency) -> Self {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
|
||||
@@ -4,11 +4,11 @@ use std::sync::Arc;
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{KEY_SIZE, Key};
|
||||
use pageserver_api::value::Value;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::TenantShardId;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use super::errors::PutError;
|
||||
use super::layer::S3_UPLOAD_LIMIT;
|
||||
|
||||
@@ -44,7 +44,6 @@ use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
@@ -54,6 +53,7 @@ use utils::bin_ser::BeSer;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use super::errors::PutError;
|
||||
use super::{
|
||||
@@ -783,7 +783,7 @@ impl DeltaLayer {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
.with_context(|| format!("Failed to open file '{path}'"))?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||
@@ -1306,7 +1306,7 @@ impl DeltaLayerInner {
|
||||
// is it an image or will_init walrecord?
|
||||
// FIXME: this could be handled by threading the BlobRef to the
|
||||
// VectoredReadBuilder
|
||||
let will_init = pageserver_api::value::ValueBytes::will_init(&data)
|
||||
let will_init = wal_decoder::models::value::ValueBytes::will_init(&data)
|
||||
.inspect_err(|_e| {
|
||||
#[cfg(feature = "testing")]
|
||||
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
@@ -1369,7 +1369,7 @@ impl DeltaLayerInner {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
|
||||
let wal_desc = wal_decoder::models::record::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
@@ -1401,7 +1401,7 @@ impl DeltaLayerInner {
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
let checkpoint = CheckPoint::decode(&img)?;
|
||||
println!(" CHECKPOINT: {:?}", checkpoint);
|
||||
println!(" CHECKPOINT: {checkpoint:?}");
|
||||
}
|
||||
Value::WalRecord(_rec) => {
|
||||
println!(" unexpected walrecord value for checkpoint key");
|
||||
@@ -1622,12 +1622,6 @@ impl DeltaLayerIterator<'_> {
|
||||
pub(crate) mod test {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use bytes::Bytes;
|
||||
use itertools::MinMaxResult;
|
||||
use pageserver_api::value::Value;
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use rand::{Rng, RngCore};
|
||||
|
||||
use super::*;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use crate::context::DownloadBehavior;
|
||||
@@ -1635,7 +1629,13 @@ pub(crate) mod test {
|
||||
use crate::tenant::disk_btree::tests::TestDisk;
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::{TenantShard, Timeline};
|
||||
use bytes::Bytes;
|
||||
use itertools::MinMaxResult;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use rand::{Rng, RngCore};
|
||||
|
||||
/// Construct an index for a fictional delta layer and and then
|
||||
/// traverse in order to plan vectored reads for a query. Finally,
|
||||
@@ -1987,7 +1987,7 @@ pub(crate) mod test {
|
||||
#[tokio::test]
|
||||
async fn copy_delta_prefix_smoke() {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
|
||||
.await
|
||||
@@ -1995,14 +1995,14 @@ pub(crate) mod test {
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let ctx = &ctx;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = &ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let initdb_layer = timeline
|
||||
.layers
|
||||
.read()
|
||||
.read(crate::tenant::timeline::layer_manager::LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.next()
|
||||
@@ -2078,7 +2078,7 @@ pub(crate) mod test {
|
||||
|
||||
let new_layer = timeline
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::Testing)
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.find(|&x| x != &initdb_layer)
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::sync::Arc;
|
||||
use anyhow::bail;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, SparseKeySpace};
|
||||
use pageserver_api::value::Value;
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use super::PersistentLayerKey;
|
||||
use super::merge_iterator::{MergeIterator, MergeIteratorItem};
|
||||
@@ -126,7 +126,6 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn filter_keyspace_iterator() {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
|
||||
.await
|
||||
|
||||
@@ -42,7 +42,6 @@ use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_api::value::Value;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_stream::StreamExt;
|
||||
@@ -52,6 +51,7 @@ use utils::bin_ser::BeSer;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use super::errors::PutError;
|
||||
use super::layer_name::ImageLayerName;
|
||||
@@ -272,8 +272,7 @@ impl ImageLayer {
|
||||
|
||||
conf.timeline_path(&tenant_shard_id, &timeline_id)
|
||||
.join(format!(
|
||||
"{fname}.{:x}.{TEMP_FILE_SUFFIX}",
|
||||
filename_disambiguator
|
||||
"{fname}.{filename_disambiguator:x}.{TEMP_FILE_SUFFIX}"
|
||||
))
|
||||
}
|
||||
|
||||
@@ -370,7 +369,7 @@ impl ImageLayer {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
.with_context(|| format!("Failed to open file '{path}'"))?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||
@@ -1232,10 +1231,10 @@ mod test {
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use pageserver_api::value::Value;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use super::{ImageLayerIterator, ImageLayerWriter};
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
@@ -1475,7 +1474,7 @@ mod test {
|
||||
assert_eq!(l1, expect_lsn);
|
||||
assert_eq!(&i1, i2);
|
||||
}
|
||||
(o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2),
|
||||
(o1, o2) => panic!("iterators length mismatch: {o1:?}, {o2:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,23 +70,15 @@ pub struct InMemoryLayer {
|
||||
/// We use a separate lock for the index to reduce the critical section
|
||||
/// during which reads cannot be planned.
|
||||
///
|
||||
/// If you need access to both the index and the underlying file at the same time,
|
||||
/// respect the following locking order to avoid deadlocks:
|
||||
/// 1. [`InMemoryLayer::inner`]
|
||||
/// 2. [`InMemoryLayer::index`]
|
||||
///
|
||||
/// Note that the file backing [`InMemoryLayer::inner`] is append-only,
|
||||
/// so it is not necessary to hold simultaneous locks on index.
|
||||
/// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
|
||||
/// Note that the file backing [`InMemoryLayer::file`] is append-only,
|
||||
/// so it is not necessary to hold a lock on the index while reading or writing from the file.
|
||||
/// In particular:
|
||||
/// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
|
||||
/// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
|
||||
/// 1. It is safe to read and release [`InMemoryLayer::index`] before reading from [`InMemoryLayer::file`].
|
||||
/// 2. It is safe to write to [`InMemoryLayer::file`] before locking and updating [`InMemoryLayer::index`].
|
||||
index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
|
||||
|
||||
/// The above fields never change, except for `end_lsn`, which is only set once,
|
||||
/// and `index` (see rationale there).
|
||||
/// All other changing parts are in `inner`, and protected by a mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
/// Wrapper for the actual on-disk file. Uses interior mutability for concurrent reads/writes.
|
||||
file: EphemeralFile,
|
||||
|
||||
estimated_in_mem_size: AtomicU64,
|
||||
}
|
||||
@@ -96,20 +88,10 @@ impl std::fmt::Debug for InMemoryLayer {
|
||||
f.debug_struct("InMemoryLayer")
|
||||
.field("start_lsn", &self.start_lsn)
|
||||
.field("end_lsn", &self.end_lsn)
|
||||
.field("inner", &self.inner)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// The values are stored in a serialized format in this file.
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
/// PerSeg::page_versions map stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
|
||||
resource_units: GlobalResourceUnits,
|
||||
}
|
||||
|
||||
/// Support the same max blob length as blob_io, because ultimately
|
||||
/// all the InMemoryLayer contents end up being written into a delta layer,
|
||||
/// using the [`crate::tenant::blob_io`].
|
||||
@@ -258,12 +240,6 @@ struct IndexEntryUnpacked {
|
||||
pos: u64,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("InMemoryLayerInner").finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
|
||||
/// to minimize contention.
|
||||
///
|
||||
@@ -280,7 +256,7 @@ pub(crate) struct GlobalResources {
|
||||
}
|
||||
|
||||
// Per-timeline RAII struct for its contribution to [`GlobalResources`]
|
||||
struct GlobalResourceUnits {
|
||||
pub(crate) struct GlobalResourceUnits {
|
||||
// How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
|
||||
// for decrementing the global counter by this many bytes when dropped.
|
||||
dirty_bytes: u64,
|
||||
@@ -292,7 +268,7 @@ impl GlobalResourceUnits {
|
||||
// updated when the Timeline "ticks" in the background.
|
||||
const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
|
||||
|
||||
fn new() -> Self {
|
||||
pub(crate) fn new() -> Self {
|
||||
GLOBAL_RESOURCES
|
||||
.dirty_layers
|
||||
.fetch_add(1, AtomicOrdering::Relaxed);
|
||||
@@ -304,7 +280,7 @@ impl GlobalResourceUnits {
|
||||
///
|
||||
/// Returns the effective layer size limit that should be applied, if any, to keep
|
||||
/// the total number of dirty bytes below the configured maximum.
|
||||
fn publish_size(&mut self, size: u64) -> Option<u64> {
|
||||
pub(crate) fn publish_size(&mut self, size: u64) -> Option<u64> {
|
||||
let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
|
||||
Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
|
||||
Ordering::Greater => {
|
||||
@@ -349,7 +325,7 @@ impl GlobalResourceUnits {
|
||||
|
||||
// Call publish_size if the input size differs from last published size by more than
|
||||
// the drift limit
|
||||
fn maybe_publish_size(&mut self, size: u64) {
|
||||
pub(crate) fn maybe_publish_size(&mut self, size: u64) {
|
||||
let publish = match size.cmp(&self.dirty_bytes) {
|
||||
Ordering::Equal => false,
|
||||
Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
|
||||
@@ -398,8 +374,8 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn try_len(&self) -> Option<u64> {
|
||||
self.inner.try_read().map(|i| i.file.len()).ok()
|
||||
pub(crate) fn len(&self) -> u64 {
|
||||
self.file.len()
|
||||
}
|
||||
|
||||
pub(crate) fn assert_writable(&self) {
|
||||
@@ -430,7 +406,7 @@ impl InMemoryLayer {
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
pub async fn get_values_reconstruct_data(
|
||||
self: &Arc<InMemoryLayer>,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -479,14 +455,13 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
}
|
||||
drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
|
||||
drop(index); // release the lock before we spawn the IO
|
||||
let read_from = Arc::clone(self);
|
||||
let read_ctx = ctx.attached_child();
|
||||
reconstruct_state
|
||||
.spawn_io(async move {
|
||||
let inner = read_from.inner.read().await;
|
||||
let f = vectored_dio_read::execute(
|
||||
&inner.file,
|
||||
&read_from.file,
|
||||
reads
|
||||
.iter()
|
||||
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
|
||||
@@ -518,7 +493,6 @@ impl InMemoryLayer {
|
||||
// This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
|
||||
// but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
|
||||
// drop for consistency among all three layer types.
|
||||
drop(inner);
|
||||
drop(read_from);
|
||||
})
|
||||
.await;
|
||||
@@ -537,7 +511,7 @@ fn inmem_layer_log_display(
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
) -> std::fmt::Result {
|
||||
write!(f, "timeline {} in-memory ", timeline)?;
|
||||
write!(f, "timeline {timeline} in-memory ")?;
|
||||
inmem_layer_display(f, start_lsn, end_lsn)
|
||||
}
|
||||
|
||||
@@ -549,12 +523,6 @@ impl std::fmt::Display for InMemoryLayer {
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
/// Get layer size.
|
||||
pub async fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().await;
|
||||
Ok(inner.file.len())
|
||||
}
|
||||
|
||||
pub fn estimated_in_mem_size(&self) -> u64 {
|
||||
self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
|
||||
}
|
||||
@@ -587,10 +555,7 @@ impl InMemoryLayer {
|
||||
end_lsn: OnceLock::new(),
|
||||
opened_at: Instant::now(),
|
||||
index: RwLock::new(BTreeMap::new()),
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
file,
|
||||
resource_units: GlobalResourceUnits::new(),
|
||||
}),
|
||||
file,
|
||||
estimated_in_mem_size: AtomicU64::new(0),
|
||||
})
|
||||
}
|
||||
@@ -599,41 +564,37 @@ impl InMemoryLayer {
|
||||
///
|
||||
/// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
|
||||
/// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
|
||||
///
|
||||
/// This method shall not be called concurrently. We enforce this property via [`crate::tenant::Timeline::write_lock`].
|
||||
///
|
||||
/// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
|
||||
pub async fn put_batch(
|
||||
&self,
|
||||
serialized_batch: SerializedValueBatch,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let (base_offset, metadata) = {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
self.assert_writable();
|
||||
|
||||
let base_offset = inner.file.len();
|
||||
let base_offset = self.file.len();
|
||||
|
||||
let SerializedValueBatch {
|
||||
raw,
|
||||
metadata,
|
||||
max_lsn: _,
|
||||
len: _,
|
||||
} = serialized_batch;
|
||||
let SerializedValueBatch {
|
||||
raw,
|
||||
metadata,
|
||||
max_lsn: _,
|
||||
len: _,
|
||||
} = serialized_batch;
|
||||
|
||||
// Write the batch to the file
|
||||
inner.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = inner.file.len();
|
||||
// Write the batch to the file
|
||||
self.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = self.file.len();
|
||||
|
||||
let expected_new_len = base_offset
|
||||
.checked_add(raw.len().into_u64())
|
||||
// write_raw would error if we were to overflow u64.
|
||||
// also IndexEntry and higher levels in
|
||||
//the code don't allow the file to grow that large
|
||||
.unwrap();
|
||||
assert_eq!(new_size, expected_new_len);
|
||||
|
||||
inner.resource_units.maybe_publish_size(new_size);
|
||||
|
||||
(base_offset, metadata)
|
||||
};
|
||||
let expected_new_len = base_offset
|
||||
.checked_add(raw.len().into_u64())
|
||||
// write_raw would error if we were to overflow u64.
|
||||
// also IndexEntry and higher levels in
|
||||
//the code don't allow the file to grow that large
|
||||
.unwrap();
|
||||
assert_eq!(new_size, expected_new_len);
|
||||
|
||||
// Update the index with the new entries
|
||||
let mut index = self.index.write().await;
|
||||
@@ -686,10 +647,8 @@ impl InMemoryLayer {
|
||||
self.opened_at
|
||||
}
|
||||
|
||||
pub(crate) async fn tick(&self) -> Option<u64> {
|
||||
let mut inner = self.inner.write().await;
|
||||
let size = inner.file.len();
|
||||
inner.resource_units.publish_size(size)
|
||||
pub(crate) fn tick(&self) -> Option<u64> {
|
||||
self.file.tick()
|
||||
}
|
||||
|
||||
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
|
||||
@@ -753,12 +712,6 @@ impl InMemoryLayer {
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. See the comment on
|
||||
// [`InMemoryLayer::freeze`] to understand how locking between the append path
|
||||
// and layer flushing works.
|
||||
let inner = self.inner.read().await;
|
||||
let index = self.index.read().await;
|
||||
|
||||
use l0_flush::Inner;
|
||||
@@ -793,7 +746,7 @@ impl InMemoryLayer {
|
||||
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::Direct { .. } => {
|
||||
let file_contents = inner.file.load_to_io_buf(ctx).await?;
|
||||
let file_contents = self.file.load_to_io_buf(ctx).await?;
|
||||
let file_contents = file_contents.freeze();
|
||||
|
||||
for (key, vec_map) in index.iter() {
|
||||
|
||||
@@ -380,7 +380,7 @@ impl<B: Buffer> std::fmt::Debug for LogicalReadState<B> {
|
||||
write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer))
|
||||
}
|
||||
LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)),
|
||||
LogicalReadState::Error(e) => write!(f, "Error({:?})", e),
|
||||
LogicalReadState::Error(e) => write!(f, "Error({e:?})"),
|
||||
LogicalReadState::Undefined => write!(f, "Undefined"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,7 +105,7 @@ impl std::fmt::Display for Layer {
|
||||
|
||||
impl std::fmt::Debug for Layer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self)
|
||||
write!(f, "{self}")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use pageserver_api::key::{CONTROLFILE_KEY, Key};
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::completion::{self, Completion};
|
||||
use utils::id::TimelineId;
|
||||
@@ -10,6 +11,7 @@ use super::*;
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::tenant::harness::{TenantHarness, test_img};
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
|
||||
/// Used in tests to advance a future to wanted await point, and not futher.
|
||||
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
|
||||
@@ -44,7 +46,7 @@ async fn smoke_test() {
|
||||
.create_test_timeline_with_layers(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
Default::default(), // in-memory layers
|
||||
Default::default(),
|
||||
@@ -59,7 +61,7 @@ async fn smoke_test() {
|
||||
// there to avoid the timeline being illegally empty
|
||||
let (layer, dummy_layer) = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -215,7 +217,7 @@ async fn smoke_test() {
|
||||
|
||||
// Simulate GC removing our test layer.
|
||||
{
|
||||
let mut g = timeline.layers.write().await;
|
||||
let mut g = timeline.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
|
||||
let layers = &[layer];
|
||||
g.open_mut().unwrap().finish_gc_timeline(layers);
|
||||
@@ -255,13 +257,18 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.create_test_timeline(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -305,7 +312,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
// assert that once we remove the `layer` from the layer map and drop our reference,
|
||||
// the deletion of the layer in remote_storage happens.
|
||||
{
|
||||
let mut layers = timeline.layers.write().await;
|
||||
let mut layers = timeline.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
|
||||
}
|
||||
|
||||
@@ -340,14 +347,19 @@ fn read_wins_pending_eviction() {
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.create_test_timeline(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -473,14 +485,19 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.create_test_timeline(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -643,7 +660,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.create_test_timeline(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
@@ -655,7 +677,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -729,7 +751,12 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.create_test_timeline(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
@@ -741,7 +768,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -823,7 +850,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn eviction_cancellation_on_drop() {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
@@ -835,7 +862,12 @@ async fn eviction_cancellation_on_drop() {
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.create_test_timeline(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -862,7 +894,7 @@ async fn eviction_cancellation_on_drop() {
|
||||
|
||||
let (evicted_layer, not_evicted) = {
|
||||
let mut layers = {
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let mut guard = timeline.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
|
||||
// remove the layers from layermap
|
||||
guard.open_mut().unwrap().finish_gc_timeline(&layers);
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::sync::Arc;
|
||||
|
||||
use anyhow::bail;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator};
|
||||
use super::image_layer::{ImageLayerInner, ImageLayerIterator};
|
||||
@@ -402,9 +402,9 @@ impl<'a> MergeIterator<'a> {
|
||||
mod tests {
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use utils::lsn::Lsn;
|
||||
#[cfg(feature = "testing")]
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use super::*;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
@@ -436,7 +436,6 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn merge_in_between() {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_merge_in_between")
|
||||
.await
|
||||
@@ -501,7 +500,6 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn delta_merge() {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_merge")
|
||||
.await
|
||||
@@ -578,7 +576,6 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn delta_image_mixed_merge() {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
|
||||
.await
|
||||
|
||||
@@ -35,7 +35,11 @@ use fail::fail_point;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
use handle::ShardTimelineId;
|
||||
use layer_manager::Shutdown;
|
||||
use layer_manager::{
|
||||
LayerManagerLockHolder, LayerManagerReadGuard, LayerManagerWriteGuard, LockedLayerManager,
|
||||
Shutdown,
|
||||
};
|
||||
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
|
||||
@@ -52,11 +56,9 @@ use pageserver_api::models::{
|
||||
};
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
|
||||
#[cfg(test)]
|
||||
use pageserver_api::value::Value;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::v14::xlog_utils;
|
||||
use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp};
|
||||
use postgres_ffi::{PgMajorVersion, WAL_SEGMENT_SIZE, to_pg_timestamp};
|
||||
use rand::Rng;
|
||||
use remote_storage::DownloadError;
|
||||
use serde_with::serde_as;
|
||||
@@ -77,12 +79,13 @@ use utils::seqwait::SeqWait;
|
||||
use utils::simple_rcu::{Rcu, RcuReadGuard};
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
use utils::{completion, critical, fs_ext, pausable_failpoint};
|
||||
#[cfg(test)]
|
||||
use wal_decoder::models::value::Value;
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
use self::eviction_task::EvictionTaskTimelineState;
|
||||
use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
@@ -92,12 +95,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
|
||||
use super::tasks::log_compaction_error;
|
||||
use super::upload_queue::NotInitialized;
|
||||
use super::{
|
||||
AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
|
||||
AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
};
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use crate::aux_file::AuxFileSizeEstimator;
|
||||
use crate::basebackup_cache::BasebackupPrepareRequest;
|
||||
use crate::basebackup_cache::BasebackupCache;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
@@ -175,19 +178,19 @@ pub enum LastImageLayerCreationStatus {
|
||||
|
||||
impl std::fmt::Display for ImageLayerCreationMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
write!(f, "{self:?}")
|
||||
}
|
||||
}
|
||||
|
||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||
/// Can be removed after all refactors are done.
|
||||
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
|
||||
fn drop_layer_manager_rlock(rlock: LayerManagerReadGuard<'_>) {
|
||||
drop(rlock)
|
||||
}
|
||||
|
||||
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
|
||||
/// Can be removed after all refactors are done.
|
||||
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
fn drop_layer_manager_wlock(rlock: LayerManagerWriteGuard<'_>) {
|
||||
drop(rlock)
|
||||
}
|
||||
|
||||
@@ -198,7 +201,7 @@ pub struct TimelineResources {
|
||||
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub l0_compaction_trigger: Arc<Notify>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
pub basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
pub basebackup_cache: Arc<BasebackupCache>,
|
||||
pub feature_resolver: FeatureResolver,
|
||||
}
|
||||
|
||||
@@ -222,7 +225,7 @@ pub struct Timeline {
|
||||
/// to shards, and is constant through the lifetime of this Timeline.
|
||||
shard_identity: ShardIdentity,
|
||||
|
||||
pub pg_version: u32,
|
||||
pub pg_version: PgMajorVersion,
|
||||
|
||||
/// The tuple has two elements.
|
||||
/// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote).
|
||||
@@ -241,7 +244,7 @@ pub struct Timeline {
|
||||
///
|
||||
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
|
||||
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
|
||||
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
|
||||
pub(crate) layers: LockedLayerManager,
|
||||
|
||||
last_freeze_at: AtomicLsn,
|
||||
// Atomic would be more appropriate here.
|
||||
@@ -445,7 +448,7 @@ pub struct Timeline {
|
||||
wait_lsn_log_slow: tokio::sync::Semaphore,
|
||||
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
feature_resolver: FeatureResolver,
|
||||
}
|
||||
@@ -629,7 +632,7 @@ pub enum ReadPathLayerId {
|
||||
impl std::fmt::Display for ReadPathLayerId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key),
|
||||
ReadPathLayerId::PersistentLayer(key) => write!(f, "{key}"),
|
||||
ReadPathLayerId::InMemoryLayer(range) => {
|
||||
write!(f, "in-mem {}..{}", range.start, range.end)
|
||||
}
|
||||
@@ -705,7 +708,7 @@ impl MissingKeyError {
|
||||
|
||||
impl std::fmt::Debug for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self)
|
||||
write!(f, "{self}")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -718,19 +721,19 @@ impl std::fmt::Display for MissingKeyError {
|
||||
)?;
|
||||
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
write!(f, ", ancestor {ancestor_lsn}")?;
|
||||
}
|
||||
|
||||
if let Some(ref query) = self.query {
|
||||
write!(f, ", query {}", query)?;
|
||||
write!(f, ", query {query}")?;
|
||||
}
|
||||
|
||||
if let Some(ref read_path) = self.read_path {
|
||||
write!(f, "\n{}", read_path)?;
|
||||
write!(f, "\n{read_path}")?;
|
||||
}
|
||||
|
||||
if let Some(ref backtrace) = self.backtrace {
|
||||
write!(f, "\n{}", backtrace)?;
|
||||
write!(f, "\n{backtrace}")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -813,7 +816,7 @@ impl From<layer_manager::Shutdown> for FlushLayerError {
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetVectoredError {
|
||||
pub enum GetVectoredError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
@@ -846,7 +849,7 @@ impl From<GetReadyAncestorError> for GetVectoredError {
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetReadyAncestorError {
|
||||
pub enum GetReadyAncestorError {
|
||||
#[error("ancestor LSN wait error")]
|
||||
AncestorLsnTimeout(#[from] WaitLsnError),
|
||||
|
||||
@@ -936,7 +939,7 @@ impl std::fmt::Debug for Timeline {
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug, Clone)]
|
||||
pub(crate) enum WaitLsnError {
|
||||
pub enum WaitLsnError {
|
||||
// Called on a timeline which is shutting down
|
||||
#[error("Shutdown")]
|
||||
Shutdown,
|
||||
@@ -1055,8 +1058,8 @@ pub(crate) enum WaitLsnWaiter<'a> {
|
||||
/// Argument to [`Timeline::shutdown`].
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) enum ShutdownMode {
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk. This method can
|
||||
/// take multiple seconds for a busy timeline.
|
||||
///
|
||||
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
|
||||
/// the call to [`Timeline::shutdown`].
|
||||
@@ -1535,7 +1538,10 @@ impl Timeline {
|
||||
/// This method makes no distinction between local and remote layers.
|
||||
/// Hence, the result **does not represent local filesystem usage**.
|
||||
pub(crate) async fn layer_size_sum(&self) -> u64 {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
guard.layer_size_sum()
|
||||
}
|
||||
|
||||
@@ -1845,7 +1851,7 @@ impl Timeline {
|
||||
// time, and this was missed.
|
||||
// if write_guard.is_none() { return; }
|
||||
|
||||
let Ok(layers_guard) = self.layers.try_read() else {
|
||||
let Ok(layers_guard) = self.layers.try_read(LayerManagerLockHolder::TryFreezeLayer) else {
|
||||
// Don't block if the layer lock is busy
|
||||
return;
|
||||
};
|
||||
@@ -1896,16 +1902,11 @@ impl Timeline {
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(current_size) = open_layer.try_len() else {
|
||||
// Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
|
||||
// read lock to get size should always succeed.
|
||||
tracing::warn!("Lock conflict while reading size of open layer");
|
||||
return;
|
||||
};
|
||||
let current_size = open_layer.len();
|
||||
|
||||
let current_lsn = self.get_last_record_lsn();
|
||||
|
||||
let checkpoint_distance_override = open_layer.tick().await;
|
||||
let checkpoint_distance_override = open_layer.tick();
|
||||
|
||||
if let Some(size_override) = checkpoint_distance_override {
|
||||
if current_size > size_override {
|
||||
@@ -2158,7 +2159,7 @@ impl Timeline {
|
||||
if let ShutdownMode::FreezeAndFlush = mode {
|
||||
let do_flush = if let Some((open, frozen)) = self
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::Shutdown)
|
||||
.await
|
||||
.layer_map()
|
||||
.map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
|
||||
@@ -2262,7 +2263,10 @@ impl Timeline {
|
||||
// Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate
|
||||
// open.
|
||||
let mut write_guard = self.write_lock.lock().await;
|
||||
self.layers.write().await.shutdown(&mut write_guard);
|
||||
self.layers
|
||||
.write(LayerManagerLockHolder::Shutdown)
|
||||
.await
|
||||
.shutdown(&mut write_guard);
|
||||
}
|
||||
|
||||
// Finally wait until any gate-holders are complete.
|
||||
@@ -2365,7 +2369,10 @@ impl Timeline {
|
||||
&self,
|
||||
reset: LayerAccessStatsReset,
|
||||
) -> Result<LayerMapInfo, layer_manager::Shutdown> {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
|
||||
if let Some(open_layer) = &layer_map.open_layer {
|
||||
@@ -2493,6 +2500,13 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
|
||||
}
|
||||
|
||||
/// Try to get a basebackup from the on-disk cache.
|
||||
pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option<tokio::fs::File> {
|
||||
self.basebackup_cache
|
||||
.get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Prepare basebackup for the given LSN and store it in the basebackup cache.
|
||||
/// The method is asynchronous and returns immediately.
|
||||
/// The actual basebackup preparation is performed in the background
|
||||
@@ -2506,18 +2520,16 @@ impl Timeline {
|
||||
// Preparing basebackup doesn't make sense for shards other than shard zero.
|
||||
return;
|
||||
}
|
||||
|
||||
let res = self
|
||||
.basebackup_prepare_sender
|
||||
.send(BasebackupPrepareRequest {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
timeline_id: self.timeline_id,
|
||||
lsn,
|
||||
});
|
||||
if let Err(e) = res {
|
||||
// May happen during shutdown, it's not critical.
|
||||
info!("Failed to send shutdown checkpoint: {e:#}");
|
||||
if !self.is_active() {
|
||||
// May happen during initial timeline creation.
|
||||
// Such timeline is not in the global timeline map yet,
|
||||
// so basebackup cache will not be able to find it.
|
||||
// TODO(diko): We can prepare such timelines in finish_creation().
|
||||
return;
|
||||
}
|
||||
|
||||
self.basebackup_cache
|
||||
.send_prepare(self.tenant_shard_id, self.timeline_id, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2899,7 +2911,7 @@ impl Timeline {
|
||||
shard_identity: ShardIdentity,
|
||||
walredo_mgr: Option<Arc<super::WalRedoManager>>,
|
||||
resources: TimelineResources,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
state: TimelineState,
|
||||
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
||||
create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
||||
@@ -3074,7 +3086,7 @@ impl Timeline {
|
||||
|
||||
wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
|
||||
|
||||
basebackup_prepare_sender: resources.basebackup_prepare_sender,
|
||||
basebackup_cache: resources.basebackup_cache,
|
||||
|
||||
feature_resolver: resources.feature_resolver,
|
||||
};
|
||||
@@ -3225,7 +3237,7 @@ impl Timeline {
|
||||
|
||||
/// Initialize with an empty layer map. Used when creating a new timeline.
|
||||
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
|
||||
let mut layers = self.layers.try_write().expect(
|
||||
let mut layers = self.layers.try_write(LayerManagerLockHolder::Init).expect(
|
||||
"in the context where we call this function, no other task has access to the object",
|
||||
);
|
||||
layers
|
||||
@@ -3245,7 +3257,10 @@ impl Timeline {
|
||||
use init::Decision::*;
|
||||
use init::{Discovered, DismissedLayer};
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::LoadLayerMap)
|
||||
.await;
|
||||
|
||||
let timer = self.metrics.load_layer_map_histo.start_timer();
|
||||
|
||||
@@ -3400,10 +3415,6 @@ impl Timeline {
|
||||
// TenantShard::create_timeline will wait for these uploads to happen before returning, or
|
||||
// on retry.
|
||||
|
||||
// Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
|
||||
drop(guard); // drop write lock, update_layer_visibility will take a read lock.
|
||||
self.update_layer_visibility().await?;
|
||||
|
||||
info!(
|
||||
"loaded layer map with {} layers at {}, total physical size: {}",
|
||||
num_layers, disk_consistent_lsn, total_physical_size
|
||||
@@ -3862,7 +3873,10 @@ impl Timeline {
|
||||
&self,
|
||||
layer_name: &LayerName,
|
||||
) -> Result<Option<Layer>, layer_manager::Shutdown> {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer = guard
|
||||
.layer_map()?
|
||||
.iter_historic_layers()
|
||||
@@ -3895,7 +3909,10 @@ impl Timeline {
|
||||
return None;
|
||||
}
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GenerateHeatmap)
|
||||
.await;
|
||||
|
||||
// Firstly, if there's any heatmap left over from when this location
|
||||
// was a secondary, take that into account. Keep layers that are:
|
||||
@@ -3993,7 +4010,10 @@ impl Timeline {
|
||||
}
|
||||
|
||||
pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GenerateHeatmap)
|
||||
.await;
|
||||
|
||||
let now = SystemTime::now();
|
||||
let mut heatmap_layers = Vec::default();
|
||||
@@ -4335,7 +4355,7 @@ impl Timeline {
|
||||
query: &VersionedKeySpaceQuery,
|
||||
) -> Result<LayerFringe, GetVectoredError> {
|
||||
let mut fringe = LayerFringe::new();
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::GetPage).await;
|
||||
|
||||
match query {
|
||||
VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
|
||||
@@ -4438,7 +4458,7 @@ impl Timeline {
|
||||
// required for correctness, but avoids visiting extra layers
|
||||
// which turns out to be a perf bottleneck in some cases.
|
||||
if !unmapped_keyspace.is_empty() {
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline.layers.read(LayerManagerLockHolder::GetPage).await;
|
||||
guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;
|
||||
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
@@ -4548,7 +4568,10 @@ impl Timeline {
|
||||
_guard: &tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::GetLayerForWrite)
|
||||
.await;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
ensure!(
|
||||
@@ -4590,7 +4613,10 @@ impl Timeline {
|
||||
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
|
||||
) -> Result<u64, FlushLayerError> {
|
||||
let frozen = {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::TryFreezeLayer)
|
||||
.await;
|
||||
guard
|
||||
.open_mut()?
|
||||
.try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics)
|
||||
@@ -4631,7 +4657,12 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
// Subscribe to L0 delta layer updates, for compaction backpressure.
|
||||
let mut watch_l0 = match self.layers.read().await.layer_map() {
|
||||
let mut watch_l0 = match self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::FlushLoop)
|
||||
.await
|
||||
.layer_map()
|
||||
{
|
||||
Ok(lm) => lm.watch_level0_deltas(),
|
||||
Err(Shutdown) => return,
|
||||
};
|
||||
@@ -4668,7 +4699,7 @@ impl Timeline {
|
||||
|
||||
// Fetch the next layer to flush, if any.
|
||||
let (layer, l0_count, frozen_count, frozen_size) = {
|
||||
let layers = self.layers.read().await;
|
||||
let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await;
|
||||
let Ok(lm) = layers.layer_map() else {
|
||||
info!("dropping out of flush loop for timeline shutdown");
|
||||
return;
|
||||
@@ -4964,7 +4995,10 @@ impl Timeline {
|
||||
// in-memory layer from the map now. The flushed layer is stored in
|
||||
// the mapping in `create_delta_layer`.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::FlushFrozenLayer)
|
||||
.await;
|
||||
|
||||
guard.open_mut()?.finish_flush_l0_layer(
|
||||
delta_layer_to_add.as_ref(),
|
||||
@@ -5166,7 +5200,11 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
|
||||
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
|
||||
let dense_partitioning = dense_ks.partition(
|
||||
&self.shard_identity,
|
||||
partition_size,
|
||||
postgres_ffi::BLCKSZ as u64,
|
||||
);
|
||||
let sparse_partitioning = SparseKeyPartitioning {
|
||||
parts: vec![sparse_ks],
|
||||
}; // no partitioning for metadata keys for now
|
||||
@@ -5179,7 +5217,7 @@ impl Timeline {
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let Ok(layers) = guard.layer_map() else {
|
||||
return false;
|
||||
};
|
||||
@@ -5597,7 +5635,7 @@ impl Timeline {
|
||||
if let ImageLayerCreationMode::Force = mode {
|
||||
// When forced to create image layers, we might try and create them where they already
|
||||
// exist. This mode is only used in tests/debug.
|
||||
let layers = self.layers.read().await;
|
||||
let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
if layers.contains_key(&PersistentLayerKey {
|
||||
key_range: img_range.clone(),
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
|
||||
@@ -5722,7 +5760,7 @@ impl Timeline {
|
||||
|
||||
let image_layers = batch_image_writer.finish(self, ctx).await?;
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;
|
||||
|
||||
// FIXME: we could add the images to be uploaded *before* returning from here, but right
|
||||
// now they are being scheduled outside of write lock; current way is inconsistent with
|
||||
@@ -5730,7 +5768,7 @@ impl Timeline {
|
||||
guard
|
||||
.open_mut()?
|
||||
.track_new_image_layers(&image_layers, &self.metrics);
|
||||
drop_wlock(guard);
|
||||
drop_layer_manager_wlock(guard);
|
||||
let duration = timer.stop_and_record();
|
||||
|
||||
// Creating image layers may have caused some previously visible layers to be covered
|
||||
@@ -5894,7 +5932,7 @@ impl Drop for Timeline {
|
||||
if let Ok(mut gc_info) = ancestor.gc_info.write() {
|
||||
if !gc_info.remove_child_not_offloaded(self.timeline_id) {
|
||||
tracing::error!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id,
|
||||
"Couldn't remove retain_lsn entry from offloaded timeline's parent: already removed");
|
||||
"Couldn't remove retain_lsn entry from timeline's parent on drop: already removed");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6100,7 +6138,7 @@ impl Timeline {
|
||||
layers_to_remove: &[Layer],
|
||||
) -> Result<(), CompactionError> {
|
||||
let mut guard = tokio::select! {
|
||||
guard = self.layers.write() => guard,
|
||||
guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
@@ -6149,7 +6187,7 @@ impl Timeline {
|
||||
self.remote_client
|
||||
.schedule_compaction_update(&remove_layers, new_deltas)?;
|
||||
|
||||
drop_wlock(guard);
|
||||
drop_layer_manager_wlock(guard);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6159,7 +6197,7 @@ impl Timeline {
|
||||
mut replace_layers: Vec<(Layer, ResidentLayer)>,
|
||||
mut drop_layers: Vec<Layer>,
|
||||
) -> Result<(), CompactionError> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;
|
||||
|
||||
// Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
|
||||
// to avoid double-removing, and avoid rewriting something that was removed.
|
||||
@@ -6498,7 +6536,7 @@ impl Timeline {
|
||||
|
||||
debug!("retain_lsns: {:?}", retain_lsns);
|
||||
|
||||
let mut layers_to_remove = Vec::new();
|
||||
let max_retain_lsn = retain_lsns.iter().max();
|
||||
|
||||
// Scan all layers in the timeline (remote or on-disk).
|
||||
//
|
||||
@@ -6508,105 +6546,110 @@ impl Timeline {
|
||||
// 3. it doesn't need to be retained for 'retain_lsns';
|
||||
// 4. it does not need to be kept for LSNs holding valid leases.
|
||||
// 5. newer on-disk image layers cover the layer's whole key range
|
||||
//
|
||||
// TODO holding a write lock is too agressive and avoidable
|
||||
let mut guard = self.layers.write().await;
|
||||
let layers = guard.layer_map()?;
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
result.layers_total += 1;
|
||||
let layers_to_remove = {
|
||||
let mut layers_to_remove = Vec::new();
|
||||
|
||||
// 1. Is it newer than GC horizon cutoff point?
|
||||
if l.get_lsn_range().end > space_cutoff {
|
||||
info!(
|
||||
"keeping {} because it's newer than space_cutoff {}",
|
||||
l.layer_name(),
|
||||
space_cutoff,
|
||||
);
|
||||
result.layers_needed_by_cutoff += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
let layers = guard.layer_map()?;
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
result.layers_total += 1;
|
||||
|
||||
// 2. It is newer than PiTR cutoff point?
|
||||
if l.get_lsn_range().end > time_cutoff {
|
||||
info!(
|
||||
"keeping {} because it's newer than time_cutoff {}",
|
||||
l.layer_name(),
|
||||
time_cutoff,
|
||||
);
|
||||
result.layers_needed_by_pitr += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
// 3. Is it needed by a child branch?
|
||||
// NOTE With that we would keep data that
|
||||
// might be referenced by child branches forever.
|
||||
// We can track this in child timeline GC and delete parent layers when
|
||||
// they are no longer needed. This might be complicated with long inheritance chains.
|
||||
//
|
||||
// TODO Vec is not a great choice for `retain_lsns`
|
||||
for retain_lsn in &retain_lsns {
|
||||
// start_lsn is inclusive
|
||||
if &l.get_lsn_range().start <= retain_lsn {
|
||||
info!(
|
||||
"keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
|
||||
// 1. Is it newer than GC horizon cutoff point?
|
||||
if l.get_lsn_range().end > space_cutoff {
|
||||
debug!(
|
||||
"keeping {} because it's newer than space_cutoff {}",
|
||||
l.layer_name(),
|
||||
retain_lsn,
|
||||
l.is_incremental(),
|
||||
space_cutoff,
|
||||
);
|
||||
result.layers_needed_by_branches += 1;
|
||||
result.layers_needed_by_cutoff += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Is there a valid lease that requires us to keep this layer?
|
||||
if let Some(lsn) = &max_lsn_with_valid_lease {
|
||||
// keep if layer start <= any of the lease
|
||||
if &l.get_lsn_range().start <= lsn {
|
||||
info!(
|
||||
"keeping {} because there is a valid lease preventing GC at {}",
|
||||
// 2. It is newer than PiTR cutoff point?
|
||||
if l.get_lsn_range().end > time_cutoff {
|
||||
debug!(
|
||||
"keeping {} because it's newer than time_cutoff {}",
|
||||
l.layer_name(),
|
||||
lsn,
|
||||
time_cutoff,
|
||||
);
|
||||
result.layers_needed_by_leases += 1;
|
||||
result.layers_needed_by_pitr += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
// 3. Is it needed by a child branch?
|
||||
// NOTE With that we would keep data that
|
||||
// might be referenced by child branches forever.
|
||||
// We can track this in child timeline GC and delete parent layers when
|
||||
// they are no longer needed. This might be complicated with long inheritance chains.
|
||||
if let Some(retain_lsn) = max_retain_lsn {
|
||||
// start_lsn is inclusive
|
||||
if &l.get_lsn_range().start <= retain_lsn {
|
||||
debug!(
|
||||
"keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
|
||||
l.layer_name(),
|
||||
retain_lsn,
|
||||
l.is_incremental(),
|
||||
);
|
||||
result.layers_needed_by_branches += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Is there a valid lease that requires us to keep this layer?
|
||||
if let Some(lsn) = &max_lsn_with_valid_lease {
|
||||
// keep if layer start <= any of the lease
|
||||
if &l.get_lsn_range().start <= lsn {
|
||||
debug!(
|
||||
"keeping {} because there is a valid lease preventing GC at {}",
|
||||
l.layer_name(),
|
||||
lsn,
|
||||
);
|
||||
result.layers_needed_by_leases += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Is there a later on-disk layer for this relation?
|
||||
//
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
//
|
||||
// For example, imagine that the following layers exist:
|
||||
//
|
||||
// 1000 - image (A)
|
||||
// 1000-2000 - delta (B)
|
||||
// 2000 - image (C)
|
||||
// 2000-3000 - delta (D)
|
||||
// 3000 - image (E)
|
||||
//
|
||||
// If GC horizon is at 2500, we can remove layers A and B, but
|
||||
// we cannot remove C, even though it's older than 2500, because
|
||||
// the delta layer 2000-3000 depends on it.
|
||||
if !layers
|
||||
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
|
||||
{
|
||||
debug!("keeping {} because it is the latest layer", l.layer_name());
|
||||
result.layers_not_updated += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
// We didn't find any reason to keep this file, so remove it.
|
||||
info!(
|
||||
"garbage collecting {} is_dropped: xx is_incremental: {}",
|
||||
l.layer_name(),
|
||||
l.is_incremental(),
|
||||
);
|
||||
layers_to_remove.push(l);
|
||||
}
|
||||
|
||||
// 5. Is there a later on-disk layer for this relation?
|
||||
//
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
//
|
||||
// For example, imagine that the following layers exist:
|
||||
//
|
||||
// 1000 - image (A)
|
||||
// 1000-2000 - delta (B)
|
||||
// 2000 - image (C)
|
||||
// 2000-3000 - delta (D)
|
||||
// 3000 - image (E)
|
||||
//
|
||||
// If GC horizon is at 2500, we can remove layers A and B, but
|
||||
// we cannot remove C, even though it's older than 2500, because
|
||||
// the delta layer 2000-3000 depends on it.
|
||||
if !layers
|
||||
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
|
||||
{
|
||||
info!("keeping {} because it is the latest layer", l.layer_name());
|
||||
result.layers_not_updated += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
// We didn't find any reason to keep this file, so remove it.
|
||||
info!(
|
||||
"garbage collecting {} is_dropped: xx is_incremental: {}",
|
||||
l.layer_name(),
|
||||
l.is_incremental(),
|
||||
);
|
||||
layers_to_remove.push(l);
|
||||
}
|
||||
layers_to_remove
|
||||
};
|
||||
|
||||
if !layers_to_remove.is_empty() {
|
||||
// Persist the new GC cutoff value before we actually remove anything.
|
||||
@@ -6622,15 +6665,19 @@ impl Timeline {
|
||||
}
|
||||
})?;
|
||||
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
|
||||
let gc_layers = layers_to_remove
|
||||
.iter()
|
||||
.map(|x| guard.get_from_desc(x))
|
||||
.flat_map(|desc| guard.try_get_from_key(&desc.key()).cloned())
|
||||
.collect::<Vec<Layer>>();
|
||||
|
||||
result.layers_removed = gc_layers.len() as u64;
|
||||
|
||||
self.remote_client.schedule_gc_update(&gc_layers)?;
|
||||
|
||||
guard.open_mut()?.finish_gc_timeline(&gc_layers);
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -6812,7 +6859,10 @@ impl Timeline {
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskState;
|
||||
|
||||
let remaining = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let Ok(lm) = guard.layer_map() else {
|
||||
// technically here we could look into iterating accessible layers, but downloading
|
||||
// all layers of a shutdown timeline makes no sense regardless.
|
||||
@@ -6918,7 +6968,7 @@ impl Timeline {
|
||||
impl Timeline {
|
||||
/// Returns non-remote layers for eviction.
|
||||
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Eviction).await;
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
|
||||
let resident_layers = guard
|
||||
@@ -7019,7 +7069,7 @@ impl Timeline {
|
||||
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
info!("force created image layer {}", image_layer.local_path());
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
guard
|
||||
.open_mut()
|
||||
.unwrap()
|
||||
@@ -7082,7 +7132,7 @@ impl Timeline {
|
||||
let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
info!("force created delta layer {}", delta_layer.local_path());
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
guard
|
||||
.open_mut()
|
||||
.unwrap()
|
||||
@@ -7127,9 +7177,7 @@ impl Timeline {
|
||||
if let Some(end) = layer_end_lsn {
|
||||
assert!(
|
||||
end <= last_record_lsn,
|
||||
"advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
|
||||
end,
|
||||
last_record_lsn,
|
||||
"advance last record lsn before inserting a layer, end_lsn={end}, last_record_lsn={last_record_lsn}",
|
||||
);
|
||||
}
|
||||
|
||||
@@ -7177,7 +7225,7 @@ impl Timeline {
|
||||
|
||||
// Link the layer to the layer map
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
|
||||
let layer_map = guard.open_mut().unwrap();
|
||||
layer_map.force_insert_in_memory_layer(Arc::new(layer));
|
||||
}
|
||||
@@ -7194,7 +7242,7 @@ impl Timeline {
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> anyhow::Result<Vec<(Key, Bytes)>> {
|
||||
let mut all_data = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
for layer in guard.layer_map()?.iter_historic_layers() {
|
||||
if !layer.is_delta() && layer.image_layer_lsn() == lsn {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
@@ -7223,7 +7271,7 @@ impl Timeline {
|
||||
self: &Arc<Timeline>,
|
||||
) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
|
||||
let mut layers = Vec::new();
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
for layer in guard.layer_map()?.iter_historic_layers() {
|
||||
layers.push(layer.key());
|
||||
}
|
||||
@@ -7315,7 +7363,7 @@ impl TimelineWriter<'_> {
|
||||
.tl
|
||||
.get_layer_for_write(at, &self.write_guard, ctx)
|
||||
.await?;
|
||||
let initial_size = layer.size().await?;
|
||||
let initial_size = layer.len();
|
||||
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
self.write_guard.replace(TimelineWriterState::new(
|
||||
@@ -7335,7 +7383,7 @@ impl TimelineWriter<'_> {
|
||||
let l0_count = self
|
||||
.tl
|
||||
.layers
|
||||
.read()
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
@@ -7543,17 +7591,19 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use std::iter::Iterator;
|
||||
use tracing::Instrument;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use super::HeatMapTimeline;
|
||||
use crate::context::RequestContextBuilder;
|
||||
use crate::tenant::harness::{TenantHarness, test_img};
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError};
|
||||
use crate::tenant::{PreviousHeatmap, Timeline};
|
||||
|
||||
@@ -7616,7 +7666,7 @@ mod tests {
|
||||
.create_test_timeline_with_layers(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
delta_layers,
|
||||
@@ -7661,7 +7711,7 @@ mod tests {
|
||||
// Evict all the layers and stash the old heatmap in the timeline.
|
||||
// This simulates a migration to a cold secondary location.
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
let mut all_layers = Vec::new();
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
for layer in guard.likely_resident_layers() {
|
||||
@@ -7752,7 +7802,7 @@ mod tests {
|
||||
.create_test_timeline_with_layers(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
delta_layers,
|
||||
@@ -7783,7 +7833,7 @@ mod tests {
|
||||
})));
|
||||
|
||||
// Evict all the layers in the previous heatmap
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await;
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
for layer in guard.likely_resident_layers() {
|
||||
layer.evict_and_wait(forever).await.unwrap();
|
||||
@@ -7812,7 +7862,12 @@ mod tests {
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.create_test_timeline(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
PgMajorVersion::PG14,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -7846,7 +7901,10 @@ mod tests {
|
||||
}
|
||||
|
||||
async fn find_some_layer(timeline: &Timeline) -> Layer {
|
||||
let layers = timeline.layers.read().await;
|
||||
let layers = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let desc = layers
|
||||
.layer_map()
|
||||
.unwrap()
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::ops::Range;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::Timeline;
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
pub(crate) struct RangeAnalysis {
|
||||
@@ -24,7 +25,10 @@ impl Timeline {
|
||||
|
||||
let num_of_l0;
|
||||
let all_layer_files = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
num_of_l0 = guard.layer_map().unwrap().level0_deltas().len();
|
||||
guard.all_persistent_layers()
|
||||
};
|
||||
|
||||
@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard};
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
|
||||
GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
|
||||
@@ -29,9 +29,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
|
||||
use pageserver_api::key::{KEY_SIZE, Key};
|
||||
use pageserver_api::keyspace::{KeySpace, ShardedRange};
|
||||
use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use pageserver_api::value::Value;
|
||||
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
|
||||
use pageserver_compaction::interface::*;
|
||||
use serde::Serialize;
|
||||
@@ -41,6 +39,8 @@ use tracing::{Instrument, debug, error, info, info_span, trace, warn};
|
||||
use utils::critical;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
use wal_decoder::models::value::Value;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
@@ -62,7 +62,7 @@ use crate::tenant::storage_layer::{
|
||||
use crate::tenant::tasks::log_compaction_error;
|
||||
use crate::tenant::timeline::{
|
||||
DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
|
||||
ResidentLayer, drop_rlock,
|
||||
ResidentLayer, drop_layer_manager_rlock,
|
||||
};
|
||||
use crate::tenant::{DeltaLayer, MaybeOffloaded};
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
@@ -314,7 +314,10 @@ impl GcCompactionQueue {
|
||||
.unwrap_or(Lsn::INVALID);
|
||||
|
||||
let layers = {
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
@@ -408,7 +411,10 @@ impl GcCompactionQueue {
|
||||
timeline: &Arc<Timeline>,
|
||||
lsn: Lsn,
|
||||
) -> Result<u64, CompactionError> {
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let layers = layer_map.iter_historic_layers().collect_vec();
|
||||
let mut size = 0;
|
||||
@@ -851,7 +857,7 @@ impl KeyHistoryRetention {
|
||||
}
|
||||
let layer_generation;
|
||||
{
|
||||
let guard = tline.layers.read().await;
|
||||
let guard = tline.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
if !guard.contains_key(key) {
|
||||
return false;
|
||||
}
|
||||
@@ -971,7 +977,7 @@ impl KeyHistoryRetention {
|
||||
tline
|
||||
.reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
|
||||
.await
|
||||
.with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
|
||||
.with_context(|| format!("verification failed for key {key} at lsn {lsn}"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1282,7 +1288,10 @@ impl Timeline {
|
||||
// We do the repartition on the L0-L1 boundary. All data below the boundary
|
||||
// are compacted by L0 with low read amplification, thus making the `repartition`
|
||||
// function run fast.
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
guard
|
||||
.all_persistent_layers()
|
||||
.iter()
|
||||
@@ -1461,7 +1470,7 @@ impl Timeline {
|
||||
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
|
||||
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let layers_iter = layers.layer_map()?.iter_historic_layers();
|
||||
let (layers_total, mut layers_checked) = (layers_iter.len(), 0);
|
||||
for layer_desc in layers_iter {
|
||||
@@ -1722,7 +1731,10 @@ impl Timeline {
|
||||
// are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
|
||||
// Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
|
||||
// they will be subject to L0->L1 compaction in the near future.
|
||||
let layer_manager = self.layers.read().await;
|
||||
let layer_manager = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GetLayerMapInfo)
|
||||
.await;
|
||||
let layer_map = layer_manager.layer_map()?;
|
||||
|
||||
let readable_points = {
|
||||
@@ -1775,7 +1787,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let begin = tokio::time::Instant::now();
|
||||
let phase1_layers_locked = self.layers.read().await;
|
||||
let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let now = tokio::time::Instant::now();
|
||||
stats.read_lock_acquisition_micros =
|
||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||
@@ -1803,7 +1815,7 @@ impl Timeline {
|
||||
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
|
||||
async fn compact_level0_phase1<'a>(
|
||||
self: &'a Arc<Self>,
|
||||
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
|
||||
guard: LayerManagerReadGuard<'a>,
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
target_file_size: u64,
|
||||
force_compaction_ignore_threshold: bool,
|
||||
@@ -2029,7 +2041,7 @@ impl Timeline {
|
||||
holes
|
||||
};
|
||||
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
|
||||
drop_rlock(guard);
|
||||
drop_layer_manager_rlock(guard);
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
@@ -2469,7 +2481,7 @@ impl Timeline {
|
||||
|
||||
// Find the top of the historical layers
|
||||
let end_lsn = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let layers = guard.layer_map()?;
|
||||
|
||||
let l0_deltas = layers.level0_deltas();
|
||||
@@ -2635,15 +2647,15 @@ impl Timeline {
|
||||
use std::fmt::Write;
|
||||
let mut output = String::new();
|
||||
if let Some((key, _, _)) = replay_history.first() {
|
||||
write!(output, "key={} ", key).unwrap();
|
||||
write!(output, "key={key} ").unwrap();
|
||||
let mut cnt = 0;
|
||||
for (_, lsn, val) in replay_history {
|
||||
if val.is_image() {
|
||||
write!(output, "i@{} ", lsn).unwrap();
|
||||
write!(output, "i@{lsn} ").unwrap();
|
||||
} else if val.will_init() {
|
||||
write!(output, "di@{} ", lsn).unwrap();
|
||||
write!(output, "di@{lsn} ").unwrap();
|
||||
} else {
|
||||
write!(output, "d@{} ", lsn).unwrap();
|
||||
write!(output, "d@{lsn} ").unwrap();
|
||||
}
|
||||
cnt += 1;
|
||||
if cnt >= 128 {
|
||||
@@ -3008,7 +3020,7 @@ impl Timeline {
|
||||
}
|
||||
split_key_ranges.sort();
|
||||
let all_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
@@ -3112,12 +3124,12 @@ impl Timeline {
|
||||
.await?;
|
||||
let jobs_len = jobs.len();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
info!(
|
||||
"running enhanced gc bottom-most compaction, sub-compaction {}/{}",
|
||||
idx + 1,
|
||||
jobs_len
|
||||
);
|
||||
let sub_compaction_progress = format!("{}/{}", idx + 1, jobs_len);
|
||||
self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
|
||||
.instrument(info_span!(
|
||||
"sub_compaction",
|
||||
sub_compaction_progress = sub_compaction_progress
|
||||
))
|
||||
.await?;
|
||||
}
|
||||
if jobs_len == 0 {
|
||||
@@ -3185,7 +3197,10 @@ impl Timeline {
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
|
||||
let job_desc = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
let layers = guard.layer_map()?;
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let mut retain_lsns_below_horizon = Vec::new();
|
||||
@@ -3956,7 +3971,10 @@ impl Timeline {
|
||||
|
||||
// First, do a sanity check to ensure the newly-created layer map does not contain overlaps.
|
||||
let all_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
@@ -4020,7 +4038,10 @@ impl Timeline {
|
||||
let update_guard = self.gc_compaction_layer_update_lock.write().await;
|
||||
// Acquiring the update guard ensures current read operations end and new read operations are blocked.
|
||||
// TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect?
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = self
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::GarbageCollection)
|
||||
.await;
|
||||
guard
|
||||
.open_mut()?
|
||||
.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics);
|
||||
@@ -4088,7 +4109,11 @@ impl TimelineAdaptor {
|
||||
|
||||
pub async fn flush_updates(&mut self) -> Result<(), CompactionError> {
|
||||
let layers_to_delete = {
|
||||
let guard = self.timeline.layers.read().await;
|
||||
let guard = self
|
||||
.timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Compaction)
|
||||
.await;
|
||||
self.layers_to_delete
|
||||
.iter()
|
||||
.map(|x| guard.get_from_desc(x))
|
||||
@@ -4133,7 +4158,11 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
|
||||
self.flush_updates().await?;
|
||||
|
||||
let guard = self.timeline.layers.read().await;
|
||||
let guard = self
|
||||
.timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Compaction)
|
||||
.await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
|
||||
let result = layer_map
|
||||
@@ -4172,7 +4201,11 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
// this is a lot more complex than a simple downcast...
|
||||
if layer.is_delta() {
|
||||
let l = {
|
||||
let guard = self.timeline.layers.read().await;
|
||||
let guard = self
|
||||
.timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::Compaction)
|
||||
.await;
|
||||
guard.get_from_desc(layer)
|
||||
};
|
||||
let result = l.download_and_keep_resident(ctx).await?;
|
||||
|
||||
@@ -19,7 +19,7 @@ use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateError;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::layer_manager::{LayerManager, LayerManagerLockHolder};
|
||||
use super::{FlushLayerError, Timeline};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
@@ -199,7 +199,10 @@ pub(crate) async fn generate_tombstone_image_layer(
|
||||
let image_lsn = ancestor_lsn;
|
||||
|
||||
{
|
||||
let layers = detached.layers.read().await;
|
||||
let layers = detached
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::DetachAncestor)
|
||||
.await;
|
||||
for layer in layers.all_persistent_layers() {
|
||||
if !layer.is_delta
|
||||
&& layer.lsn_range.start == image_lsn
|
||||
@@ -423,7 +426,7 @@ pub(super) async fn prepare(
|
||||
// we do not need to start from our layers, because they can only be layers that come
|
||||
// *after* ancestor_lsn
|
||||
let layers = tokio::select! {
|
||||
guard = ancestor.layers.read() => guard,
|
||||
guard = ancestor.layers.read(LayerManagerLockHolder::DetachAncestor) => guard,
|
||||
_ = detached.cancel.cancelled() => {
|
||||
return Err(ShuttingDown);
|
||||
}
|
||||
@@ -869,7 +872,12 @@ async fn remote_copy(
|
||||
|
||||
// Double check that the file is orphan (probably from an earlier attempt), then delete it
|
||||
let key = file_name.clone().into();
|
||||
if adoptee.layers.read().await.contains_key(&key) {
|
||||
if adoptee
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::DetachAncestor)
|
||||
.await
|
||||
.contains_key(&key)
|
||||
{
|
||||
// We are supposed to filter out such cases before coming to this function
|
||||
return Err(Error::Prepare(anyhow::anyhow!(
|
||||
"layer file {file_name} already present and inside layer map"
|
||||
|
||||
@@ -33,6 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError;
|
||||
use crate::tenant::storage_layer::LayerVisibilityHint;
|
||||
use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random};
|
||||
use crate::tenant::timeline::EvictionError;
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, TenantShard};
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -208,7 +209,7 @@ impl Timeline {
|
||||
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
{
|
||||
let guard = self.layers.read().await;
|
||||
let guard = self.layers.read(LayerManagerLockHolder::Eviction).await;
|
||||
|
||||
guard
|
||||
.likely_resident_layers()
|
||||
|
||||
@@ -15,6 +15,7 @@ use super::{Timeline, TimelineDeleteProgress};
|
||||
use crate::context::RequestContext;
|
||||
use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
|
||||
mod flow;
|
||||
mod importbucket_client;
|
||||
@@ -163,7 +164,10 @@ async fn prepare_import(
|
||||
info!("wipe the slate clean");
|
||||
{
|
||||
// TODO: do we need to hold GC lock for this?
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let mut guard = timeline
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::ImportPgData)
|
||||
.await;
|
||||
assert!(
|
||||
guard.layer_map()?.open_layer.is_none(),
|
||||
"while importing, there should be no in-memory layer" // this just seems like a good place to assert it
|
||||
|
||||
@@ -36,8 +36,8 @@ use pageserver_api::keyspace::{ShardedRange, singleton_range};
|
||||
use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::relfile_utils::parse_relfilename;
|
||||
use postgres_ffi::{BLCKSZ, pg_constants};
|
||||
use remote_storage::RemotePath;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_stream::StreamExt;
|
||||
@@ -56,6 +56,7 @@ use crate::pgdatadir_mapping::{
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer};
|
||||
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
|
||||
|
||||
pub async fn run(
|
||||
timeline: Arc<Timeline>,
|
||||
@@ -557,7 +558,7 @@ impl PgDataDir {
|
||||
PgDataDirDb::new(
|
||||
storage,
|
||||
&basedir.join(dboid.to_string()),
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
postgres_ffi_types::constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&datadir_path,
|
||||
)
|
||||
@@ -570,7 +571,7 @@ impl PgDataDir {
|
||||
PgDataDirDb::new(
|
||||
storage,
|
||||
&datadir_path.join("global"),
|
||||
postgres_ffi::pg_constants::GLOBALTABLESPACE_OID,
|
||||
postgres_ffi_types::constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&datadir_path,
|
||||
)
|
||||
@@ -984,7 +985,10 @@ impl ChunkProcessingJob {
|
||||
let (desc, path) = writer.finish(ctx).await?;
|
||||
|
||||
{
|
||||
let guard = timeline.layers.read().await;
|
||||
let guard = timeline
|
||||
.layers
|
||||
.read(LayerManagerLockHolder::ImportPgData)
|
||||
.await;
|
||||
let existing_layer = guard.try_get_from_key(&desc.key());
|
||||
if let Some(layer) = existing_layer {
|
||||
if layer.metadata().generation == timeline.generation {
|
||||
@@ -1007,7 +1011,10 @@ impl ChunkProcessingJob {
|
||||
// certain that the existing layer is identical to the new one, so in that case
|
||||
// we replace the old layer with the one we just generated.
|
||||
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let mut guard = timeline
|
||||
.layers
|
||||
.write(LayerManagerLockHolder::ImportPgData)
|
||||
.await;
|
||||
|
||||
let existing_layer = guard
|
||||
.try_get_from_key(&resident_layer.layer_desc().key())
|
||||
@@ -1036,7 +1043,7 @@ impl ChunkProcessingJob {
|
||||
}
|
||||
}
|
||||
|
||||
crate::tenant::timeline::drop_wlock(guard);
|
||||
crate::tenant::timeline::drop_layer_manager_wlock(guard);
|
||||
|
||||
timeline
|
||||
.remote_client
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::ControlFileData;
|
||||
use postgres_ffi::{ControlFileData, PgMajorVersion};
|
||||
use remote_storage::{
|
||||
Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing,
|
||||
ListingObject, RemotePath, RemoteStorageConfig,
|
||||
@@ -264,7 +264,7 @@ impl ControlFile {
|
||||
pub(crate) fn base_lsn(&self) -> Lsn {
|
||||
Lsn(self.control_file_data.checkPoint).align()
|
||||
}
|
||||
pub(crate) fn pg_version(&self) -> u32 {
|
||||
pub(crate) fn pg_version(&self) -> PgMajorVersion {
|
||||
self.try_pg_version()
|
||||
.expect("prepare() checks that try_pg_version doesn't error")
|
||||
}
|
||||
@@ -274,13 +274,14 @@ impl ControlFile {
|
||||
pub(crate) fn control_file_buf(&self) -> &Bytes {
|
||||
&self.control_file_buf
|
||||
}
|
||||
fn try_pg_version(&self) -> anyhow::Result<u32> {
|
||||
|
||||
fn try_pg_version(&self) -> anyhow::Result<PgMajorVersion> {
|
||||
Ok(match self.control_file_data.catalog_version_no {
|
||||
// thesea are from catversion.h
|
||||
202107181 => 14,
|
||||
202209061 => 15,
|
||||
202307071 => 16,
|
||||
202406281 => 17,
|
||||
202107181 => PgMajorVersion::PG14,
|
||||
202209061 => PgMajorVersion::PG15,
|
||||
202307071 => PgMajorVersion::PG16,
|
||||
202406281 => PgMajorVersion::PG17,
|
||||
catversion => {
|
||||
anyhow::bail!("unrecognized catalog version {catversion}")
|
||||
}
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
use std::mem::ManuallyDrop;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, bail, ensure};
|
||||
use itertools::Itertools;
|
||||
@@ -20,6 +23,155 @@ use crate::tenant::storage_layer::{
|
||||
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
|
||||
};
|
||||
|
||||
/// Warn if the lock was held for longer than this threshold.
|
||||
/// It's very generous and we should bring this value down over time.
|
||||
const LAYER_MANAGER_LOCK_WARN_THRESHOLD: Duration = Duration::from_secs(5);
|
||||
const LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD: Duration = Duration::from_secs(30);
|
||||
|
||||
/// Describes the operation that is holding the layer manager lock
|
||||
#[derive(Debug, Clone, Copy, strum_macros::Display)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum LayerManagerLockHolder {
|
||||
GetLayerMapInfo,
|
||||
GenerateHeatmap,
|
||||
GetPage,
|
||||
Init,
|
||||
LoadLayerMap,
|
||||
GetLayerForWrite,
|
||||
TryFreezeLayer,
|
||||
FlushFrozenLayer,
|
||||
FlushLoop,
|
||||
Compaction,
|
||||
GarbageCollection,
|
||||
Shutdown,
|
||||
ImportPgData,
|
||||
DetachAncestor,
|
||||
Eviction,
|
||||
#[cfg(test)]
|
||||
Testing,
|
||||
}
|
||||
|
||||
/// Wrapper for the layer manager that tracks the amount of time during which
|
||||
/// it was held under read or write lock
|
||||
#[derive(Default)]
|
||||
pub(crate) struct LockedLayerManager {
|
||||
locked: tokio::sync::RwLock<LayerManager>,
|
||||
}
|
||||
|
||||
pub(crate) struct LayerManagerReadGuard<'a> {
|
||||
guard: ManuallyDrop<tokio::sync::RwLockReadGuard<'a, LayerManager>>,
|
||||
acquired_at: std::time::Instant,
|
||||
holder: LayerManagerLockHolder,
|
||||
}
|
||||
|
||||
pub(crate) struct LayerManagerWriteGuard<'a> {
|
||||
guard: ManuallyDrop<tokio::sync::RwLockWriteGuard<'a, LayerManager>>,
|
||||
acquired_at: std::time::Instant,
|
||||
holder: LayerManagerLockHolder,
|
||||
}
|
||||
|
||||
impl Drop for LayerManagerReadGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
// Drop the lock first, before potentially warning if it was held for too long.
|
||||
// SAFETY: ManuallyDrop in Drop implementation
|
||||
unsafe { ManuallyDrop::drop(&mut self.guard) };
|
||||
|
||||
let held_for = self.acquired_at.elapsed();
|
||||
if held_for >= LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD {
|
||||
tracing::warn!(
|
||||
holder=%self.holder,
|
||||
"Layer manager read lock held for {}s",
|
||||
held_for.as_secs_f64(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LayerManagerWriteGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
// Drop the lock first, before potentially warning if it was held for too long.
|
||||
// SAFETY: ManuallyDrop in Drop implementation
|
||||
unsafe { ManuallyDrop::drop(&mut self.guard) };
|
||||
|
||||
let held_for = self.acquired_at.elapsed();
|
||||
if held_for >= LAYER_MANAGER_LOCK_WARN_THRESHOLD {
|
||||
tracing::warn!(
|
||||
holder=%self.holder,
|
||||
"Layer manager write lock held for {}s",
|
||||
held_for.as_secs_f64(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for LayerManagerReadGuard<'_> {
|
||||
type Target = LayerManager;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.guard.deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for LayerManagerWriteGuard<'_> {
|
||||
type Target = LayerManager;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.guard.deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for LayerManagerWriteGuard<'_> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.guard.deref_mut()
|
||||
}
|
||||
}
|
||||
|
||||
impl LockedLayerManager {
|
||||
pub(crate) async fn read(&self, holder: LayerManagerLockHolder) -> LayerManagerReadGuard {
|
||||
let guard = ManuallyDrop::new(self.locked.read().await);
|
||||
LayerManagerReadGuard {
|
||||
guard,
|
||||
acquired_at: std::time::Instant::now(),
|
||||
holder,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn try_read(
|
||||
&self,
|
||||
holder: LayerManagerLockHolder,
|
||||
) -> Result<LayerManagerReadGuard, tokio::sync::TryLockError> {
|
||||
let guard = ManuallyDrop::new(self.locked.try_read()?);
|
||||
|
||||
Ok(LayerManagerReadGuard {
|
||||
guard,
|
||||
acquired_at: std::time::Instant::now(),
|
||||
holder,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) async fn write(&self, holder: LayerManagerLockHolder) -> LayerManagerWriteGuard {
|
||||
let guard = ManuallyDrop::new(self.locked.write().await);
|
||||
LayerManagerWriteGuard {
|
||||
guard,
|
||||
acquired_at: std::time::Instant::now(),
|
||||
holder,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn try_write(
|
||||
&self,
|
||||
holder: LayerManagerLockHolder,
|
||||
) -> Result<LayerManagerWriteGuard, tokio::sync::TryLockError> {
|
||||
let guard = ManuallyDrop::new(self.locked.try_write()?);
|
||||
|
||||
Ok(LayerManagerWriteGuard {
|
||||
guard,
|
||||
acquired_at: std::time::Instant::now(),
|
||||
holder,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
pub(crate) enum LayerManager {
|
||||
/// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
|
||||
|
||||
@@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let copy_stream = replication_client.copy_both_simple(&query).await?;
|
||||
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx);
|
||||
let walingest_res = select! {
|
||||
walingest_res = walingest_future => walingest_res,
|
||||
_ = cancellation.cancelled() => {
|
||||
// We are doing reads in WalIngest::new, and those can hang as they come from the network.
|
||||
// Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
|
||||
debug!("Connection cancelled");
|
||||
return Err(WalReceiverError::Cancelled);
|
||||
},
|
||||
};
|
||||
let mut walingest = walingest_res.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
|
||||
let (format, compression) = match protocol {
|
||||
PostgresClientProtocol::Interpreted {
|
||||
@@ -360,8 +368,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
match raw_wal_start_lsn.cmp(&expected_wal_start) {
|
||||
std::cmp::Ordering::Greater => {
|
||||
let msg = format!(
|
||||
"Gap in streamed WAL: [{}, {})",
|
||||
expected_wal_start, raw_wal_start_lsn
|
||||
"Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn})"
|
||||
);
|
||||
critical!("{msg}");
|
||||
return Err(WalReceiverError::Other(anyhow!(msg)));
|
||||
|
||||
@@ -68,16 +68,9 @@ impl<A: Alignment> AlignedBuffer<A> {
|
||||
|
||||
assert!(
|
||||
begin <= end,
|
||||
"range start must not be greater than end: {:?} <= {:?}",
|
||||
begin,
|
||||
end,
|
||||
);
|
||||
assert!(
|
||||
end <= len,
|
||||
"range end out of bounds: {:?} <= {:?}",
|
||||
end,
|
||||
len,
|
||||
"range start must not be greater than end: {begin:?} <= {end:?}",
|
||||
);
|
||||
assert!(end <= len, "range end out of bounds: {end:?} <= {len:?}",);
|
||||
|
||||
let begin = self.range.start + begin;
|
||||
let end = self.range.start + end;
|
||||
|
||||
@@ -242,10 +242,7 @@ unsafe impl<A: Alignment> bytes::BufMut for AlignedBufferMut<A> {
|
||||
/// Panic with a nice error message.
|
||||
#[cold]
|
||||
fn panic_advance(idx: usize, len: usize) -> ! {
|
||||
panic!(
|
||||
"advance out of bounds: the len is {} but advancing by {}",
|
||||
len, idx
|
||||
);
|
||||
panic!("advance out of bounds: the len is {len} but advancing by {idx}");
|
||||
}
|
||||
|
||||
/// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer,
|
||||
|
||||
@@ -28,20 +28,20 @@ use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::key::{Key, rel_block_to_key};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::walrecord::*;
|
||||
use postgres_ffi::{
|
||||
TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch,
|
||||
fsm_logical_to_physical, pg_constants,
|
||||
PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion,
|
||||
enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants,
|
||||
};
|
||||
use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use tracing::*;
|
||||
use utils::bin_ser::{DeserializeError, SerializeError};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use utils::{critical, failpoint_support};
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
use wal_decoder::models::*;
|
||||
|
||||
use crate::ZERO_PAGE;
|
||||
@@ -781,7 +781,7 @@ impl WalIngest {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let (xact_common, is_commit, is_prepared) = match record {
|
||||
XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
|
||||
let xid: u64 = if modification.tline.pg_version >= 17 {
|
||||
let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
|
||||
self.adjust_to_full_transaction_id(xl_xid)?
|
||||
} else {
|
||||
xl_xid as u64
|
||||
@@ -886,7 +886,7 @@ impl WalIngest {
|
||||
xl_xid, parsed.xid, lsn,
|
||||
);
|
||||
|
||||
let xid: u64 = if modification.tline.pg_version >= 17 {
|
||||
let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
|
||||
self.adjust_to_full_transaction_id(parsed.xid)?
|
||||
} else {
|
||||
parsed.xid as u64
|
||||
@@ -1241,7 +1241,7 @@ impl WalIngest {
|
||||
if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
|
||||
&& info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
let oldest_active_xid = if pg_version >= 17 {
|
||||
let oldest_active_xid = if pg_version >= PgMajorVersion::PG17 {
|
||||
let mut oldest_active_full_xid = cp.nextXid.value;
|
||||
for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
|
||||
if xid < oldest_active_full_xid {
|
||||
@@ -1475,10 +1475,11 @@ impl WalIngest {
|
||||
|
||||
const fn rate_limiter(
|
||||
&self,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
) -> Option<&Lazy<Mutex<RateLimit>>> {
|
||||
const MIN_PG_VERSION: u32 = 14;
|
||||
const MAX_PG_VERSION: u32 = 17;
|
||||
const MIN_PG_VERSION: u32 = PgMajorVersion::PG14.major_version_num();
|
||||
const MAX_PG_VERSION: u32 = PgMajorVersion::PG17.major_version_num();
|
||||
let pg_version = pg_version.major_version_num();
|
||||
|
||||
if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION {
|
||||
return None;
|
||||
@@ -1603,6 +1604,7 @@ async fn get_relsize(
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use anyhow::Result;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use postgres_ffi::RELSEG_SIZE;
|
||||
|
||||
use super::*;
|
||||
@@ -1625,7 +1627,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
|
||||
for i in 14..=16 {
|
||||
for i in PgMajorVersion::ALL {
|
||||
dispatch_pgversion!(i, {
|
||||
pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
|
||||
});
|
||||
@@ -2108,7 +2110,7 @@ mod tests {
|
||||
// Check relation content
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
let data = format!("foo blk {blkno} at {lsn}");
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(
|
||||
@@ -2142,7 +2144,7 @@ mod tests {
|
||||
|
||||
for blkno in 0..1 {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
let data = format!("foo blk {blkno} at {lsn}");
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(
|
||||
@@ -2167,7 +2169,7 @@ mod tests {
|
||||
);
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
let data = format!("foo blk {blkno} at {lsn}");
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(
|
||||
@@ -2188,7 +2190,7 @@ mod tests {
|
||||
let lsn = Lsn(0x80);
|
||||
let mut m = tline.begin_modification(lsn);
|
||||
for blkno in 0..relsize {
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
let data = format!("foo blk {blkno} at {lsn}");
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
|
||||
.await?;
|
||||
@@ -2210,7 +2212,7 @@ mod tests {
|
||||
// Check relation content
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x80);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
let data = format!("foo blk {blkno} at {lsn}");
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(
|
||||
@@ -2335,7 +2337,7 @@ mod tests {
|
||||
// 5. Grep sk logs for "restart decoder" to get startpoint
|
||||
// 6. Run just the decoder from this test to get the endpoint.
|
||||
// It's the last LSN the decoder will output.
|
||||
let pg_version = 15; // The test data was generated by pg15
|
||||
let pg_version = PgMajorVersion::PG15; // The test data was generated by pg15
|
||||
let path = "test_data/sk_wal_segment_from_pgbench";
|
||||
let wal_segment_path = format!("{path}/000000010000000000000001.zst");
|
||||
let source_initdb_path = format!("{path}/{INITDB_PATH}");
|
||||
@@ -2414,6 +2416,6 @@ mod tests {
|
||||
}
|
||||
|
||||
let duration = started_at.elapsed();
|
||||
println!("done in {:?}", duration);
|
||||
println!("done in {duration:?}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,12 +32,13 @@ use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateError;
|
||||
use utils::sync::heavier_once_cell;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::{
|
||||
@@ -165,7 +166,7 @@ impl PostgresRedoManager {
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
redo_attempt_type: RedoAttemptType,
|
||||
) -> Result<Bytes, Error> {
|
||||
if records.is_empty() {
|
||||
@@ -232,7 +233,7 @@ impl PostgresRedoManager {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
|
||||
pub async fn ping(&self, pg_version: PgMajorVersion) -> Result<(), Error> {
|
||||
self.do_with_walredo_process(pg_version, |proc| async move {
|
||||
proc.ping(Duration::from_secs(1))
|
||||
.await
|
||||
@@ -342,7 +343,7 @@ impl PostgresRedoManager {
|
||||
O,
|
||||
>(
|
||||
&self,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
closure: F,
|
||||
) -> Result<O, Error> {
|
||||
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
|
||||
@@ -442,7 +443,7 @@ impl PostgresRedoManager {
|
||||
base_img_lsn: Lsn,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
max_retry_attempts: u32,
|
||||
) -> Result<Bytes, Error> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
@@ -571,11 +572,12 @@ mod tests {
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_ffi::PgMajorVersion;
|
||||
use tracing::Instrument;
|
||||
use utils::id::TenantId;
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use super::PostgresRedoManager;
|
||||
use crate::config::PageServerConf;
|
||||
@@ -586,7 +588,7 @@ mod tests {
|
||||
let h = RedoHarness::new().unwrap();
|
||||
|
||||
h.manager
|
||||
.ping(14)
|
||||
.ping(PgMajorVersion::PG14)
|
||||
.instrument(h.span())
|
||||
.await
|
||||
.expect("ping should work");
|
||||
@@ -612,7 +614,7 @@ mod tests {
|
||||
Lsn::from_str("0/16E2408").unwrap(),
|
||||
None,
|
||||
short_records(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
RedoAttemptType::ReadPage,
|
||||
)
|
||||
.instrument(h.span())
|
||||
@@ -641,7 +643,7 @@ mod tests {
|
||||
Lsn::from_str("0/16E2408").unwrap(),
|
||||
None,
|
||||
short_records(),
|
||||
14,
|
||||
PgMajorVersion::PG14,
|
||||
RedoAttemptType::ReadPage,
|
||||
)
|
||||
.instrument(h.span())
|
||||
@@ -663,7 +665,7 @@ mod tests {
|
||||
Lsn::INVALID,
|
||||
None,
|
||||
short_records(),
|
||||
16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
|
||||
PgMajorVersion::PG16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
|
||||
RedoAttemptType::ReadPage,
|
||||
)
|
||||
.instrument(h.span())
|
||||
|
||||
@@ -2,16 +2,16 @@ use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
use postgres_ffi::v14::nonrelfile_utils::{
|
||||
mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
|
||||
transaction_id_set_status,
|
||||
};
|
||||
use postgres_ffi::{BLCKSZ, pg_constants};
|
||||
use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
/// Can this request be served by neon redo functions
|
||||
/// or we need to pass it to wal-redo postgres process?
|
||||
@@ -52,8 +52,7 @@ pub(crate) fn apply_in_neon(
|
||||
let (rel, _) = key.to_rel_block().context("invalid record")?;
|
||||
assert!(
|
||||
rel.forknum == VISIBILITYMAP_FORKNUM,
|
||||
"TruncateVisibilityMap record on unexpected rel {}",
|
||||
rel
|
||||
"TruncateVisibilityMap record on unexpected rel {rel}"
|
||||
);
|
||||
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
|
||||
map[*trunc_byte + 1..].fill(0u8);
|
||||
@@ -78,8 +77,7 @@ pub(crate) fn apply_in_neon(
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
assert!(
|
||||
rel.forknum == VISIBILITYMAP_FORKNUM,
|
||||
"ClearVisibilityMapFlags record on unexpected rel {}",
|
||||
rel
|
||||
"ClearVisibilityMapFlags record on unexpected rel {rel}"
|
||||
);
|
||||
if let Some(heap_blkno) = *new_heap_blkno {
|
||||
// Calculate the VM block and offset that corresponds to the heap block.
|
||||
@@ -124,8 +122,7 @@ pub(crate) fn apply_in_neon(
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
"ClogSetCommitted record with unexpected key {}",
|
||||
key
|
||||
"ClogSetCommitted record with unexpected key {key}"
|
||||
);
|
||||
for &xid in xids {
|
||||
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
@@ -135,15 +132,11 @@ pub(crate) fn apply_in_neon(
|
||||
// Check that we're modifying the correct CLOG block.
|
||||
assert!(
|
||||
segno == expected_segno,
|
||||
"ClogSetCommitted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
key
|
||||
"ClogSetCommitted record for XID {xid} with unexpected key {key}"
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"ClogSetCommitted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
key
|
||||
"ClogSetCommitted record for XID {xid} with unexpected key {key}"
|
||||
);
|
||||
|
||||
transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page);
|
||||
@@ -169,8 +162,7 @@ pub(crate) fn apply_in_neon(
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
"ClogSetAborted record with unexpected key {}",
|
||||
key
|
||||
"ClogSetAborted record with unexpected key {key}"
|
||||
);
|
||||
for &xid in xids {
|
||||
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
@@ -180,15 +172,11 @@ pub(crate) fn apply_in_neon(
|
||||
// Check that we're modifying the correct CLOG block.
|
||||
assert!(
|
||||
segno == expected_segno,
|
||||
"ClogSetAborted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
key
|
||||
"ClogSetAborted record for XID {xid} with unexpected key {key}"
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"ClogSetAborted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
key
|
||||
"ClogSetAborted record for XID {xid} with unexpected key {key}"
|
||||
);
|
||||
|
||||
transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
|
||||
@@ -199,8 +187,7 @@ pub(crate) fn apply_in_neon(
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactOffsets,
|
||||
"MultixactOffsetCreate record with unexpected key {}",
|
||||
key
|
||||
"MultixactOffsetCreate record with unexpected key {key}"
|
||||
);
|
||||
// Compute the block and offset to modify.
|
||||
// See RecordNewMultiXact in PostgreSQL sources.
|
||||
@@ -213,15 +200,11 @@ pub(crate) fn apply_in_neon(
|
||||
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
assert!(
|
||||
segno == expected_segno,
|
||||
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
|
||||
mid,
|
||||
key
|
||||
"MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}"
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
|
||||
mid,
|
||||
key
|
||||
"MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}"
|
||||
);
|
||||
|
||||
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
|
||||
@@ -231,8 +214,7 @@ pub(crate) fn apply_in_neon(
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactMembers,
|
||||
"MultixactMembersCreate record with unexpected key {}",
|
||||
key
|
||||
"MultixactMembersCreate record with unexpected key {key}"
|
||||
);
|
||||
for (i, member) in members.iter().enumerate() {
|
||||
let offset = moff + i as u32;
|
||||
@@ -249,15 +231,11 @@ pub(crate) fn apply_in_neon(
|
||||
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
assert!(
|
||||
segno == expected_segno,
|
||||
"MultiXactMembersCreate record for offset {} with unexpected key {}",
|
||||
moff,
|
||||
key
|
||||
"MultiXactMembersCreate record for offset {moff} with unexpected key {key}"
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"MultiXactMembersCreate record for offset {} with unexpected key {}",
|
||||
moff,
|
||||
key
|
||||
"MultiXactMembersCreate record for offset {moff} with unexpected key {key}"
|
||||
);
|
||||
|
||||
let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
|
||||
@@ -10,14 +10,14 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{BLCKSZ, PgMajorVersion};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tracing::{Instrument, debug, error, instrument};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::poison::Poison;
|
||||
use wal_decoder::models::record::NeonWalRecord;
|
||||
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::config::PageServerConf;
|
||||
@@ -54,11 +54,11 @@ impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
#[instrument(skip_all,fields(pg_version=pg_version.major_version_num()))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
pg_version: PgMajorVersion,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user