Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517

This commit is contained in:
John Spray
2025-06-26 07:32:08 -07:00
committed by GitHub
425 changed files with 12213 additions and 5436 deletions

View File

@@ -12,6 +12,9 @@ testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "
fuzz-read-path = ["testing"]
# Enables benchmarking only APIs
benchmarking = []
[dependencies]
anyhow.workspace = true
arc-swap.workspace = true
@@ -56,6 +59,7 @@ pin-project-lite.workspace = true
postgres_backend.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true
postgres_ffi_types.workspace = true
postgres_initdb.workspace = true
postgres-protocol.workspace = true
postgres-types.workspace = true
@@ -126,6 +130,7 @@ harness = false
[[bench]]
name = "bench_ingest"
harness = false
required-features = ["benchmarking"]
[[bench]]
name = "upload_queue"

View File

@@ -1,23 +1,30 @@
use std::env;
use std::num::NonZeroUsize;
use std::sync::Arc;
use bytes::Bytes;
use camino::Utf8PathBuf;
use criterion::{Criterion, criterion_group, criterion_main};
use futures::stream::FuturesUnordered;
use pageserver::config::PageServerConf;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::keyspace::KeySpace;
use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::storage_layer::InMemoryLayer;
use pageserver::tenant::storage_layer::IoConcurrency;
use pageserver::tenant::storage_layer::{InMemoryLayer, ValuesReconstructState};
use pageserver::{page_cache, virtual_file};
use pageserver_api::config::GetVectoredConcurrentIo;
use pageserver_api::key::Key;
use pageserver_api::models::virtual_file::IoMode;
use pageserver_api::shard::TenantShardId;
use pageserver_api::value::Value;
use strum::IntoEnumIterator;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
use utils::bin_ser::BeSer;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::sync::gate::Gate;
use wal_decoder::models::value::Value;
use wal_decoder::serialized_batch::SerializedValueBatch;
// A very cheap hash for generating non-sequential keys.
@@ -30,7 +37,7 @@ fn murmurhash32(mut h: u32) -> u32 {
h
}
#[derive(serde::Serialize, Clone, Copy, Debug)]
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
enum KeyLayout {
/// Sequential unique keys
Sequential,
@@ -40,19 +47,30 @@ enum KeyLayout {
RandomReuse(u32),
}
#[derive(serde::Serialize, Clone, Copy, Debug)]
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
enum WriteDelta {
Yes,
No,
}
#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
enum ConcurrentReads {
Yes,
No,
}
async fn ingest(
conf: &'static PageServerConf,
put_size: usize,
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
concurrent_reads: ConcurrentReads,
) -> anyhow::Result<()> {
if concurrent_reads == ConcurrentReads::Yes {
assert_eq!(key_layout, KeyLayout::Sequential);
}
let mut lsn = utils::lsn::Lsn(1000);
let mut key = Key::from_i128(0x0);
@@ -68,16 +86,18 @@ async fn ingest(
let gate = utils::sync::gate::Gate::default();
let cancel = CancellationToken::new();
let layer = InMemoryLayer::create(
conf,
timeline_id,
tenant_shard_id,
lsn,
&gate,
&cancel,
&ctx,
)
.await?;
let layer = Arc::new(
InMemoryLayer::create(
conf,
timeline_id,
tenant_shard_id,
lsn,
&gate,
&cancel,
&ctx,
)
.await?,
);
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
let data_ser_size = data.serialized_size().unwrap() as usize;
@@ -86,6 +106,61 @@ async fn ingest(
pageserver::context::DownloadBehavior::Download,
);
const READ_BATCH_SIZE: u32 = 32;
let (tx, mut rx) = tokio::sync::watch::channel::<Option<Key>>(None);
let reader_cancel = CancellationToken::new();
let reader_handle = if concurrent_reads == ConcurrentReads::Yes {
Some(tokio::task::spawn({
let cancel = reader_cancel.clone();
let layer = layer.clone();
let ctx = ctx.attached_child();
async move {
let gate = Gate::default();
let gate_guard = gate.enter().unwrap();
let io_concurrency = IoConcurrency::spawn_from_conf(
GetVectoredConcurrentIo::SidecarTask,
gate_guard,
);
rx.wait_for(|key| key.is_some()).await.unwrap();
while !cancel.is_cancelled() {
let key = match *rx.borrow() {
Some(some) => some,
None => unreachable!(),
};
let mut start_key = key;
start_key.field6 = key.field6.saturating_sub(READ_BATCH_SIZE);
let key_range = start_key..key.next();
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
layer
.get_values_reconstruct_data(
KeySpace::single(key_range),
Lsn(1)..Lsn(u64::MAX),
&mut reconstruct_state,
&ctx,
)
.await
.unwrap();
let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
.into_values()
.map(|state| state.sink_pending_ios())
.collect::<FuturesUnordered<_>>();
while collect_futs.next().await.is_some() {}
}
drop(io_concurrency);
gate.close().await;
}
}))
} else {
None
};
const BATCH_SIZE: usize = 16;
let mut batch = Vec::new();
@@ -113,19 +188,27 @@ async fn ingest(
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
if batch.len() >= BATCH_SIZE {
let last_key = Key::from_compact(batch.last().unwrap().0);
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedValueBatch::from_values(this_batch);
layer.put_batch(serialized, &ctx).await?;
tx.send(Some(last_key)).unwrap();
}
}
if !batch.is_empty() {
let last_key = Key::from_compact(batch.last().unwrap().0);
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedValueBatch::from_values(this_batch);
layer.put_batch(serialized, &ctx).await?;
tx.send(Some(last_key)).unwrap();
}
layer.freeze(lsn + 1).await;
if matches!(write_delta, WriteDelta::Yes) {
if write_delta == WriteDelta::Yes {
let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
max_concurrency: NonZeroUsize::new(1).unwrap(),
});
@@ -136,6 +219,11 @@ async fn ingest(
tokio::fs::remove_file(path).await?;
}
reader_cancel.cancel();
if let Some(handle) = reader_handle {
handle.await.unwrap();
}
Ok(())
}
@@ -147,6 +235,7 @@ fn ingest_main(
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
concurrent_reads: ConcurrentReads,
) {
pageserver::virtual_file::set_io_mode(io_mode);
@@ -156,7 +245,15 @@ fn ingest_main(
.unwrap();
runtime.block_on(async move {
let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
let r = ingest(
conf,
put_size,
put_count,
key_layout,
write_delta,
concurrent_reads,
)
.await;
if let Err(e) = r {
panic!("{e:?}");
}
@@ -195,6 +292,7 @@ fn criterion_benchmark(c: &mut Criterion) {
key_size: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
concurrent_reads: ConcurrentReads,
}
#[derive(Clone)]
struct HandPickedParameters {
@@ -245,7 +343,7 @@ fn criterion_benchmark(c: &mut Criterion) {
];
let exploded_parameters = {
let mut out = Vec::new();
for io_mode in IoMode::iter() {
for concurrent_reads in [ConcurrentReads::Yes, ConcurrentReads::No] {
for param in expect.clone() {
let HandPickedParameters {
volume_mib,
@@ -253,12 +351,18 @@ fn criterion_benchmark(c: &mut Criterion) {
key_layout,
write_delta,
} = param;
if key_layout != KeyLayout::Sequential && concurrent_reads == ConcurrentReads::Yes {
continue;
}
out.push(ExplodedParameters {
io_mode,
io_mode: IoMode::DirectRw,
volume_mib,
key_size,
key_layout,
write_delta,
concurrent_reads,
});
}
}
@@ -272,9 +376,10 @@ fn criterion_benchmark(c: &mut Criterion) {
key_size,
key_layout,
write_delta,
concurrent_reads,
} = self;
format!(
"io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}"
"io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?} concurrent_reads={concurrent_reads:?}"
)
}
}
@@ -287,12 +392,23 @@ fn criterion_benchmark(c: &mut Criterion) {
key_size,
key_layout,
write_delta,
concurrent_reads,
} = params;
let put_count = volume_mib * 1024 * 1024 / key_size;
group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64));
group.sample_size(10);
group.bench_function(id, |b| {
b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta))
b.iter(|| {
ingest_main(
conf,
io_mode,
key_size,
put_count,
key_layout,
write_delta,
concurrent_reads,
)
})
});
}
}

File diff suppressed because one or more lines are too long

View File

@@ -18,6 +18,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio.workspace = true
postgres_versioninfo.workspace = true
futures.workspace = true
tokio-util.workspace = true
anyhow.workspace = true

View File

@@ -7,6 +7,7 @@ use detach_ancestor::AncestorDetached;
use http_utils::error::HttpErrorBody;
use pageserver_api::models::*;
use pageserver_api::shard::TenantShardId;
use postgres_versioninfo::PgMajorVersion;
pub use reqwest::Body as ReqwestBody;
use reqwest::{IntoUrl, Method, StatusCode, Url};
use utils::id::{TenantId, TimelineId};
@@ -508,11 +509,11 @@ impl Client {
.expect("Cannot build URL");
path.query_pairs_mut()
.append_pair("recurse", &format!("{}", recurse));
.append_pair("recurse", &format!("{recurse}"));
if let Some(concurrency) = concurrency {
path.query_pairs_mut()
.append_pair("concurrency", &format!("{}", concurrency));
.append_pair("concurrency", &format!("{concurrency}"));
}
self.request(Method::POST, path, ()).await.map(|_| ())
@@ -745,9 +746,11 @@ impl Client {
timeline_id: TimelineId,
base_lsn: Lsn,
end_lsn: Lsn,
pg_version: u32,
pg_version: PgMajorVersion,
basebackup_tarball: ReqwestBody,
) -> Result<()> {
let pg_version = pg_version.major_version_num();
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
self.mgmt_api_endpoint,
@@ -841,4 +844,13 @@ impl Client {
.await
.map_err(Error::ReceiveBody)
}
pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> {
let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint);
self.request(Method::POST, uri, spec)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -2,7 +2,7 @@ use std::sync::{Arc, Mutex};
use futures::stream::{SplitSink, SplitStream};
use futures::{SinkExt, StreamExt};
use pageserver_api::models::{
use pageserver_api::pagestream_api::{
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
};
use pageserver_api::reltag::RelTag;

View File

@@ -152,7 +152,7 @@ pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output:
let key_diff = key_end - key_start;
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
panic!("Invalid key range {key_start}-{key_end}");
}
let lsn_start = lsn_map.map(f.lsn_range.start);
@@ -212,12 +212,12 @@ pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output:
)?;
writeln!(svg, "</line>")?;
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"),
}
files_seen.insert(f);
}
writeln!(svg, "{}", EndSvg)?;
writeln!(svg, "{EndSvg}")?;
let mut layer_events_str = String::new();
let mut first = true;

View File

@@ -20,7 +20,7 @@
//!
//! # local timeline dir
//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
//! grep "__" | cargo run --release --bin pagectl draw-timeline > out.svg
//!
//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
@@ -81,7 +81,11 @@ fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
let split: Vec<&str> = name.split("__").collect();
let keys: Vec<&str> = split[0].split('-').collect();
let mut lsns: Vec<&str> = split[1].split('-').collect();
// Remove the temporary file extension, e.g., remove the `.d20a.___temp` part from the following filename:
// 000000067F000040490000404A00441B0000-000000067F000040490000404A00441B4000__000043483A34CE00.d20a.___temp
let lsns = split[1].split('.').collect::<Vec<&str>>()[0];
let mut lsns: Vec<&str> = lsns.split('-').collect();
// The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001
@@ -224,7 +228,7 @@ pub fn main() -> Result<()> {
let lsn_max = lsn_map.len();
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
panic!("Invalid key range {key_start}-{key_end}");
}
let lsn_start = *lsn_map.get(&lsnr.start).unwrap();
@@ -246,7 +250,7 @@ pub fn main() -> Result<()> {
ymargin = 0.05;
fill = Fill::Color(rgb(0, 0, 0));
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"),
}
println!(
@@ -283,10 +287,10 @@ pub fn main() -> Result<()> {
);
}
println!("{}", EndSvg);
println!("{EndSvg}");
eprintln!("num_images: {}", num_images);
eprintln!("num_deltas: {}", num_deltas);
eprintln!("num_images: {num_images}");
eprintln!("num_deltas: {num_deltas}");
Ok(())
}

View File

@@ -372,7 +372,7 @@ impl<const N: usize> std::fmt::Debug for RelTagish<N> {
f.write_char('/')?;
}
first = false;
write!(f, "{}", x)
write!(f, "{x}")
})
}
}

View File

@@ -224,8 +224,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
}
}
println!(
"Total delta layers {} image layers {} excess layers {}",
total_delta_layers, total_image_layers, total_excess_layers
"Total delta layers {total_delta_layers} image layers {total_image_layers} excess layers {total_excess_layers}"
);
Ok(())
}

View File

@@ -13,7 +13,7 @@ use pageserver::{page_cache, virtual_file};
use pageserver_api::key::Key;
use utils::id::{TenantId, TimelineId};
use crate::layer_map_analyzer::parse_filename;
use crate::layer_map_analyzer::{LayerFile, parse_filename};
#[derive(Subcommand)]
pub(crate) enum LayerCmd {
@@ -38,6 +38,8 @@ pub(crate) enum LayerCmd {
/// The id from list-layer command
id: usize,
},
/// Dump all information of a layer file locally
DumpLayerLocal { path: PathBuf },
RewriteSummary {
layer_file_path: Utf8PathBuf,
#[clap(long)]
@@ -131,15 +133,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
}
for (idx, layer_file) in to_print {
println!(
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
idx,
layer_file.key_range.start,
layer_file.key_range.end,
layer_file.lsn_range.start,
layer_file.lsn_range.end,
layer_file.is_delta,
);
print_layer_file(idx, &layer_file);
}
Ok(())
}
@@ -159,16 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
let layer = layer?;
if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
if *id == idx {
// TODO(chi): dedup code
println!(
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
idx,
layer_file.key_range.start,
layer_file.key_range.end,
layer_file.lsn_range.start,
layer_file.lsn_range.end,
layer_file.is_delta,
);
print_layer_file(idx, &layer_file);
if layer_file.is_delta {
read_delta_file(layer.path(), &ctx).await?;
@@ -183,6 +168,18 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
}
Ok(())
}
LayerCmd::DumpLayerLocal { path } => {
if let Ok(layer_file) = parse_filename(path.file_name().unwrap().to_str().unwrap()) {
print_layer_file(0, &layer_file);
if layer_file.is_delta {
read_delta_file(path, &ctx).await?;
} else {
read_image_file(path, &ctx).await?;
}
}
Ok(())
}
LayerCmd::RewriteSummary {
layer_file_path,
new_tenant_id,
@@ -247,3 +244,15 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
}
}
}
fn print_layer_file(idx: usize, layer_file: &LayerFile) {
println!(
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
idx,
layer_file.key_range.start,
layer_file.key_range.end,
layer_file.lsn_range.start,
layer_file.lsn_range.end,
layer_file.is_delta,
);
}

View File

@@ -176,9 +176,11 @@ async fn main() -> anyhow::Result<()> {
let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?;
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
let cancel = CancellationToken::new();
// Complexity limit: as we are running this command locally, we should have a lot of memory available, and we do not
// need to limit the number of versions we are going to delete.
storage
.unwrap()
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel, None)
.await?;
}
Commands::Key(dkc) => dkc.execute(),

View File

@@ -5,11 +5,16 @@ edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
bytes.workspace = true
futures.workspace = true
pageserver_api.workspace = true
postgres_ffi.workspace = true
prost.workspace = true
strum.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
tokio.workspace = true
tonic.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -102,12 +102,27 @@ message CheckRelExistsResponse {
bool exists = 1;
}
// Requests a base backup at a given LSN.
// Requests a base backup.
message GetBaseBackupRequest {
// The LSN to fetch a base backup at.
ReadLsn read_lsn = 1;
// The LSN to fetch the base backup at. 0 or absent means the latest LSN known to the Pageserver.
uint64 lsn = 1;
// If true, logical replication slots will not be created.
bool replica = 2;
// If true, include relation files in the base backup. Mainly for debugging and tests.
bool full = 3;
// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
// compression, so that we can cache compressed backups on the server.
BaseBackupCompression compression = 4;
}
// Base backup compression algorithms.
enum BaseBackupCompression {
// Unknown algorithm. Used when clients send an unsupported algorithm.
BASE_BACKUP_COMPRESSION_UNKNOWN = 0;
// No compression.
BASE_BACKUP_COMPRESSION_NONE = 1;
// GZIP compression.
BASE_BACKUP_COMPRESSION_GZIP = 2;
}
// Base backup response chunk, returned as an ordered stream.

View File

@@ -0,0 +1,199 @@
use std::convert::TryInto;
use bytes::Bytes;
use futures::TryStreamExt;
use futures::{Stream, StreamExt};
use tonic::metadata::AsciiMetadataValue;
use tonic::metadata::errors::InvalidMetadataValue;
use tonic::transport::Channel;
use tonic::{Request, Streaming};
use utils::id::TenantId;
use utils::id::TimelineId;
use utils::shard::ShardIndex;
use anyhow::Result;
use crate::model;
use crate::proto;
///
/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
/// headers are required at the pageserver.
///
#[derive(Clone)]
struct AuthInterceptor {
tenant_id: AsciiMetadataValue,
timeline_id: AsciiMetadataValue,
shard_id: AsciiMetadataValue,
auth_header: Option<AsciiMetadataValue>, // including "Bearer " prefix
}
impl AuthInterceptor {
fn new(
tenant_id: TenantId,
timeline_id: TimelineId,
auth_token: Option<String>,
shard_id: ShardIndex,
) -> Result<Self, InvalidMetadataValue> {
let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
let auth_header: Option<AsciiMetadataValue> = match auth_token {
Some(token) => Some(format!("Bearer {token}").try_into()?),
None => None,
};
Ok(Self {
tenant_id: tenant_ascii,
shard_id: shard_ascii,
timeline_id: timeline_ascii,
auth_header,
})
}
}
impl tonic::service::Interceptor for AuthInterceptor {
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
req.metadata_mut()
.insert("neon-tenant-id", self.tenant_id.clone());
req.metadata_mut()
.insert("neon-shard-id", self.shard_id.clone());
req.metadata_mut()
.insert("neon-timeline-id", self.timeline_id.clone());
if let Some(auth_header) = &self.auth_header {
req.metadata_mut()
.insert("authorization", auth_header.clone());
}
Ok(req)
}
}
#[derive(Clone)]
pub struct Client {
client: proto::PageServiceClient<
tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
>,
}
impl Client {
pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
into_endpoint: T,
tenant_id: TenantId,
timeline_id: TimelineId,
shard_id: ShardIndex,
auth_header: Option<String>,
compression: Option<tonic::codec::CompressionEncoding>,
) -> anyhow::Result<Self> {
let endpoint: tonic::transport::Endpoint = into_endpoint
.try_into()
.map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
let channel = endpoint.connect().await?;
let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
.map_err(|e| anyhow::anyhow!(e.to_string()))?;
let mut client = proto::PageServiceClient::with_interceptor(channel, auth);
if let Some(compression) = compression {
// TODO: benchmark this (including network latency).
client = client
.accept_compressed(compression)
.send_compressed(compression);
}
Ok(Self { client })
}
/// Returns whether a relation exists.
pub async fn check_rel_exists(
&mut self,
req: model::CheckRelExistsRequest,
) -> Result<model::CheckRelExistsResponse, tonic::Status> {
let proto_req = proto::CheckRelExistsRequest::from(req);
let response = self.client.check_rel_exists(proto_req).await?;
let proto_resp = response.into_inner();
Ok(proto_resp.into())
}
/// Fetches a base backup.
pub async fn get_base_backup(
&mut self,
req: model::GetBaseBackupRequest,
) -> Result<impl Stream<Item = Result<Bytes, tonic::Status>> + 'static, tonic::Status> {
let proto_req = proto::GetBaseBackupRequest::from(req);
let response_stream: Streaming<proto::GetBaseBackupResponseChunk> =
self.client.get_base_backup(proto_req).await?.into_inner();
// TODO: Consider dechunking internally
let domain_stream = response_stream.map(|chunk_res| {
chunk_res.and_then(|proto_chunk| {
proto_chunk.try_into().map_err(|e| {
tonic::Status::internal(format!("Failed to convert response chunk: {e}"))
})
})
});
Ok(domain_stream)
}
/// Returns the total size of a database, as # of bytes.
pub async fn get_db_size(
&mut self,
req: model::GetDbSizeRequest,
) -> Result<u64, tonic::Status> {
let proto_req = proto::GetDbSizeRequest::from(req);
let response = self.client.get_db_size(proto_req).await?;
Ok(response.into_inner().into())
}
/// Fetches pages.
///
/// This is implemented as a bidirectional streaming RPC for performance.
/// Per-request errors are often returned as status_code instead of errors,
/// to avoid tearing down the entire stream via tonic::Status.
pub async fn get_pages<ReqSt>(
&mut self,
inbound: ReqSt,
) -> Result<
impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
tonic::Status,
>
where
ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
{
let outbound_proto = inbound.map(|domain_req| domain_req.into());
let req_new = Request::new(outbound_proto);
let response_stream: Streaming<proto::GetPageResponse> =
self.client.get_pages(req_new).await?.into_inner();
let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
Ok(domain_stream)
}
/// Returns the size of a relation, as # of blocks.
pub async fn get_rel_size(
&mut self,
req: model::GetRelSizeRequest,
) -> Result<model::GetRelSizeResponse, tonic::Status> {
let proto_req = proto::GetRelSizeRequest::from(req);
let response = self.client.get_rel_size(proto_req).await?;
let proto_resp = response.into_inner();
Ok(proto_resp.into())
}
/// Fetches an SLRU segment.
pub async fn get_slru_segment(
&mut self,
req: model::GetSlruSegmentRequest,
) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
let proto_req = proto::GetSlruSegmentRequest::from(req);
let response = self.client.get_slru_segment(proto_req).await?;
Ok(response.into_inner().try_into()?)
}
}

View File

@@ -18,6 +18,8 @@ pub mod proto {
pub use page_service_server::{PageService, PageServiceServer};
}
mod client;
pub use client::Client;
mod model;
pub use model::*;

View File

@@ -26,7 +26,7 @@ use utils::lsn::Lsn;
use crate::proto;
/// A protocol error. Typically returned via try_from() or try_into().
#[derive(thiserror::Error, Debug)]
#[derive(thiserror::Error, Clone, Debug)]
pub enum ProtocolError {
#[error("field '{0}' has invalid value '{1}'")]
Invalid(&'static str, String),
@@ -182,13 +182,18 @@ impl From<CheckRelExistsResponse> for proto::CheckRelExistsResponse {
}
}
/// Requests a base backup at a given LSN.
/// Requests a base backup.
#[derive(Clone, Copy, Debug)]
pub struct GetBaseBackupRequest {
/// The LSN to fetch a base backup at.
pub read_lsn: ReadLsn,
/// The LSN to fetch a base backup at. If None, uses the latest LSN known to the Pageserver.
pub lsn: Option<Lsn>,
/// If true, logical replication slots will not be created.
pub replica: bool,
/// If true, include relation files in the base backup. Mainly for debugging and tests.
pub full: bool,
/// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
/// compression, so that we can cache compressed backups on the server.
pub compression: BaseBackupCompression,
}
impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
@@ -196,11 +201,10 @@ impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
Ok(Self {
read_lsn: pb
.read_lsn
.ok_or(ProtocolError::Missing("read_lsn"))?
.try_into()?,
lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
replica: pb.replica,
full: pb.full,
compression: pb.compression.try_into()?,
})
}
}
@@ -208,12 +212,58 @@ impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
fn from(request: GetBaseBackupRequest) -> Self {
Self {
read_lsn: Some(request.read_lsn.into()),
lsn: request.lsn.unwrap_or_default().0,
replica: request.replica,
full: request.full,
compression: request.compression.into(),
}
}
}
/// Base backup compression algorithm.
#[derive(Clone, Copy, Debug)]
pub enum BaseBackupCompression {
None,
Gzip,
}
impl TryFrom<proto::BaseBackupCompression> for BaseBackupCompression {
type Error = ProtocolError;
fn try_from(pb: proto::BaseBackupCompression) -> Result<Self, Self::Error> {
match pb {
proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)),
proto::BaseBackupCompression::None => Ok(Self::None),
proto::BaseBackupCompression::Gzip => Ok(Self::Gzip),
}
}
}
impl TryFrom<i32> for BaseBackupCompression {
type Error = ProtocolError;
fn try_from(compression: i32) -> Result<Self, Self::Error> {
proto::BaseBackupCompression::try_from(compression)
.map_err(|_| ProtocolError::invalid("compression", compression))
.and_then(Self::try_from)
}
}
impl From<BaseBackupCompression> for proto::BaseBackupCompression {
fn from(compression: BaseBackupCompression) -> Self {
match compression {
BaseBackupCompression::None => Self::None,
BaseBackupCompression::Gzip => Self::Gzip,
}
}
}
impl From<BaseBackupCompression> for i32 {
fn from(compression: BaseBackupCompression) -> Self {
proto::BaseBackupCompression::from(compression).into()
}
}
pub type GetBaseBackupResponseChunk = Bytes;
impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
@@ -422,12 +472,45 @@ impl From<GetPageResponse> for proto::GetPageResponse {
}
}
impl GetPageResponse {
/// Attempts to represent a tonic::Status as a GetPageResponse if appropriate. Returning a
/// tonic::Status will terminate the GetPage stream, so per-request errors are emitted as a
/// GetPageResponse with a non-OK status code instead.
#[allow(clippy::result_large_err)]
pub fn try_from_status(
status: tonic::Status,
request_id: RequestID,
) -> Result<Self, tonic::Status> {
// We shouldn't see an OK status here, because we're emitting an error.
debug_assert_ne!(status.code(), tonic::Code::Ok);
if status.code() == tonic::Code::Ok {
return Err(tonic::Status::internal(format!(
"unexpected OK status: {status:?}",
)));
}
// If we can't convert the tonic::Code to a GetPageStatusCode, this is not a per-request
// error and we should return a tonic::Status to terminate the stream.
let Ok(status_code) = status.code().try_into() else {
return Err(status);
};
// Return a GetPageResponse for the status.
Ok(Self {
request_id,
status_code,
reason: Some(status.message().to_string()),
page_images: Vec::new(),
})
}
}
/// A GetPage response status code.
///
/// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
/// (potentially shared by many backends), and a gRPC status response would terminate the stream so
/// we send GetPageResponse messages with these codes instead.
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, PartialEq, strum_macros::Display)]
pub enum GetPageStatusCode {
/// Unknown status. For forwards compatibility: used when an older client version receives a new
/// status code from a newer server version.
@@ -485,8 +568,42 @@ impl From<GetPageStatusCode> for i32 {
}
}
impl TryFrom<tonic::Code> for GetPageStatusCode {
type Error = tonic::Code;
fn try_from(code: tonic::Code) -> Result<Self, Self::Error> {
use tonic::Code;
let status_code = match code {
Code::Ok => Self::Ok,
// These are per-request errors, which should be returned as GetPageResponses.
Code::AlreadyExists => Self::InvalidRequest,
Code::DataLoss => Self::InternalError,
Code::FailedPrecondition => Self::InvalidRequest,
Code::InvalidArgument => Self::InvalidRequest,
Code::Internal => Self::InternalError,
Code::NotFound => Self::NotFound,
Code::OutOfRange => Self::InvalidRequest,
Code::ResourceExhausted => Self::SlowDown,
// These should terminate the stream by returning a tonic::Status.
Code::Aborted
| Code::Cancelled
| Code::DeadlineExceeded
| Code::PermissionDenied
| Code::Unauthenticated
| Code::Unavailable
| Code::Unimplemented
| Code::Unknown => return Err(code),
};
Ok(status_code)
}
}
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
// shards will error.
#[derive(Clone, Copy, Debug)]
pub struct GetRelSizeRequest {
pub read_lsn: ReadLsn,
pub rel: RelTag,
@@ -530,6 +647,7 @@ impl From<GetRelSizeResponse> for proto::GetRelSizeResponse {
}
/// Requests an SLRU segment. Only valid on shard 0, other shards will error.
#[derive(Clone, Copy, Debug)]
pub struct GetSlruSegmentRequest {
pub read_lsn: ReadLsn,
pub kind: SlruKind,

View File

@@ -25,6 +25,7 @@ tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tonic.workspace = true
url.workspace = true
pageserver_client.workspace = true
pageserver_api.workspace = true

View File

@@ -62,7 +62,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
let timeline_id = timeline.timeline_id;
println!("operating on timeline {}", timeline);
println!("operating on timeline {timeline}");
mgmt_api_client
.set_tenant_config(&TenantConfigRequest {
@@ -75,8 +75,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let items = (0..100)
.map(|id| {
(
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
format!("{:08}", id),
format!("pg_logical/mappings/{batch:03}.{id:03}"),
format!("{id:08}"),
)
})
.collect::<HashMap<_, _>>();

View File

@@ -1,20 +1,29 @@
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::ops::Range;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use std::pin::Pin;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Instant;
use anyhow::Context;
use anyhow::anyhow;
use futures::TryStreamExt as _;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest;
use pageserver_page_api as page_api;
use rand::prelude::*;
use tokio::io::AsyncRead;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tokio_util::compat::{TokioAsyncReadCompatExt as _, TokioAsyncWriteCompatExt as _};
use tokio_util::io::StreamReader;
use tonic::async_trait;
use tracing::{info, instrument};
use url::Url;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use utils::shard::ShardIndex;
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
use crate::util::{request_stats, tokio_thread_local_stats};
@@ -24,14 +33,15 @@ use crate::util::{request_stats, tokio_thread_local_stats};
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
/// The Pageserver to connect to. Use postgresql:// for libpq, or grpc:// for gRPC.
#[clap(long, default_value = "postgresql://postgres@localhost:64000")]
page_service_connstring: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(long, default_value = "1")]
num_clients: NonZeroUsize,
#[clap(long, default_value = "1.0")]
gzip_probability: f64,
#[clap(long)]
no_compression: bool,
#[clap(long)]
runtime: Option<humantime::Duration>,
#[clap(long)]
@@ -146,12 +156,27 @@ async fn main_impl(
let mut work_senders = HashMap::new();
let mut tasks = Vec::new();
for tl in &timelines {
let scheme = match Url::parse(&args.page_service_connstring) {
Ok(url) => url.scheme().to_lowercase().to_string(),
Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
Err(err) => return Err(anyhow!("invalid connstring: {err}")),
};
for &tl in &timelines {
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
work_senders.insert(tl, sender);
tasks.push(tokio::spawn(client(
args,
*tl,
let client: Box<dyn Client> = match scheme.as_str() {
"postgresql" | "postgres" => Box::new(
LibpqClient::new(&args.page_service_connstring, tl, !args.no_compression).await?,
),
"grpc" => Box::new(
GrpcClient::new(&args.page_service_connstring, tl, !args.no_compression).await?,
),
scheme => return Err(anyhow!("invalid scheme {scheme}")),
};
tasks.push(tokio::spawn(run_worker(
client,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
@@ -166,13 +191,7 @@ async fn main_impl(
let mut rng = rand::thread_rng();
let target = all_targets.choose(&mut rng).unwrap();
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
(
target.timeline,
Work {
lsn,
gzip: rng.gen_bool(args.gzip_probability),
},
)
(target.timeline, Work { lsn })
};
let sender = work_senders.get(&timeline).unwrap();
// TODO: what if this blocks?
@@ -216,13 +235,11 @@ async fn main_impl(
#[derive(Copy, Clone)]
struct Work {
lsn: Option<Lsn>,
gzip: bool,
}
#[instrument(skip_all)]
async fn client(
args: &'static Args,
timeline: TenantTimelineId,
async fn run_worker(
mut client: Box<dyn Client>,
start_work_barrier: Arc<Barrier>,
mut work: tokio::sync::mpsc::Receiver<Work>,
all_work_done_barrier: Arc<Barrier>,
@@ -230,37 +247,14 @@ async fn client(
) {
start_work_barrier.wait().await;
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
.await
.unwrap();
while let Some(Work { lsn, gzip }) = work.recv().await {
while let Some(Work { lsn }) = work.recv().await {
let start = Instant::now();
let copy_out_stream = client
.basebackup(&BasebackupRequest {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
lsn,
gzip,
})
.await
.with_context(|| format!("start basebackup for {timeline}"))
.unwrap();
let stream = client.basebackup(lsn).await.unwrap();
use futures::StreamExt;
let size = Arc::new(AtomicUsize::new(0));
copy_out_stream
.for_each({
|r| {
let size = Arc::clone(&size);
async move {
let size = Arc::clone(&size);
size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
}
}
})
.await;
info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
let size = futures::io::copy(stream.compat(), &mut tokio::io::sink().compat_write())
.await
.unwrap();
info!("basebackup size is {size} bytes");
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
@@ -270,3 +264,100 @@ async fn client(
all_work_done_barrier.wait().await;
}
/// A basebackup client. This allows switching out the client protocol implementation.
#[async_trait]
trait Client: Send {
async fn basebackup(
&mut self,
lsn: Option<Lsn>,
) -> anyhow::Result<Pin<Box<dyn AsyncRead + Send>>>;
}
/// A libpq-based Pageserver client.
struct LibpqClient {
inner: pageserver_client::page_service::Client,
ttid: TenantTimelineId,
compression: bool,
}
impl LibpqClient {
async fn new(
connstring: &str,
ttid: TenantTimelineId,
compression: bool,
) -> anyhow::Result<Self> {
Ok(Self {
inner: pageserver_client::page_service::Client::new(connstring.to_string()).await?,
ttid,
compression,
})
}
}
#[async_trait]
impl Client for LibpqClient {
async fn basebackup(
&mut self,
lsn: Option<Lsn>,
) -> anyhow::Result<Pin<Box<dyn AsyncRead + Send + 'static>>> {
let req = BasebackupRequest {
tenant_id: self.ttid.tenant_id,
timeline_id: self.ttid.timeline_id,
lsn,
gzip: self.compression,
};
let stream = self.inner.basebackup(&req).await?;
Ok(Box::pin(StreamReader::new(
stream.map_err(std::io::Error::other),
)))
}
}
/// A gRPC Pageserver client.
struct GrpcClient {
inner: page_api::Client,
compression: page_api::BaseBackupCompression,
}
impl GrpcClient {
async fn new(
connstring: &str,
ttid: TenantTimelineId,
compression: bool,
) -> anyhow::Result<Self> {
let inner = page_api::Client::new(
connstring.to_string(),
ttid.tenant_id,
ttid.timeline_id,
ShardIndex::unsharded(),
None,
None, // NB: uses payload compression
)
.await?;
let compression = match compression {
true => page_api::BaseBackupCompression::Gzip,
false => page_api::BaseBackupCompression::None,
};
Ok(Self { inner, compression })
}
}
#[async_trait]
impl Client for GrpcClient {
async fn basebackup(
&mut self,
lsn: Option<Lsn>,
) -> anyhow::Result<Pin<Box<dyn AsyncRead + Send + 'static>>> {
let req = page_api::GetBaseBackupRequest {
lsn,
replica: false,
full: false,
compression: self.compression,
};
let stream = self.inner.get_base_backup(req).await?;
Ok(Box::pin(StreamReader::new(
stream.map_err(std::io::Error::other),
)))
}
}

View File

@@ -10,33 +10,31 @@ use anyhow::Context;
use async_trait::async_trait;
use bytes::Bytes;
use camino::Utf8PathBuf;
use futures::{Stream, StreamExt as _};
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::TenantShardId;
use pageserver_page_api::proto;
use pageserver_page_api as page_api;
use rand::prelude::*;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::info;
use url::Url;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use utils::shard::ShardIndex;
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
use crate::util::{request_stats, tokio_thread_local_stats};
#[derive(clap::ValueEnum, Clone, Debug)]
enum Protocol {
Libpq,
Grpc,
}
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
page_service_connstring: String,
#[clap(long)]
@@ -45,8 +43,9 @@ pub(crate) struct Args {
num_clients: NonZeroUsize,
#[clap(long)]
runtime: Option<humantime::Duration>,
#[clap(long, value_enum, default_value = "libpq")]
protocol: Protocol,
/// If true, enable compression (only for gRPC).
#[clap(long)]
compression: bool,
/// Each client sends requests at the given rate.
///
/// If a request takes too long and we should be issuing a new request already,
@@ -325,18 +324,32 @@ async fn main_impl(
.unwrap();
Box::pin(async move {
let client: Box<dyn Client> = match args.protocol {
Protocol::Libpq => Box::new(
LibpqClient::new(args.page_service_connstring.clone(), worker_id.timeline)
.await
.unwrap(),
let scheme = match Url::parse(&args.page_service_connstring) {
Ok(url) => url.scheme().to_lowercase().to_string(),
Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
Err(err) => panic!("invalid connstring: {err}"),
};
let client: Box<dyn Client> = match scheme.as_str() {
"postgresql" | "postgres" => {
assert!(!args.compression, "libpq does not support compression");
Box::new(
LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
.await
.unwrap(),
)
}
"grpc" => Box::new(
GrpcClient::new(
&args.page_service_connstring,
worker_id.timeline,
args.compression,
)
.await
.unwrap(),
),
Protocol::Grpc => Box::new(
GrpcClient::new(args.page_service_connstring.clone(), worker_id.timeline)
.await
.unwrap(),
),
scheme => panic!("unsupported scheme {scheme}"),
};
run_worker(args, client, ss, cancel, rps_period, ranges, weights).await
})
@@ -543,8 +556,8 @@ struct LibpqClient {
}
impl LibpqClient {
async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
let inner = pageserver_client::page_service::Client::new(connstring)
async fn new(connstring: &str, ttid: TenantTimelineId) -> anyhow::Result<Self> {
let inner = pageserver_client::page_service::Client::new(connstring.to_string())
.await?
.pagestream(ttid.tenant_id, ttid.timeline_id)
.await?;
@@ -600,34 +613,36 @@ impl Client for LibpqClient {
}
}
/// A gRPC client using the raw, no-frills gRPC client.
/// A gRPC Pageserver client.
struct GrpcClient {
req_tx: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
resp_rx: tonic::Streaming<proto::GetPageResponse>,
req_tx: tokio::sync::mpsc::Sender<page_api::GetPageRequest>,
resp_rx: Pin<Box<dyn Stream<Item = Result<page_api::GetPageResponse, tonic::Status>> + Send>>,
}
impl GrpcClient {
async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
let mut client = pageserver_page_api::proto::PageServiceClient::connect(connstring).await?;
async fn new(
connstring: &str,
ttid: TenantTimelineId,
compression: bool,
) -> anyhow::Result<Self> {
let mut client = page_api::Client::new(
connstring.to_string(),
ttid.tenant_id,
ttid.timeline_id,
ShardIndex::unsharded(),
None,
compression.then_some(tonic::codec::CompressionEncoding::Zstd),
)
.await?;
// The channel has a buffer size of 1, since 0 is not allowed. It does not matter, since the
// benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are
// buffered by Tonic and the OS too.
let (req_tx, req_rx) = tokio::sync::mpsc::channel(1);
let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
let mut req = tonic::Request::new(req_stream);
let metadata = req.metadata_mut();
metadata.insert("neon-tenant-id", ttid.tenant_id.to_string().try_into()?);
metadata.insert("neon-timeline-id", ttid.timeline_id.to_string().try_into()?);
metadata.insert("neon-shard-id", "0000".try_into()?);
let resp_rx = Box::pin(client.get_pages(req_stream).await?);
let resp = client.get_pages(req).await?;
let resp_stream = resp.into_inner();
Ok(Self {
req_tx,
resp_rx: resp_stream,
})
Ok(Self { req_tx, resp_rx })
}
}
@@ -641,27 +656,27 @@ impl Client for GrpcClient {
rel: RelTag,
blks: Vec<u32>,
) -> anyhow::Result<()> {
let req = proto::GetPageRequest {
let req = page_api::GetPageRequest {
request_id: req_id,
request_class: proto::GetPageClass::Normal as i32,
read_lsn: Some(proto::ReadLsn {
request_lsn: req_lsn.0,
not_modified_since_lsn: mod_lsn.0,
}),
rel: Some(rel.into()),
block_number: blks,
request_class: page_api::GetPageClass::Normal,
read_lsn: page_api::ReadLsn {
request_lsn: req_lsn,
not_modified_since_lsn: Some(mod_lsn),
},
rel,
block_numbers: blks,
};
self.req_tx.send(req).await?;
Ok(())
}
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
let resp = self.resp_rx.message().await?.unwrap();
let resp = self.resp_rx.next().await.unwrap().unwrap();
anyhow::ensure!(
resp.status_code == proto::GetPageStatusCode::Ok as i32,
resp.status_code == page_api::GetPageStatusCode::Ok,
"unexpected status code: {}",
resp.status_code
resp.status_code,
);
Ok((resp.request_id, resp.page_image))
Ok((resp.request_id, resp.page_images))
}
}

View File

@@ -14,19 +14,19 @@ use std::fmt::Write as FmtWrite;
use std::time::{Instant, SystemTime};
use anyhow::{Context, anyhow};
use async_compression::tokio::write::GzipEncoder;
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{Key, rel_block_to_key};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{
DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID, PG_HBA, PGDATA_SPECIAL_FILES,
};
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
use postgres_ffi::pg_constants::{PG_HBA, PGDATA_SPECIAL_FILES};
use postgres_ffi::{
BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants,
BLCKSZ, PG_TLI, PgMajorVersion, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName,
dispatch_pgversion, pg_constants,
};
use tokio::io;
use tokio::io::AsyncWrite;
use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM};
use tokio::io::{self, AsyncWrite, AsyncWriteExt as _};
use tokio_tar::{Builder, EntryType, Header};
use tracing::*;
use utils::lsn::Lsn;
@@ -97,6 +97,7 @@ impl From<BasebackupError> for tonic::Status {
/// * When working without safekeepers. In this situation it is important to match the lsn
/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
/// to start the replication.
#[allow(clippy::too_many_arguments)]
pub async fn send_basebackup_tarball<'a, W>(
write: &'a mut W,
timeline: &'a Timeline,
@@ -104,6 +105,7 @@ pub async fn send_basebackup_tarball<'a, W>(
prev_lsn: Option<Lsn>,
full_backup: bool,
replica: bool,
gzip_level: Option<async_compression::Level>,
ctx: &'a RequestContext,
) -> Result<(), BasebackupError>
where
@@ -122,7 +124,7 @@ where
// prev_lsn value; that happens if the timeline was just branched from
// an old LSN and it doesn't have any WAL of its own yet. We will set
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. The caller should've
// already checked that it's a valid LSN.
@@ -143,7 +145,7 @@ where
};
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
return Err(BasebackupError::Server(anyhow!(
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
@@ -155,30 +157,55 @@ where
};
info!(
"taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
backup_lsn, prev_lsn, full_backup, replica
"taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \
(full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})",
);
let span = info_span!("send_tarball", backup_lsn=%lsn);
let io_concurrency = IoConcurrency::spawn_from_conf(
timeline.conf.get_vectored_concurrent_io,
timeline
.gate
.enter()
.map_err(|_| BasebackupError::Shutdown)?,
);
let basebackup = Basebackup {
ar: Builder::new_non_terminated(write),
timeline,
lsn: backup_lsn,
prev_record_lsn: prev_lsn,
full_backup,
replica,
ctx,
io_concurrency: IoConcurrency::spawn_from_conf(
timeline.conf.get_vectored_concurrent_io,
timeline
.gate
.enter()
.map_err(|_| BasebackupError::Shutdown)?,
),
};
basebackup
if let Some(gzip_level) = gzip_level {
let mut encoder = GzipEncoder::with_quality(write, gzip_level);
Basebackup {
ar: Builder::new_non_terminated(&mut encoder),
timeline,
lsn,
prev_record_lsn,
full_backup,
replica,
ctx,
io_concurrency,
}
.send_tarball()
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
.await
.instrument(span)
.await?;
encoder
.shutdown()
.await
.map_err(|err| BasebackupError::Client(err, "gzip"))?;
} else {
Basebackup {
ar: Builder::new_non_terminated(write),
timeline,
lsn,
prev_record_lsn,
full_backup,
replica,
ctx,
io_concurrency,
}
.send_tarball()
.instrument(span)
.await?;
}
Ok(())
}
/// This is short-living object only for the time of tarball creation,
@@ -372,6 +399,7 @@ where
.partition(
self.timeline.get_shard_identity(),
self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
BLCKSZ as u64,
);
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
@@ -619,10 +647,7 @@ where
};
if spcnode == GLOBALTABLESPACE_OID {
let pg_version_str = match self.timeline.pg_version {
14 | 15 => self.timeline.pg_version.to_string(),
ver => format!("{ver}\x0A"),
};
let pg_version_str = self.timeline.pg_version.versionfile_string();
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
self.ar
.append(&header, pg_version_str.as_bytes())
@@ -669,7 +694,7 @@ where
}
// Append dir path for each database
let path = format!("base/{}", dbnode);
let path = format!("base/{dbnode}");
let header = new_tar_header_dir(&path)?;
self.ar
.append(&header, io::empty())
@@ -677,19 +702,16 @@ where
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let dst_path = format!("base/{dbnode}/PG_VERSION");
let pg_version_str = match self.timeline.pg_version {
14 | 15 => self.timeline.pg_version.to_string(),
ver => format!("{ver}\x0A"),
};
let pg_version_str = self.timeline.pg_version.versionfile_string();
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let relmap_path = format!("base/{dbnode}/pg_filenode.map");
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar
.append(&header, &img[..])
@@ -713,10 +735,10 @@ where
buf.extend_from_slice(&img[..]);
let crc = crc32c::crc32c(&img[..]);
buf.put_u32_le(crc);
let path = if self.timeline.pg_version < 17 {
format!("pg_twophase/{:>08X}", xid)
let path = if self.timeline.pg_version < PgMajorVersion::PG17 {
format!("pg_twophase/{xid:>08X}")
} else {
format!("pg_twophase/{:>016X}", xid)
format!("pg_twophase/{xid:>016X}")
};
let header = new_tar_header(&path, buf.len() as u64)?;
self.ar
@@ -768,7 +790,7 @@ where
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE);
let wal_file_path = format!("pg_wal/{}", wal_file_name);
let wal_file_path = format!("pg_wal/{wal_file_name}");
let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;
let wal_seg = postgres_ffi::generate_wal_segment(

View File

@@ -1,12 +1,12 @@
use std::{collections::HashMap, sync::Arc};
use async_compression::tokio::write::GzipEncoder;
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use metrics::core::{AtomicU64, GenericCounter};
use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
use tokio::{
io::{AsyncWriteExt, BufWriter},
sync::mpsc::{UnboundedReceiver, UnboundedSender},
sync::mpsc::{Receiver, Sender, error::TrySendError},
};
use tokio_util::sync::CancellationToken;
use utils::{
@@ -18,7 +18,10 @@ use utils::{
use crate::{
basebackup::send_basebackup_tarball,
context::{DownloadBehavior, RequestContext},
metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ},
metrics::{
BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE,
BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE,
},
task_mgr::TaskKind,
tenant::{
Timeline,
@@ -32,11 +35,16 @@ pub struct BasebackupPrepareRequest {
pub lsn: Lsn,
}
pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
pub type BasebackupPrepareSender = Sender<BasebackupPrepareRequest>;
pub type BasebackupPrepareReceiver = Receiver<BasebackupPrepareRequest>;
type BasebackupRemoveEntrySender = UnboundedSender<Utf8PathBuf>;
type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
#[derive(Clone)]
struct CacheEntry {
/// LSN at which the basebackup was taken.
lsn: Lsn,
/// Size of the basebackup archive in bytes.
size_bytes: u64,
}
/// BasebackupCache stores cached basebackup archives for timelines on local disk.
///
@@ -52,68 +60,118 @@ type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
/// and ~1 RPS for get requests.
pub struct BasebackupCache {
data_dir: Utf8PathBuf,
config: BasebackupCacheConfig,
tenant_manager: Arc<TenantManager>,
remove_entry_sender: BasebackupRemoveEntrySender,
config: Option<BasebackupCacheConfig>,
entries: std::sync::Mutex<HashMap<TenantTimelineId, Lsn>>,
entries: std::sync::Mutex<HashMap<TenantTimelineId, CacheEntry>>,
cancel: CancellationToken,
prepare_sender: BasebackupPrepareSender,
read_hit_count: GenericCounter<AtomicU64>,
read_miss_count: GenericCounter<AtomicU64>,
read_err_count: GenericCounter<AtomicU64>,
prepare_ok_count: GenericCounter<AtomicU64>,
prepare_skip_count: GenericCounter<AtomicU64>,
prepare_err_count: GenericCounter<AtomicU64>,
}
impl BasebackupCache {
/// Creates a BasebackupCache and spawns the background task.
/// The initialization of the cache is performed in the background and does not
/// block the caller. The cache will return `None` for any get requests until
/// initialization is complete.
pub fn spawn(
runtime_handle: &tokio::runtime::Handle,
/// Create a new BasebackupCache instance.
/// Also returns a BasebackupPrepareReceiver which is needed to start
/// the background task.
/// The cache is initialized from the data_dir in the background task.
/// The cache will return `None` for any get requests until the initialization is complete.
/// The background task is spawned separately using [`Self::spawn_background_task`]
/// to avoid a circular dependency between the cache and the tenant manager.
pub fn new(
data_dir: Utf8PathBuf,
config: Option<BasebackupCacheConfig>,
prepare_receiver: BasebackupPrepareReceiver,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
) -> Arc<Self> {
let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel();
) -> (Arc<Self>, BasebackupPrepareReceiver) {
let chan_size = config.as_ref().map(|c| c.max_size_entries).unwrap_or(1);
let enabled = config.is_some();
let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size);
let cache = Arc::new(BasebackupCache {
data_dir,
config: config.unwrap_or_default(),
tenant_manager,
remove_entry_sender,
config,
entries: std::sync::Mutex::new(HashMap::new()),
cancel,
prepare_sender,
read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
});
if enabled {
runtime_handle.spawn(
cache
.clone()
.background(prepare_receiver, remove_entry_receiver),
);
}
(cache, prepare_receiver)
}
cache
/// Spawns the background task.
/// The background task initializes the cache from the disk,
/// processes prepare requests, and cleans up outdated cache entries.
/// Noop if the cache is disabled (config is None).
pub fn spawn_background_task(
self: Arc<Self>,
runtime_handle: &tokio::runtime::Handle,
prepare_receiver: BasebackupPrepareReceiver,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
) {
if let Some(config) = self.config.clone() {
let background = BackgroundTask {
c: self,
config,
tenant_manager,
cancel,
entry_count: 0,
total_size_bytes: 0,
prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
};
runtime_handle.spawn(background.run(prepare_receiver));
}
}
/// Send a basebackup prepare request to the background task.
/// The basebackup will be prepared asynchronously, it does not block the caller.
/// The request will be skipped if any cache limits are exceeded.
pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) {
let req = BasebackupPrepareRequest {
tenant_shard_id,
timeline_id,
lsn,
};
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc();
let res = self.prepare_sender.try_send(req);
if let Err(e) = res {
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
self.prepare_skip_count.inc();
match e {
TrySendError::Full(_) => {
// Basebackup prepares are pretty rare, normally we should not hit this.
tracing::info!(
tenant_id = %tenant_shard_id.tenant_id,
%timeline_id,
%lsn,
"Basebackup prepare channel is full, skipping the request"
);
}
TrySendError::Closed(_) => {
// Normal during shutdown, not critical.
tracing::info!(
tenant_id = %tenant_shard_id.tenant_id,
%timeline_id,
%lsn,
"Basebackup prepare channel is closed, skipping the request"
);
}
}
}
}
/// Gets a basebackup entry from the cache.
@@ -126,9 +184,13 @@ impl BasebackupCache {
timeline_id: TimelineId,
lsn: Lsn,
) -> Option<tokio::fs::File> {
if !self.is_enabled() {
return None;
}
// Fast path. Check if the entry exists using the in-memory state.
let tti = TenantTimelineId::new(tenant_id, timeline_id);
if self.entries.lock().unwrap().get(&tti) != Some(&lsn) {
if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) {
self.read_miss_count.inc();
return None;
}
@@ -153,6 +215,10 @@ impl BasebackupCache {
}
}
pub fn is_enabled(&self) -> bool {
self.config.is_some()
}
// Private methods.
fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
@@ -166,6 +232,42 @@ impl BasebackupCache {
self.data_dir
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
}
}
/// The background task that does the job to prepare basebackups
/// and manage the cache entries on disk.
/// It is a separate struct from BasebackupCache to allow holding
/// a mutable reference to this state without a mutex lock,
/// while BasebackupCache is referenced by the clients.
struct BackgroundTask {
c: Arc<BasebackupCache>,
config: BasebackupCacheConfig,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
/// Number of the entries in the cache.
/// This counter is used for metrics and applying cache limits.
/// It generally should be equal to c.entries.len(), but it's calculated
/// pessimistically for abnormal situations: if we encountered some errors
/// during removing the entry from disk, we won't decrement this counter to
/// make sure that we don't exceed the limit with "trashed" files on the disk.
/// It will also count files in the data_dir that are not valid cache entries.
entry_count: usize,
/// Total size of all the entries on the disk.
/// This counter is used for metrics and applying cache limits.
/// Similar to entry_count, it is calculated pessimistically for abnormal situations.
total_size_bytes: u64,
prepare_ok_count: GenericCounter<AtomicU64>,
prepare_skip_count: GenericCounter<AtomicU64>,
prepare_err_count: GenericCounter<AtomicU64>,
}
impl BackgroundTask {
fn tmp_dir(&self) -> Utf8PathBuf {
self.c.data_dir.join("tmp")
}
fn entry_tmp_path(
&self,
@@ -173,9 +275,8 @@ impl BasebackupCache {
timeline_id: TimelineId,
lsn: Lsn,
) -> Utf8PathBuf {
self.data_dir
.join("tmp")
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
self.tmp_dir()
.join(BasebackupCache::entry_filename(tenant_id, timeline_id, lsn))
}
fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> {
@@ -194,18 +295,21 @@ impl BasebackupCache {
Some((tenant_id, timeline_id, lsn))
}
async fn cleanup(&self) -> anyhow::Result<()> {
// Cleanup tmp directory.
let tmp_dir = self.data_dir.join("tmp");
let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?;
while let Some(dir_entry) = tmp_dir.next_entry().await? {
if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await {
tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
}
// Recreate the tmp directory to clear all files in it.
async fn clean_tmp_dir(&self) -> anyhow::Result<()> {
let tmp_dir = self.tmp_dir();
if tmp_dir.exists() {
tokio::fs::remove_dir_all(&tmp_dir).await?;
}
tokio::fs::create_dir_all(&tmp_dir).await?;
Ok(())
}
// Remove outdated entries.
let entries_old = self.entries.lock().unwrap().clone();
async fn cleanup(&mut self) -> anyhow::Result<()> {
self.clean_tmp_dir().await?;
// Leave only up-to-date entries.
let entries_old = self.c.entries.lock().unwrap().clone();
let mut entries_new = HashMap::new();
for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() {
if !tenant_shard_id.is_shard_zero() {
@@ -218,43 +322,42 @@ impl BasebackupCache {
for timeline in tenant.list_timelines() {
let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id);
if let Some(&entry_lsn) = entries_old.get(&tti) {
if timeline.get_last_record_lsn() <= entry_lsn {
entries_new.insert(tti, entry_lsn);
if let Some(entry) = entries_old.get(&tti) {
if timeline.get_last_record_lsn() <= entry.lsn {
entries_new.insert(tti, entry.clone());
}
}
}
}
for (&tti, &lsn) in entries_old.iter() {
// Try to remove all entries that are not up-to-date.
for (&tti, entry) in entries_old.iter() {
if !entries_new.contains_key(&tti) {
self.remove_entry_sender
.send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn))
.unwrap();
self.try_remove_entry(tti.tenant_id, tti.timeline_id, entry)
.await;
}
}
BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64);
*self.entries.lock().unwrap() = entries_new;
// Note: BackgroundTask is the only writer for self.c.entries,
// so it couldn't have been modified concurrently.
*self.c.entries.lock().unwrap() = entries_new;
Ok(())
}
async fn on_startup(&self) -> anyhow::Result<()> {
// Create data_dir and tmp directory if they do not exist.
tokio::fs::create_dir_all(&self.data_dir.join("tmp"))
async fn on_startup(&mut self) -> anyhow::Result<()> {
// Create data_dir if it does not exist.
tokio::fs::create_dir_all(&self.c.data_dir)
.await
.map_err(|e| {
anyhow::anyhow!(
"Failed to create basebackup cache data_dir {:?}: {:?}",
self.data_dir,
e
)
})?;
.context("Failed to create basebackup cache data directory")?;
self.clean_tmp_dir()
.await
.context("Failed to clean tmp directory")?;
// Read existing entries from the data_dir and add them to in-memory state.
let mut entries = HashMap::new();
let mut dir = tokio::fs::read_dir(&self.data_dir).await?;
let mut entries = HashMap::<TenantTimelineId, CacheEntry>::new();
let mut dir = tokio::fs::read_dir(&self.c.data_dir).await?;
while let Some(dir_entry) = dir.next_entry().await? {
let filename = dir_entry.file_name();
@@ -263,33 +366,43 @@ impl BasebackupCache {
continue;
}
let size_bytes = dir_entry
.metadata()
.await
.map_err(|e| {
anyhow::anyhow!("Failed to read metadata for file {:?}: {:?}", filename, e)
})?
.len();
self.entry_count += 1;
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
self.total_size_bytes += size_bytes;
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref());
let Some((tenant_id, timeline_id, lsn)) = parsed else {
tracing::warn!("Invalid basebackup cache file name: {:?}", filename);
continue;
};
let cur_entry = CacheEntry { lsn, size_bytes };
let tti = TenantTimelineId::new(tenant_id, timeline_id);
use std::collections::hash_map::Entry::*;
match entries.entry(tti) {
Occupied(mut entry) => {
let entry_lsn = *entry.get();
let found_entry = entry.get();
// Leave only the latest entry, remove the old one.
if lsn < entry_lsn {
self.remove_entry_sender.send(self.entry_path(
tenant_id,
timeline_id,
lsn,
))?;
} else if lsn > entry_lsn {
self.remove_entry_sender.send(self.entry_path(
tenant_id,
timeline_id,
entry_lsn,
))?;
entry.insert(lsn);
if cur_entry.lsn < found_entry.lsn {
self.try_remove_entry(tenant_id, timeline_id, &cur_entry)
.await;
} else if cur_entry.lsn > found_entry.lsn {
self.try_remove_entry(tenant_id, timeline_id, found_entry)
.await;
entry.insert(cur_entry);
} else {
// Two different filenames parsed to the same timline_id and LSN.
// Should never happen.
@@ -300,22 +413,17 @@ impl BasebackupCache {
}
}
Vacant(entry) => {
entry.insert(lsn);
entry.insert(cur_entry);
}
}
}
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
*self.entries.lock().unwrap() = entries;
*self.c.entries.lock().unwrap() = entries;
Ok(())
}
async fn background(
self: Arc<Self>,
mut prepare_receiver: BasebackupPrepareReceiver,
mut remove_entry_receiver: BasebackupRemoveEntryReceiver,
) {
async fn run(mut self, mut prepare_receiver: BasebackupPrepareReceiver) {
// Panic in the background is a safe fallback.
// It will drop receivers and the cache will be effectively disabled.
self.on_startup()
@@ -328,6 +436,7 @@ impl BasebackupCache {
loop {
tokio::select! {
Some(req) = prepare_receiver.recv() => {
BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
if let Err(err) = self.prepare_basebackup(
req.tenant_shard_id,
req.timeline_id,
@@ -338,11 +447,6 @@ impl BasebackupCache {
continue;
}
}
Some(req) = remove_entry_receiver.recv() => {
if let Err(e) = tokio::fs::remove_file(req).await {
tracing::warn!("Failed to remove basebackup cache file: {:#}", e);
}
}
_ = cleanup_ticker.tick() => {
self.cleanup().await.unwrap_or_else(|e| {
tracing::warn!("Failed to clean up basebackup cache: {:#}", e);
@@ -356,6 +460,67 @@ impl BasebackupCache {
}
}
/// Try to remove an entry from disk.
/// The caller is responsible for removing the entry from the in-memory state.
/// Updates size counters and corresponding metrics.
/// Ignores the filesystem errors as not-so-important, but the size counters
/// are not decremented in this case, so the file will continue to be counted
/// towards the size limits.
async fn try_remove_entry(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
entry: &CacheEntry,
) {
let entry_path = self.c.entry_path(tenant_id, timeline_id, entry.lsn);
match tokio::fs::remove_file(&entry_path).await {
Ok(_) => {}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => {
tracing::warn!(
"Failed to remove basebackup cache file for tenant {} timeline {} LSN {}: {:#}",
tenant_id,
timeline_id,
entry.lsn,
e
);
return;
}
}
self.entry_count -= 1;
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
self.total_size_bytes -= entry.size_bytes;
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
}
/// Insert the cache entry into in-memory state and update the size counters.
/// Assumes that the file for the entry already exists on disk.
/// If the entry already exists with previous LSN, it will be removed.
async fn upsert_entry(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
entry: CacheEntry,
) {
let tti = TenantTimelineId::new(tenant_id, timeline_id);
self.entry_count += 1;
BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
self.total_size_bytes += entry.size_bytes;
BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
let old_entry = self.c.entries.lock().unwrap().insert(tti, entry);
if let Some(old_entry) = old_entry {
self.try_remove_entry(tenant_id, timeline_id, &old_entry)
.await;
}
}
/// Prepare a basebackup for the given timeline.
///
/// If the basebackup already exists with a higher LSN or the timeline already
@@ -364,7 +529,7 @@ impl BasebackupCache {
/// The basebackup is prepared in a temporary directory and then moved to the final
/// location to make the operation atomic.
async fn prepare_basebackup(
&self,
&mut self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
req_lsn: Lsn,
@@ -378,30 +543,44 @@ impl BasebackupCache {
let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id);
// TODO(diko): I don't think we will hit the limit,
// but if we do, it makes sense to try to evict oldest entries. here
if self.entry_count >= self.config.max_size_entries {
tracing::info!(
%tenant_shard_id,
%timeline_id,
%req_lsn,
"Basebackup cache is full (max_size_entries), skipping basebackup",
);
self.prepare_skip_count.inc();
return Ok(());
}
if self.total_size_bytes >= self.config.max_total_size_bytes {
tracing::info!(
%tenant_shard_id,
%timeline_id,
%req_lsn,
"Basebackup cache is full (max_total_size_bytes), skipping basebackup",
);
self.prepare_skip_count.inc();
return Ok(());
}
{
let entries = self.entries.lock().unwrap();
if let Some(&entry_lsn) = entries.get(&tti) {
if entry_lsn >= req_lsn {
let entries = self.c.entries.lock().unwrap();
if let Some(entry) = entries.get(&tti) {
if entry.lsn >= req_lsn {
tracing::info!(
%timeline_id,
%req_lsn,
%entry_lsn,
%entry.lsn,
"Basebackup entry already exists for timeline with higher LSN, skipping basebackup",
);
self.prepare_skip_count.inc();
return Ok(());
}
}
if entries.len() as i64 >= self.config.max_size_entries {
tracing::info!(
%timeline_id,
%req_lsn,
"Basebackup cache is full, skipping basebackup",
);
self.prepare_skip_count.inc();
return Ok(());
}
}
let tenant = self
@@ -437,56 +616,54 @@ impl BasebackupCache {
.prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn)
.await;
if let Err(err) = res {
tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
// Try to clean up tmp file. If we fail, the background clean up task will take care of it.
match tokio::fs::remove_file(&entry_tmp_path).await {
Ok(_) => {}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => {
tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
let entry = match res {
Ok(entry) => entry,
Err(err) => {
tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
// Try to clean up tmp file. If we fail, the background clean up task will take care of it.
match tokio::fs::remove_file(&entry_tmp_path).await {
Ok(_) => {}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => {
tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
}
}
return Err(err);
}
return Err(err);
}
};
// Move the tmp file to the final location atomically.
let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
// The tmp file is fsynced, so it's guaranteed that we will not have a partial file
// in the main directory.
// It's not necessary to fsync the inode after renaming, because the worst case is that
// the rename operation will be rolled back on the disk failure, the entry will disappear
// from the main directory, and the entry access will cause a cache miss.
let entry_path = self
.c
.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
tokio::fs::rename(&entry_tmp_path, &entry_path).await?;
let mut entries = self.entries.lock().unwrap();
if let Some(old_lsn) = entries.insert(tti, req_lsn) {
// Remove the old entry if it exists.
self.remove_entry_sender
.send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn))
.unwrap();
}
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
self.upsert_entry(tenant_shard_id.tenant_id, timeline_id, entry)
.await;
self.prepare_ok_count.inc();
Ok(())
}
/// Prepares a basebackup in a temporary file.
/// Guarantees that the tmp file is fsynced before returning.
async fn prepare_basebackup_tmp(
&self,
emptry_tmp_path: &Utf8Path,
entry_tmp_path: &Utf8Path,
timeline: &Arc<Timeline>,
req_lsn: Lsn,
) -> anyhow::Result<()> {
) -> anyhow::Result<CacheEntry> {
let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
let ctx = ctx.with_scope_timeline(timeline);
let file = tokio::fs::File::create(emptry_tmp_path).await?;
let file = tokio::fs::File::create(entry_tmp_path).await?;
let mut writer = BufWriter::new(file);
let mut encoder = GzipEncoder::with_quality(
&mut writer,
// Level::Best because compression is not on the hot path of basebackup requests.
// The decompression is almost not affected by the compression level.
async_compression::Level::Best,
);
// We may receive a request before the WAL record is applied to the timeline.
// Wait for the requested LSN to be applied.
timeline
@@ -499,20 +676,28 @@ impl BasebackupCache {
.await?;
send_basebackup_tarball(
&mut encoder,
&mut writer,
timeline,
Some(req_lsn),
None,
false,
false,
// Level::Best because compression is not on the hot path of basebackup requests.
// The decompression is almost not affected by the compression level.
Some(async_compression::Level::Best),
&ctx,
)
.await?;
encoder.shutdown().await?;
writer.flush().await?;
writer.into_inner().sync_all().await?;
Ok(())
// TODO(diko): we can count it via Writer wrapper instead of a syscall.
let size_bytes = tokio::fs::metadata(entry_tmp_path).await?.len();
Ok(CacheEntry {
lsn: req_lsn,
size_bytes,
})
}
}

View File

@@ -23,6 +23,7 @@ use pageserver::deletion_queue::DeletionQueue;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::feature_resolver::FeatureResolver;
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::page_service::GrpcPageServiceHandler;
use pageserver::task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
};
@@ -581,11 +582,14 @@ fn start_pageserver(
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
// Scan the local 'tenants/' directory and start loading the tenants
let (basebackup_prepare_sender, basebackup_prepare_receiver) =
tokio::sync::mpsc::unbounded_channel();
let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new(
conf.basebackup_cache_dir(),
conf.basebackup_cache_config.clone(),
);
let deletion_queue_client = deletion_queue.new_client();
let background_purges = mgr::BackgroundPurges::default();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
let tenant_manager = mgr::init(
conf,
background_purges.clone(),
TenantSharedResources {
@@ -593,18 +597,16 @@ fn start_pageserver(
remote_storage: remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
basebackup_prepare_sender,
feature_resolver,
basebackup_cache: Arc::clone(&basebackup_cache),
feature_resolver: feature_resolver.clone(),
},
order,
shutdown_pageserver.clone(),
))?;
);
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?;
let basebackup_cache = BasebackupCache::spawn(
basebackup_cache.spawn_background_task(
BACKGROUND_RUNTIME.handle(),
conf.basebackup_cache_dir(),
conf.basebackup_cache_config.clone(),
basebackup_prepare_receiver,
Arc::clone(&tenant_manager),
shutdown_pageserver.child_token(),
@@ -726,6 +728,7 @@ fn start_pageserver(
disk_usage_eviction_state,
deletion_queue.new_client(),
secondary_controller,
feature_resolver,
)
.context("Failed to initialize router state")?,
);
@@ -816,7 +819,6 @@ fn start_pageserver(
} else {
None
},
basebackup_cache,
);
// Spawn a Pageserver gRPC server task. It will spawn separate tasks for
@@ -827,7 +829,7 @@ fn start_pageserver(
// necessary?
let mut page_service_grpc = None;
if let Some(grpc_listener) = grpc_listener {
page_service_grpc = Some(page_service::spawn_grpc(
page_service_grpc = Some(GrpcPageServiceHandler::spawn(
tenant_manager.clone(),
grpc_auth,
otel_guard.as_ref().map(|g| g.dispatch.clone()),

View File

@@ -2,7 +2,9 @@ use std::io::{Read, Write, stdin, stdout};
use std::time::Duration;
use clap::Parser;
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
use pageserver_api::pagestream_api::{
PagestreamFeMessage, PagestreamRequest, PagestreamTestRequest,
};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -28,17 +30,15 @@ async fn main() -> anyhow::Result<()> {
let mut msg = 0;
loop {
msg += 1;
let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
PagestreamTestRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(23),
not_modified_since: Lsn(23),
},
batch_key: 42,
message: format!("message {}", msg),
let fut = sender.send(PagestreamFeMessage::Test(PagestreamTestRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(23),
not_modified_since: Lsn(23),
},
));
batch_key: 42,
message: format!("message {}", msg),
}));
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
eprintln!("pipe seems full");
break;

View File

@@ -11,7 +11,7 @@ use std::num::NonZeroUsize;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, bail, ensure};
use anyhow::{Context, ensure};
use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use pageserver_api::config::{
@@ -22,6 +22,7 @@ use pageserver_api::models::ImageCompressionAlgorithm;
use pageserver_api::shard::TenantShardId;
use pem::Pem;
use postgres_backend::AuthType;
use postgres_ffi::PgMajorVersion;
use remote_storage::{RemotePath, RemoteStorageConfig};
use reqwest::Url;
use storage_broker::Uri;
@@ -338,20 +339,16 @@ impl PageServerConf {
//
// Postgres distribution paths
//
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
let path = self.pg_distrib_dir.clone();
#[allow(clippy::manual_range_patterns)]
match pg_version {
14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
Ok(path.join(pg_version.v_str()))
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
}
@@ -765,4 +762,23 @@ mod tests {
let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
assert_eq!(result.is_ok(), is_valid);
}
#[test]
fn test_config_posthog_config_is_valid() {
let input = r#"
control_plane_api = "http://localhost:6666"
[posthog_config]
server_api_key = "phs_AAA"
client_api_key = "phc_BBB"
project_id = "000"
private_api_url = "https://us.posthog.com"
public_api_url = "https://us.i.posthog.com"
"#;
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
.expect("posthogconfig is valid");
let workdir = Utf8PathBuf::from("/nonexistent");
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
.expect("parse_and_validate");
}
}

View File

@@ -159,14 +159,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
Ok(m) => {
// Since we run one time at startup, be generous in our logging and
// dump all metadata.
tracing::info!(
"Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
m.postgres_host,
m.postgres_port,
m.http_host,
m.http_port,
m.other
);
tracing::info!("Loaded node metadata: {m}");
let az_id = {
let az_id_from_metadata = m
@@ -195,6 +188,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
node_id: conf.id,
listen_pg_addr: m.postgres_host,
listen_pg_port: m.postgres_port,
listen_grpc_addr: m.grpc_host,
listen_grpc_port: m.grpc_port,
listen_http_addr: m.http_host,
listen_http_port: m.http_port,
listen_https_port: m.https_port,

View File

@@ -1,5 +1,7 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use arc_swap::ArcSwap;
use pageserver_api::config::NodeMetadata;
use posthog_client_lite::{
CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
PostHogFlagFilterPropertyValue,
@@ -11,10 +13,13 @@ use utils::id::TenantId;
use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};
const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600);
#[derive(Clone)]
pub struct FeatureResolver {
inner: Option<Arc<FeatureResolverBackgroundLoop>>,
internal_properties: Option<Arc<HashMap<String, PostHogFlagFilterPropertyValue>>>,
force_overrides_for_testing: Arc<ArcSwap<HashMap<String, String>>>,
}
impl FeatureResolver {
@@ -22,9 +27,17 @@ impl FeatureResolver {
Self {
inner: None,
internal_properties: None,
force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
}
}
pub fn update(&self, spec: String) -> anyhow::Result<()> {
if let Some(inner) = &self.inner {
inner.update(spec)?;
}
Ok(())
}
pub fn spawn(
conf: &PageServerConf,
shutdown_pageserver: CancellationToken,
@@ -86,7 +99,35 @@ impl FeatureResolver {
}
}
}
// TODO: add pageserver URL.
// TODO: move this to a background task so that we don't block startup in case of slow disk
let metadata_path = conf.metadata_path();
match std::fs::read_to_string(&metadata_path) {
Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
Ok(metadata) => {
properties.insert(
"hostname".to_string(),
PostHogFlagFilterPropertyValue::String(metadata.http_host),
);
if let Some(cplane_region) = metadata.other.get("region_id") {
if let Some(cplane_region) = cplane_region.as_str() {
// This region contains the cell number
properties.insert(
"neon_region".to_string(),
PostHogFlagFilterPropertyValue::String(
cplane_region.to_string(),
),
);
}
}
}
Err(e) => {
tracing::warn!("Failed to parse metadata.json: {}", e);
}
},
Err(e) => {
tracing::warn!("Failed to read metadata.json: {}", e);
}
}
Arc::new(properties)
};
let fake_tenants = {
@@ -110,18 +151,23 @@ impl FeatureResolver {
}
tenants
};
// TODO: make refresh period configurable
inner
.clone()
.spawn(handle, Duration::from_secs(60), fake_tenants);
inner.clone().spawn(
handle,
posthog_config
.refresh_interval
.unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL),
fake_tenants,
);
Ok(FeatureResolver {
inner: Some(inner),
internal_properties: Some(internal_properties),
force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
})
} else {
Ok(FeatureResolver {
inner: None,
internal_properties: None,
force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
})
}
}
@@ -161,6 +207,11 @@ impl FeatureResolver {
flag_key: &str,
tenant_id: TenantId,
) -> Result<String, PostHogEvaluationError> {
let force_overrides = self.force_overrides_for_testing.load();
if let Some(value) = force_overrides.get(flag_key) {
return Ok(value.clone());
}
if let Some(inner) = &self.inner {
let res = inner.feature_store().evaluate_multivariate(
flag_key,
@@ -199,6 +250,15 @@ impl FeatureResolver {
flag_key: &str,
tenant_id: TenantId,
) -> Result<(), PostHogEvaluationError> {
let force_overrides = self.force_overrides_for_testing.load();
if let Some(value) = force_overrides.get(flag_key) {
return if value == "true" {
Ok(())
} else {
Err(PostHogEvaluationError::NoConditionGroupMatched)
};
}
if let Some(inner) = &self.inner {
let res = inner.feature_store().evaluate_boolean(
flag_key,
@@ -230,8 +290,22 @@ impl FeatureResolver {
inner.feature_store().is_feature_flag_boolean(flag_key)
} else {
Err(PostHogEvaluationError::NotAvailable(
"PostHog integration is not enabled".to_string(),
"PostHog integration is not enabled, cannot auto-determine the flag type"
.to_string(),
))
}
}
/// Force override a feature flag for testing. This is only for testing purposes. Assume the caller only call it
/// from a single thread so it won't race.
pub fn force_override_for_testing(&self, flag_key: &str, value: Option<&str>) {
let mut force_overrides = self.force_overrides_for_testing.load().as_ref().clone();
if let Some(value) = value {
force_overrides.insert(flag_key.to_string(), value.to_string());
} else {
force_overrides.remove(flag_key);
}
self.force_overrides_for_testing
.store(Arc::new(force_overrides));
}
}

View File

@@ -41,6 +41,7 @@ use pageserver_api::models::{
TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
};
use pageserver_api::shard::{ShardCount, TenantShardId};
use postgres_ffi::PgMajorVersion;
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
use scopeguard::defer;
use serde_json::json;
@@ -59,6 +60,7 @@ use crate::config::PageServerConf;
use crate::context;
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
use crate::deletion_queue::DeletionQueueClient;
use crate::feature_resolver::FeatureResolver;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::LocationConf;
@@ -73,6 +75,7 @@ use crate::tenant::remote_timeline_client::{
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
use crate::tenant::timeline::{
CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
@@ -106,6 +109,7 @@ pub struct State {
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
latest_utilization: tokio::sync::Mutex<Option<(std::time::Instant, bytes::Bytes)>>,
feature_resolver: FeatureResolver,
}
impl State {
@@ -119,6 +123,7 @@ impl State {
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
feature_resolver: FeatureResolver,
) -> anyhow::Result<Self> {
let allowlist_routes = &[
"/v1/status",
@@ -139,6 +144,7 @@ impl State {
deletion_queue_client,
secondary_controller,
latest_utilization: Default::default(),
feature_resolver,
})
}
}
@@ -282,11 +288,11 @@ impl From<GetActiveTenantError> for ApiError {
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
ApiError::ShuttingDown
}
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{e}")),
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
GetActiveTenantError::NotFound(gte) => gte.into(),
GetActiveTenantError::WaitForActiveTimeout { .. } => {
ApiError::ResourceUnavailable(format!("{}", e).into())
ApiError::ResourceUnavailable(format!("{e}").into())
}
GetActiveTenantError::SwitchedTenant => {
// in our HTTP handlers, this error doesn't happen
@@ -1010,7 +1016,7 @@ async fn get_lsn_by_timestamp_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let timestamp_raw = must_get_query_param(&request, "timestamp")?;
let timestamp = humantime::parse_rfc3339(&timestamp_raw)
.with_context(|| format!("Invalid time: {:?}", timestamp_raw))
.with_context(|| format!("Invalid time: {timestamp_raw:?}"))
.map_err(ApiError::BadRequest)?;
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
@@ -1105,7 +1111,7 @@ async fn get_timestamp_of_lsn_handler(
json_response(StatusCode::OK, time)
}
None => Err(ApiError::PreconditionFailed(
format!("Timestamp for lsn {} not found", lsn).into(),
format!("Timestamp for lsn {lsn} not found").into(),
)),
}
}
@@ -1451,7 +1457,10 @@ async fn timeline_layer_scan_disposable_keys(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let guard = timeline.layers.read().await;
let guard = timeline
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
@@ -2413,7 +2422,7 @@ async fn timeline_offload_handler(
}
if let (false, reason) = timeline.can_offload() {
return Err(ApiError::PreconditionFailed(
format!("Timeline::can_offload() check failed: {}", reason) .into(),
format!("Timeline::can_offload() check failed: {reason}") .into(),
));
}
offload_timeline(&tenant, &timeline)
@@ -3377,7 +3386,7 @@ async fn put_tenant_timeline_import_basebackup(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
let pg_version: PgMajorVersion = must_parse_query_param(&request, "pg_version")?;
check_permission(&request, Some(tenant_id))?;
@@ -3671,8 +3680,8 @@ async fn tenant_evaluate_feature_flag(
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let flag: String = must_parse_query_param(&request, "flag")?;
let as_type: String = must_parse_query_param(&request, "as")?;
let flag: String = parse_request_param(&request, "flag_key")?;
let as_type: Option<String> = parse_query_param(&request, "as")?;
let state = get_state(&request);
@@ -3681,11 +3690,11 @@ async fn tenant_evaluate_feature_flag(
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
if as_type == "boolean" {
if as_type.as_deref() == Some("boolean") {
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
let result = result.map(|_| true).map_err(|e| e.to_string());
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else if as_type == "multivariate" {
} else if as_type.as_deref() == Some("multivariate") {
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else {
@@ -3705,6 +3714,49 @@ async fn tenant_evaluate_feature_flag(
.await
}
async fn force_override_feature_flag_for_testing_put(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let flag: String = parse_request_param(&request, "flag_key")?;
let value: String = must_parse_query_param(&request, "value")?;
let state = get_state(&request);
state
.feature_resolver
.force_override_for_testing(&flag, Some(&value));
json_response(StatusCode::OK, ())
}
async fn force_override_feature_flag_for_testing_delete(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let flag: String = parse_request_param(&request, "flag_key")?;
let state = get_state(&request);
state
.feature_resolver
.force_override_for_testing(&flag, None);
json_response(StatusCode::OK, ())
}
async fn update_feature_flag_spec(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let body = json_request(&mut request).await?;
let state = get_state(&request);
state
.feature_resolver
.update(body)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
@@ -4081,8 +4133,17 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import",
|r| api_handler(r, activate_post_import_handler),
)
.get("/v1/tenant/:tenant_shard_id/feature_flag", |r| {
.get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
api_handler(r, tenant_evaluate_feature_flag)
})
.put("/v1/feature_flag/:flag_key", |r| {
testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
})
.delete("/v1/feature_flag/:flag_key", |r| {
testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete)
})
.post("/v1/feature_flag_spec", |r| {
api_handler(r, update_feature_flag_spec)
})
.any(handler_404))
}

View File

@@ -520,7 +520,7 @@ async fn import_file(
}
if file_path.starts_with("global") {
let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
let spcnode = postgres_ffi_types::constants::GLOBALTABLESPACE_OID;
let dbnode = 0;
match file_name.as_ref() {
@@ -553,7 +553,7 @@ async fn import_file(
}
}
} else if file_path.starts_with("base") {
let spcnode = pg_constants::DEFAULTTABLESPACE_OID;
let spcnode = postgres_ffi_types::constants::DEFAULTTABLESPACE_OID;
let dbnode: u32 = file_path
.iter()
.nth(1)

View File

@@ -38,6 +38,7 @@ pub mod walredo;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use postgres_ffi::PgMajorVersion;
use tenant::mgr::{BackgroundPurges, TenantManager};
use tenant::secondary;
use tracing::{info, info_span};
@@ -51,7 +52,7 @@ use tracing::{info, info_span};
/// backwards-compatible changes to the metadata format.
pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const DEFAULT_PG_VERSION: u32 = 17;
pub const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;
// Magic constants used to identify different kinds of files
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;

View File

@@ -1053,6 +1053,15 @@ pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("Failed to register pageserver_tenant_states_count metric")
});
pub(crate) static TIMELINE_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_timeline_states_count",
"Count of timelines per state",
&["state"]
)
.expect("Failed to register pageserver_timeline_states_count metric")
});
/// A set of broken tenants.
///
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
@@ -1718,12 +1727,7 @@ impl Drop for SmgrOpTimer {
impl SmgrOpFlushInProgress {
/// The caller must guarantee that `socket_fd`` outlives this function.
pub(crate) async fn measure<Fut, O>(
self,
started_at: Instant,
mut fut: Fut,
socket_fd: RawFd,
) -> O
pub(crate) async fn measure<Fut, O>(self, started_at: Instant, fut: Fut, socket_fd: RawFd) -> O
where
Fut: std::future::Future<Output = O>,
{
@@ -3325,6 +3329,8 @@ impl TimelineMetrics {
&timeline_id,
);
TIMELINE_STATE_METRIC.with_label_values(&["active"]).inc();
TimelineMetrics {
tenant_id,
shard_id,
@@ -3415,7 +3421,7 @@ impl TimelineMetrics {
pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
let labels = self.make_frozen_layer_labels(layer);
let size = layer.try_len().expect("frozen layer should have no writer");
let size = layer.len();
TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&labels)
.unwrap()
@@ -3430,7 +3436,7 @@ impl TimelineMetrics {
pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
let labels = self.make_frozen_layer_labels(layer);
let size = layer.try_len().expect("frozen layer should have no writer");
let size = layer.len();
TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&labels)
.unwrap()
@@ -3479,6 +3485,8 @@ impl TimelineMetrics {
return;
}
TIMELINE_STATE_METRIC.with_label_values(&["active"]).dec();
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
@@ -4415,24 +4423,30 @@ pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_basebackup_cache_entries_total",
"Number of entries in the basebackup cache"
)
.expect("failed to define a metric")
});
// FIXME: Support basebackup cache size metrics.
#[allow(dead_code)]
pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_basebackup_cache_size_bytes",
"Total size of all basebackup cache entries on disk in bytes"
)
.expect("failed to define a metric")
});
pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_basebackup_cache_prepare_queue_size",
"Number of requests in the basebackup prepare channel"
)
.expect("failed to define a metric")
});
static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_config_ignored_items",

View File

@@ -13,8 +13,7 @@ use std::time::{Duration, Instant, SystemTime};
use std::{io, str};
use anyhow::{Context as _, anyhow, bail};
use async_compression::tokio::write::GzipEncoder;
use bytes::{Buf, BytesMut};
use bytes::{Buf as _, BufMut as _, BytesMut};
use futures::future::BoxFuture;
use futures::{FutureExt, Stream};
use itertools::Itertools;
@@ -25,12 +24,13 @@ use pageserver_api::config::{
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::models::{
self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
use pageserver_api::models::{PageTraceEvent, TenantState};
use pageserver_api::pagestream_api::{
self, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
PagestreamProtocolVersion, PagestreamRequest, TenantState,
PagestreamProtocolVersion, PagestreamRequest,
};
use pageserver_api::reltag::SlruKind;
use pageserver_api::shard::TenantShardId;
@@ -40,7 +40,7 @@ use postgres_backend::{
AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error,
};
use postgres_ffi::BLCKSZ;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi_types::constants::DEFAULTTABLESPACE_OID;
use pq_proto::framed::ConnectionError;
use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
use smallvec::{SmallVec, smallvec};
@@ -62,7 +62,6 @@ use utils::{failpoint_support, span_record};
use crate::auth::check_permission;
use crate::basebackup::{self, BasebackupError};
use crate::basebackup_cache::BasebackupCache;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -137,7 +136,6 @@ pub fn spawn(
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
basebackup_cache: Arc<BasebackupCache>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
@@ -159,7 +157,6 @@ pub fn spawn(
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
basebackup_cache,
libpq_ctx,
cancel.clone(),
)
@@ -169,99 +166,6 @@ pub fn spawn(
Listener { cancel, task }
}
/// Spawns a gRPC server for the page service.
///
/// TODO: move this onto GrpcPageServiceHandler::spawn().
/// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we
/// need to reimplement the TCP+TLS accept loop ourselves.
pub fn spawn_grpc(
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
listener: std::net::TcpListener,
) -> anyhow::Result<CancellableTask> {
let cancel = CancellationToken::new();
let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler)
.download_behavior(DownloadBehavior::Download)
.perf_span_dispatch(perf_trace_dispatch)
.detached_child();
let gate = Gate::default();
// Set up the TCP socket. We take a preconfigured TcpListener to bind the
// port early during startup.
let incoming = {
let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std
listener.set_nonblocking(true)?;
tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std(listener)?)
.with_nodelay(Some(GRPC_TCP_NODELAY))
.with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME))
};
// Set up the gRPC server.
//
// TODO: consider tuning window sizes.
let mut server = tonic::transport::Server::builder()
.http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL))
.http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT))
.max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS));
// Main page service stack. Uses a mix of Tonic interceptors and Tower layers:
//
// * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service.
//
// * Layers: allow async code, can run code after the service response. However, only has access
// to the raw HTTP request/response, not the gRPC types.
let page_service_handler = GrpcPageServiceHandler {
tenant_manager,
ctx,
gate_guard: gate.enter().expect("gate was just created"),
get_vectored_concurrent_io,
};
let observability_layer = ObservabilityLayer;
let mut tenant_interceptor = TenantMetadataInterceptor;
let mut auth_interceptor = TenantAuthInterceptor::new(auth);
let page_service = tower::ServiceBuilder::new()
// Create tracing span and record request start time.
.layer(observability_layer)
// Intercept gRPC requests.
.layer(tonic::service::InterceptorLayer::new(move |mut req| {
// Extract tenant metadata.
req = tenant_interceptor.call(req)?;
// Authenticate tenant JWT token.
req = auth_interceptor.call(req)?;
Ok(req)
}))
.service(proto::PageServiceServer::new(page_service_handler));
let server = server.add_service(page_service);
// Reflection service for use with e.g. grpcurl.
let reflection_service = tonic_reflection::server::Builder::configure()
.register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET)
.build_v1()?;
let server = server.add_service(reflection_service);
// Spawn server task.
let task_cancel = cancel.clone();
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"grpc listener",
async move {
let result = server
.serve_with_incoming_shutdown(incoming, task_cancel.cancelled())
.await;
if result.is_ok() {
// TODO: revisit shutdown logic once page service is implemented.
gate.close().await;
}
result
},
));
Ok(CancellableTask { task, cancel })
}
impl Listener {
pub async fn stop_accepting(self) -> Connections {
self.cancel.cancel();
@@ -311,7 +215,6 @@ pub async fn libpq_listener_main(
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
basebackup_cache: Arc<BasebackupCache>,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
) -> Connections {
@@ -355,7 +258,6 @@ pub async fn libpq_listener_main(
auth_type,
tls_config.clone(),
pipelining_config.clone(),
Arc::clone(&basebackup_cache),
connection_ctx,
connections_cancel.child_token(),
gate_guard,
@@ -398,7 +300,6 @@ async fn page_service_conn_main(
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
basebackup_cache: Arc<BasebackupCache>,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
@@ -464,7 +365,6 @@ async fn page_service_conn_main(
pipelining_config,
conf.get_vectored_concurrent_io,
perf_span_fields,
basebackup_cache,
connection_ctx,
cancel.clone(),
gate_guard,
@@ -484,16 +384,14 @@ async fn page_service_conn_main(
} else {
let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id();
Err(io_error).context(format!(
"Postgres connection error for tenant_id={:?} client at peer_addr={}",
tenant_id, peer_addr
"Postgres connection error for tenant_id={tenant_id:?} client at peer_addr={peer_addr}"
))
}
}
other => {
let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id();
other.context(format!(
"Postgres query error for tenant_id={:?} client peer_addr={}",
tenant_id, peer_addr
"Postgres query error for tenant_id={tenant_id:?} client peer_addr={peer_addr}"
))
}
}
@@ -520,8 +418,6 @@ struct PageServerHandler {
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
basebackup_cache: Arc<BasebackupCache>,
gate_guard: GateGuard,
}
@@ -716,60 +612,6 @@ enum PageStreamError {
BadRequest(Cow<'static, str>),
}
impl PageStreamError {
/// Converts a PageStreamError into a proto::GetPageResponse with the appropriate status
/// code, or a gRPC status if it should terminate the stream (e.g. shutdown). This is a
/// convenience method for use from a get_pages gRPC stream.
#[allow(clippy::result_large_err)]
fn into_get_page_response(
self,
request_id: page_api::RequestID,
) -> Result<proto::GetPageResponse, tonic::Status> {
use page_api::GetPageStatusCode;
use tonic::Code;
// We dispatch to Into<tonic::Status> first, and then map it to a GetPageResponse.
let status: tonic::Status = self.into();
let status_code = match status.code() {
// We shouldn't see an OK status here, because we're emitting an error.
Code::Ok => {
debug_assert_ne!(status.code(), Code::Ok);
return Err(tonic::Status::internal(format!(
"unexpected OK status: {status:?}",
)));
}
// These are per-request errors, returned as GetPageResponses.
Code::AlreadyExists => GetPageStatusCode::InvalidRequest,
Code::DataLoss => GetPageStatusCode::InternalError,
Code::FailedPrecondition => GetPageStatusCode::InvalidRequest,
Code::InvalidArgument => GetPageStatusCode::InvalidRequest,
Code::Internal => GetPageStatusCode::InternalError,
Code::NotFound => GetPageStatusCode::NotFound,
Code::OutOfRange => GetPageStatusCode::InvalidRequest,
Code::ResourceExhausted => GetPageStatusCode::SlowDown,
// These should terminate the stream.
Code::Aborted => return Err(status),
Code::Cancelled => return Err(status),
Code::DeadlineExceeded => return Err(status),
Code::PermissionDenied => return Err(status),
Code::Unauthenticated => return Err(status),
Code::Unavailable => return Err(status),
Code::Unimplemented => return Err(status),
Code::Unknown => return Err(status),
};
Ok(page_api::GetPageResponse {
request_id,
status_code,
reason: Some(status.message().to_string()),
page_images: Vec::new(),
}
.into())
}
}
impl From<PageStreamError> for tonic::Status {
fn from(err: PageStreamError) -> Self {
use tonic::Code;
@@ -859,7 +701,7 @@ struct BatchedGetPageRequest {
#[cfg(feature = "testing")]
struct BatchedTestRequest {
req: models::PagestreamTestRequest,
req: pagestream_api::PagestreamTestRequest,
timer: SmgrOpTimer,
}
@@ -873,13 +715,13 @@ enum BatchedFeMessage {
span: Span,
timer: SmgrOpTimer,
shard: WeakHandle<TenantManagerTypes>,
req: models::PagestreamExistsRequest,
req: PagestreamExistsRequest,
},
Nblocks {
span: Span,
timer: SmgrOpTimer,
shard: WeakHandle<TenantManagerTypes>,
req: models::PagestreamNblocksRequest,
req: PagestreamNblocksRequest,
},
GetPage {
span: Span,
@@ -891,13 +733,13 @@ enum BatchedFeMessage {
span: Span,
timer: SmgrOpTimer,
shard: WeakHandle<TenantManagerTypes>,
req: models::PagestreamDbSizeRequest,
req: PagestreamDbSizeRequest,
},
GetSlruSegment {
span: Span,
timer: SmgrOpTimer,
shard: WeakHandle<TenantManagerTypes>,
req: models::PagestreamGetSlruSegmentRequest,
req: PagestreamGetSlruSegmentRequest,
},
#[cfg(feature = "testing")]
Test {
@@ -1061,7 +903,6 @@ impl PageServerHandler {
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
perf_span_fields: ConnectionPerfSpanFields,
basebackup_cache: Arc<BasebackupCache>,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
@@ -1075,7 +916,6 @@ impl PageServerHandler {
cancel,
pipelining_config,
get_vectored_concurrent_io,
basebackup_cache,
gate_guard,
}
}
@@ -2286,8 +2126,7 @@ impl PageServerHandler {
if request_lsn < not_modified_since {
return Err(PageStreamError::BadRequest(
format!(
"invalid request with request LSN {} and not_modified_since {}",
request_lsn, not_modified_since,
"invalid request with request LSN {request_lsn} and not_modified_since {not_modified_since}",
)
.into(),
));
@@ -2590,10 +2429,9 @@ impl PageServerHandler {
.map(|(req, res)| {
res.map(|page| {
(
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
req: req.req,
page,
}),
PagestreamBeMessage::GetPage(
pagestream_api::PagestreamGetPageResponse { req: req.req, page },
),
req.timer,
req.ctx,
)
@@ -2660,7 +2498,7 @@ impl PageServerHandler {
.map(|(req, res)| {
res.map(|()| {
(
PagestreamBeMessage::Test(models::PagestreamTestResponse {
PagestreamBeMessage::Test(pagestream_api::PagestreamTestResponse {
req: req.req.clone(),
}),
req.timer,
@@ -2763,6 +2601,7 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
None,
&ctx,
)
.await?;
@@ -2776,9 +2615,7 @@ impl PageServerHandler {
&& lsn.is_some()
&& prev_lsn.is_none()
{
self.basebackup_cache
.get(tenant_id, timeline_id, lsn.unwrap())
.await
timeline.get_cached_basebackup(lsn.unwrap()).await
} else {
None
}
@@ -2791,31 +2628,6 @@ impl PageServerHandler {
.map_err(|err| {
BasebackupError::Client(err, "handle_basebackup_request,cached,copy")
})?;
} else if gzip {
let mut encoder = GzipEncoder::with_quality(
&mut writer,
// NOTE using fast compression because it's on the critical path
// for compute startup. For an empty database, we get
// <100KB with this method. The Level::Best compression method
// gives us <20KB, but maybe we should add basebackup caching
// on compute shutdown first.
async_compression::Level::Fastest,
);
basebackup::send_basebackup_tarball(
&mut encoder,
&timeline,
lsn,
prev_lsn,
full_backup,
replica,
&ctx,
)
.await?;
// shutdown the encoder to ensure the gzip footer is written
encoder
.shutdown()
.await
.map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
} else {
basebackup::send_basebackup_tarball(
&mut writer,
@@ -2824,6 +2636,11 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
// NB: using fast compression because it's on the critical path for compute
// startup. For an empty database, we get <100KB with this method. The
// Level::Best compression method gives us <20KB, but maybe we should add
// basebackup caching on compute shutdown first.
gzip.then_some(async_compression::Level::Fastest),
&ctx,
)
.await?;
@@ -3366,6 +3183,108 @@ pub struct GrpcPageServiceHandler {
}
impl GrpcPageServiceHandler {
/// Spawns a gRPC server for the page service.
///
/// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we
/// need to reimplement the TCP+TLS accept loop ourselves.
pub fn spawn(
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
listener: std::net::TcpListener,
) -> anyhow::Result<CancellableTask> {
let cancel = CancellationToken::new();
let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler)
.download_behavior(DownloadBehavior::Download)
.perf_span_dispatch(perf_trace_dispatch)
.detached_child();
let gate = Gate::default();
// Set up the TCP socket. We take a preconfigured TcpListener to bind the
// port early during startup.
let incoming = {
let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std
listener.set_nonblocking(true)?;
tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std(
listener,
)?)
.with_nodelay(Some(GRPC_TCP_NODELAY))
.with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME))
};
// Set up the gRPC server.
//
// TODO: consider tuning window sizes.
let mut server = tonic::transport::Server::builder()
.http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL))
.http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT))
.max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS));
// Main page service stack. Uses a mix of Tonic interceptors and Tower layers:
//
// * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service.
//
// * Layers: allow async code, can run code after the service response. However, only has access
// to the raw HTTP request/response, not the gRPC types.
let page_service_handler = GrpcPageServiceHandler {
tenant_manager,
ctx,
gate_guard: gate.enter().expect("gate was just created"),
get_vectored_concurrent_io,
};
let observability_layer = ObservabilityLayer;
let mut tenant_interceptor = TenantMetadataInterceptor;
let mut auth_interceptor = TenantAuthInterceptor::new(auth);
let page_service = tower::ServiceBuilder::new()
// Create tracing span and record request start time.
.layer(observability_layer)
// Intercept gRPC requests.
.layer(tonic::service::InterceptorLayer::new(move |mut req| {
// Extract tenant metadata.
req = tenant_interceptor.call(req)?;
// Authenticate tenant JWT token.
req = auth_interceptor.call(req)?;
Ok(req)
}))
// Run the page service.
.service(
proto::PageServiceServer::new(page_service_handler)
// Support both gzip and zstd compression. The client decides what to use.
.accept_compressed(tonic::codec::CompressionEncoding::Gzip)
.accept_compressed(tonic::codec::CompressionEncoding::Zstd)
.send_compressed(tonic::codec::CompressionEncoding::Gzip)
.send_compressed(tonic::codec::CompressionEncoding::Zstd),
);
let server = server.add_service(page_service);
// Reflection service for use with e.g. grpcurl.
let reflection_service = tonic_reflection::server::Builder::configure()
.register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET)
.build_v1()?;
let server = server.add_service(reflection_service);
// Spawn server task.
let task_cancel = cancel.clone();
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"grpc listener",
async move {
let result = server
.serve_with_incoming_shutdown(incoming, task_cancel.cancelled())
.await;
if result.is_ok() {
// TODO: revisit shutdown logic once page service is implemented.
gate.close().await;
}
result
},
));
Ok(CancellableTask { task, cancel })
}
/// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of
/// relations and their sizes, as well as SLRU segments and similar data.
#[allow(clippy::result_large_err)]
@@ -3436,8 +3355,8 @@ impl GrpcPageServiceHandler {
/// Processes a GetPage batch request, via the GetPages bidirectional streaming RPC.
///
/// NB: errors will terminate the stream. Per-request errors should return a GetPageResponse
/// with an appropriate status code instead.
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
///
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
@@ -3454,7 +3373,7 @@ impl GrpcPageServiceHandler {
let ctx = ctx.with_scope_page_service_pagestream(&timeline);
// Validate the request, decorate the span, and convert it to a Pagestream request.
let req: page_api::GetPageRequest = req.try_into()?;
let req = page_api::GetPageRequest::try_from(req)?;
span_record!(
req_id = %req.request_id,
@@ -3465,7 +3384,7 @@ impl GrpcPageServiceHandler {
);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
let effective_lsn = match PageServerHandler::effective_request_lsn(
let effective_lsn = PageServerHandler::effective_request_lsn(
&timeline,
timeline.get_last_record_lsn(),
req.read_lsn.request_lsn,
@@ -3473,10 +3392,7 @@ impl GrpcPageServiceHandler {
.not_modified_since_lsn
.unwrap_or(req.read_lsn.request_lsn),
&latest_gc_cutoff_lsn,
) {
Ok(lsn) => lsn,
Err(err) => return err.into_get_page_response(req.request_id),
};
)?;
let mut batch = SmallVec::with_capacity(req.block_numbers.len());
for blkno in req.block_numbers {
@@ -3533,7 +3449,7 @@ impl GrpcPageServiceHandler {
"unexpected response: {resp:?}"
)));
}
Err(err) => return err.err.into_get_page_response(req.request_id),
Err(err) => return Err(err.err.into()),
};
}
@@ -3587,57 +3503,66 @@ impl proto::PageService for GrpcPageServiceHandler {
Ok(tonic::Response::new(resp.into()))
}
// TODO: ensure clients use gzip compression for the stream.
#[instrument(skip_all, fields(lsn))]
async fn get_base_backup(
&self,
req: tonic::Request<proto::GetBaseBackupRequest>,
) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
// Send 64 KB chunks to avoid large memory allocations.
const CHUNK_SIZE: usize = 64 * 1024;
// Send chunks of 256 KB to avoid large memory allocations. pagebench basebackup shows this
// to be the sweet spot where throughput is saturated.
const CHUNK_SIZE: usize = 256 * 1024;
let timeline = self.get_request_timeline(&req).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
// Validate the request, decorate the span, and wait for the LSN to arrive.
//
// TODO: this requires a read LSN, is that ok?
// Validate the request and decorate the span.
Self::ensure_shard_zero(&timeline)?;
if timeline.is_archived() == Some(true) {
return Err(tonic::Status::failed_precondition("timeline is archived"));
}
let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;
span_record!(lsn=%req.read_lsn);
span_record!(lsn=?req.lsn);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
timeline
.wait_lsn(
req.read_lsn.request_lsn,
WaitLsnWaiter::PageService,
WaitLsnTimeout::Default,
&ctx,
)
.await?;
timeline
.check_lsn_is_in_scope(req.read_lsn.request_lsn, &latest_gc_cutoff_lsn)
.map_err(|err| {
tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
})?;
// Wait for the LSN to arrive, if given.
if let Some(lsn) = req.lsn {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
timeline
.wait_lsn(
lsn,
WaitLsnWaiter::PageService,
WaitLsnTimeout::Default,
&ctx,
)
.await?;
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
.map_err(|err| {
tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
})?;
}
// Spawn a task to run the basebackup.
//
// TODO: do we need to support full base backups, for debugging?
let span = Span::current();
let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
let jh = tokio::spawn(async move {
let gzip_level = match req.compression {
page_api::BaseBackupCompression::None => None,
// NB: using fast compression because it's on the critical path for compute
// startup. For an empty database, we get <100KB with this method. The
// Level::Best compression method gives us <20KB, but maybe we should add
// basebackup caching on compute shutdown first.
page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest),
};
let result = basebackup::send_basebackup_tarball(
&mut simplex_write,
&timeline,
Some(req.read_lsn.request_lsn),
req.lsn,
None,
false,
req.full,
req.replica,
gzip_level,
&ctx,
)
.instrument(span) // propagate request span
@@ -3650,20 +3575,21 @@ impl proto::PageService for GrpcPageServiceHandler {
// Emit chunks of size CHUNK_SIZE.
let chunks = async_stream::try_stream! {
let mut chunk = BytesMut::with_capacity(CHUNK_SIZE);
loop {
let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
})?;
// If we read 0 bytes, either the chunk is full or the stream is closed.
if n == 0 {
if chunk.is_empty() {
break;
let mut chunk = BytesMut::with_capacity(CHUNK_SIZE).limit(CHUNK_SIZE);
loop {
let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
})?;
if n == 0 {
break; // full chunk or closed stream
}
yield proto::GetBaseBackupResponseChunk::from(chunk.clone().freeze());
chunk.clear();
}
let chunk = chunk.into_inner().freeze();
if chunk.is_empty() {
break;
}
yield proto::GetBaseBackupResponseChunk::from(chunk);
}
// Wait for the basebackup task to exit and check for errors.
jh.await.map_err(|err| {
@@ -3740,9 +3666,16 @@ impl proto::PageService for GrpcPageServiceHandler {
.await?
.downgrade();
while let Some(req) = reqs.message().await? {
yield Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
let req_id = req.request_id;
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
.instrument(span.clone()) // propagate request span
.await?
.await;
yield match result {
Ok(resp) => resp,
// Convert per-request errors to GetPageResponses as appropriate, or terminate
// the stream with a tonic::Status.
Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
}
}
};

View File

@@ -23,12 +23,11 @@ use pageserver_api::key::{
};
use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
use pageserver_api::models::RelSizeMigration;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use pageserver_api::value::Value;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId};
use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId};
use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi_types::{Oid, RepOriginId};
use serde::{Deserialize, Serialize};
use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
@@ -36,6 +35,8 @@ use tracing::{debug, info, info_span, trace, warn};
use utils::bin_ser::{BeSer, DeserializeError};
use utils::lsn::Lsn;
use utils::pausable_failpoint;
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::value::Value;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
use super::tenant::{PageReconstructError, Timeline};
@@ -720,6 +721,7 @@ impl Timeline {
let batches = keyspace.partition(
self.get_shard_identity(),
self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
BLCKSZ as u64,
);
let io_concurrency = IoConcurrency::spawn_from_conf(
@@ -960,6 +962,7 @@ impl Timeline {
let batches = keyspace.partition(
self.get_shard_identity(),
self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
BLCKSZ as u64,
);
let io_concurrency = IoConcurrency::spawn_from_conf(
@@ -1078,7 +1081,7 @@ impl Timeline {
// fetch directory entry
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
if self.pg_version >= 17 {
if self.pg_version >= PgMajorVersion::PG17 {
Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
} else {
Ok(TwoPhaseDirectory::des(&buf)?
@@ -1182,7 +1185,7 @@ impl Timeline {
}
let origin_id = k.field6 as RepOriginId;
let origin_lsn = Lsn::des(&v)
.with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?;
.with_context(|| format!("decode replorigin value for {origin_id}: {v:?}"))?;
if origin_lsn != Lsn::INVALID {
result.insert(origin_id, origin_lsn);
}
@@ -1610,7 +1613,7 @@ impl DatadirModification<'_> {
.push((DirectoryKind::Db, MetricsUpdate::Set(0)));
self.put(DBDIR_KEY, Value::Image(buf.into()));
let buf = if self.tline.pg_version >= 17 {
let buf = if self.tline.pg_version >= PgMajorVersion::PG17 {
TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
xids: HashSet::new(),
})
@@ -1964,7 +1967,7 @@ impl DatadirModification<'_> {
) -> Result<(), WalIngestError> {
// Add it to the directory entry
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 {
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
if !dir.xids.insert(xid) {
Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
@@ -2380,7 +2383,7 @@ impl DatadirModification<'_> {
) -> Result<(), WalIngestError> {
// Remove it from the directory entry
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 {
let mut dir = TwoPhaseDirectoryV17::des(&buf)?;
if !dir.xids.remove(&xid) {
@@ -2437,8 +2440,7 @@ impl DatadirModification<'_> {
if path == p {
assert!(
modifying_file.is_none(),
"duplicated entries found for {}",
path
"duplicated entries found for {path}"
);
modifying_file = Some(content);
} else {

View File

@@ -38,6 +38,7 @@ use pageserver_api::models::{
WalRedoManagerStatus,
};
use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
use postgres_ffi::PgMajorVersion;
use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel};
use remote_timeline_client::index::GcCompactionState;
use remote_timeline_client::manifest::{
@@ -51,6 +52,7 @@ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
use storage_broker::BrokerClientChannel;
use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
use timeline::import_pgdata::ImportingTimeline;
use timeline::layer_manager::LayerManagerLockHolder;
use timeline::offload::{OffloadError, offload_timeline};
use timeline::{
CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata,
@@ -78,7 +80,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
use self::timeline::{
EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
};
use crate::basebackup_cache::BasebackupPrepareSender;
use crate::basebackup_cache::BasebackupCache;
use crate::config::PageServerConf;
use crate::context;
use crate::context::RequestContextBuilder;
@@ -89,7 +91,8 @@ use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::{
BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, TIMELINE_STATE_METRIC,
remove_tenant_metrics,
};
use crate::task_mgr::TaskKind;
use crate::tenant::config::LocationMode;
@@ -159,7 +162,7 @@ pub struct TenantSharedResources {
pub remote_storage: GenericRemoteStorage,
pub deletion_queue_client: DeletionQueueClient,
pub l0_flush_global_state: L0FlushGlobalState,
pub basebackup_prepare_sender: BasebackupPrepareSender,
pub basebackup_cache: Arc<BasebackupCache>,
pub feature_resolver: FeatureResolver,
}
@@ -328,7 +331,7 @@ pub struct TenantShard {
deletion_queue_client: DeletionQueueClient,
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
basebackup_prepare_sender: BasebackupPrepareSender,
basebackup_cache: Arc<BasebackupCache>,
/// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -494,8 +497,8 @@ impl WalRedoManager {
key: pageserver_api::key::Key,
lsn: Lsn,
base_img: Option<(Lsn, bytes::Bytes)>,
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
pg_version: u32,
records: Vec<(Lsn, wal_decoder::models::record::NeonWalRecord)>,
pg_version: PgMajorVersion,
redo_attempt_type: RedoAttemptType,
) -> Result<bytes::Bytes, walredo::Error> {
match self {
@@ -544,6 +547,28 @@ pub struct OffloadedTimeline {
/// Part of the `OffloadedTimeline` object's lifecycle: this needs to be set before we drop it
pub deleted_from_ancestor: AtomicBool,
_metrics_guard: OffloadedTimelineMetricsGuard,
}
/// Increases the offloaded timeline count metric when created, and decreases when dropped.
struct OffloadedTimelineMetricsGuard;
impl OffloadedTimelineMetricsGuard {
fn new() -> Self {
TIMELINE_STATE_METRIC
.with_label_values(&["offloaded"])
.inc();
Self
}
}
impl Drop for OffloadedTimelineMetricsGuard {
fn drop(&mut self) {
TIMELINE_STATE_METRIC
.with_label_values(&["offloaded"])
.dec();
}
}
impl OffloadedTimeline {
@@ -576,6 +601,8 @@ impl OffloadedTimeline {
delete_progress: timeline.delete_progress.clone(),
deleted_from_ancestor: AtomicBool::new(false),
_metrics_guard: OffloadedTimelineMetricsGuard::new(),
})
}
fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self {
@@ -595,6 +622,7 @@ impl OffloadedTimeline {
archived_at,
delete_progress: TimelineDeleteProgress::default(),
deleted_from_ancestor: AtomicBool::new(false),
_metrics_guard: OffloadedTimelineMetricsGuard::new(),
}
}
fn manifest(&self) -> OffloadedTimelineManifest {
@@ -906,7 +934,7 @@ pub(crate) enum CreateTimelineParams {
pub(crate) struct CreateTimelineParamsBootstrap {
pub(crate) new_timeline_id: TimelineId,
pub(crate) existing_initdb_timeline_id: Option<TimelineId>,
pub(crate) pg_version: u32,
pub(crate) pg_version: PgMajorVersion,
}
/// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here.
@@ -944,7 +972,7 @@ pub(crate) enum CreateTimelineIdempotency {
/// NB: special treatment, see comment in [`Self`].
FailWithConflict,
Bootstrap {
pg_version: u32,
pg_version: PgMajorVersion,
},
/// NB: branches always have the same `pg_version` as their ancestor.
/// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
@@ -1289,7 +1317,7 @@ impl TenantShard {
ancestor.is_some()
|| timeline
.layers
.read()
.read(LayerManagerLockHolder::LoadLayerMap)
.await
.layer_map()
.expect(
@@ -1335,7 +1363,7 @@ impl TenantShard {
remote_storage,
deletion_queue_client,
l0_flush_global_state,
basebackup_prepare_sender,
basebackup_cache,
feature_resolver,
} = resources;
@@ -1352,7 +1380,7 @@ impl TenantShard {
remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
basebackup_prepare_sender,
basebackup_cache,
feature_resolver,
));
@@ -1832,6 +1860,29 @@ impl TenantShard {
}
}
// At this point we've initialized all timelines and are tracking them.
// Now compute the layer visibility for all (not offloaded) timelines.
let compute_visiblity_for = {
let timelines_accessor = self.timelines.lock().unwrap();
let mut timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap();
timelines_offloaded_accessor.extend(offloaded_timelines_list.into_iter());
// Before activation, populate each Timeline's GcInfo with information about its children
self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None);
timelines_accessor.values().cloned().collect::<Vec<_>>()
};
for tl in compute_visiblity_for {
tl.update_layer_visibility().await.with_context(|| {
format!(
"failed initial timeline visibility computation {} for tenant {}",
tl.timeline_id, self.tenant_shard_id
)
})?;
}
// Walk through deleted timelines, resume deletion
for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
remote_timeline_client
@@ -1851,10 +1902,6 @@ impl TenantShard {
.context("resume_deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
}
{
let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
}
// Stash the preloaded tenant manifest, and upload a new manifest if changed.
//
@@ -2495,7 +2542,7 @@ impl TenantShard {
self: &Arc<Self>,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
pg_version: PgMajorVersion,
ctx: &RequestContext,
) -> anyhow::Result<(UninitializedTimeline, RequestContext)> {
anyhow::ensure!(
@@ -2547,7 +2594,7 @@ impl TenantShard {
self: &Arc<Self>,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
pg_version: PgMajorVersion,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let (uninit_tl, ctx) = self
@@ -2586,7 +2633,7 @@ impl TenantShard {
self: &Arc<Self>,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
pg_version: PgMajorVersion,
ctx: &RequestContext,
in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
@@ -2617,7 +2664,7 @@ impl TenantShard {
}
let layer_names = tline
.layers
.read()
.read(LayerManagerLockHolder::Testing)
.await
.layer_map()
.unwrap()
@@ -2852,7 +2899,7 @@ impl TenantShard {
Lsn(0),
initdb_lsn,
initdb_lsn,
15,
PgMajorVersion::PG15,
);
this.prepare_new_timeline(
new_timeline_id,
@@ -3132,7 +3179,12 @@ impl TenantShard {
for timeline in &compact {
// Collect L0 counts. Can't await while holding lock above.
if let Ok(lm) = timeline.layers.read().await.layer_map() {
if let Ok(lm) = timeline
.layers
.read(LayerManagerLockHolder::Compaction)
.await
.layer_map()
{
l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
}
}
@@ -3398,7 +3450,7 @@ impl TenantShard {
use pageserver_api::models::ActivatingFrom;
match &*current_state {
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {current_state:?}");
}
TenantState::Attaching => {
*current_state = TenantState::Activating(ActivatingFrom::Attaching);
@@ -3417,9 +3469,6 @@ impl TenantShard {
.values()
.filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));
// Before activation, populate each Timeline's GcInfo with information about its children
self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None);
// Spawn gc and compaction loops. The loops will shut themselves
// down when they notice that the tenant is inactive.
tasks::start_background_loops(self, background_jobs_can_start);
@@ -4331,7 +4380,7 @@ impl TenantShard {
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
l0_flush_global_state: L0FlushGlobalState,
basebackup_prepare_sender: BasebackupPrepareSender,
basebackup_cache: Arc<BasebackupCache>,
feature_resolver: FeatureResolver,
) -> TenantShard {
assert!(!attached_conf.location.generation.is_none());
@@ -4436,7 +4485,7 @@ impl TenantShard {
ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
l0_flush_global_state,
basebackup_prepare_sender,
basebackup_cache,
feature_resolver,
}
}
@@ -4874,7 +4923,7 @@ impl TenantShard {
}
let layer_names = tline
.layers
.read()
.read(LayerManagerLockHolder::Testing)
.await
.layer_map()
.unwrap()
@@ -5042,7 +5091,7 @@ impl TenantShard {
pub(crate) async fn bootstrap_timeline_test(
self: &Arc<Self>,
timeline_id: TimelineId,
pg_version: u32,
pg_version: PgMajorVersion,
load_existing_initdb: Option<TimelineId>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
@@ -5184,7 +5233,7 @@ impl TenantShard {
async fn bootstrap_timeline(
self: &Arc<Self>,
timeline_id: TimelineId,
pg_version: u32,
pg_version: PgMajorVersion,
load_existing_initdb: Option<TimelineId>,
ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> {
@@ -5365,7 +5414,7 @@ impl TenantShard {
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
basebackup_cache: self.basebackup_cache.clone(),
feature_resolver: self.feature_resolver.clone(),
}
}
@@ -5722,7 +5771,7 @@ impl TenantShard {
async fn run_initdb(
conf: &'static PageServerConf,
initdb_target_dir: &Utf8Path,
pg_version: u32,
pg_version: PgMajorVersion,
cancel: &CancellationToken,
) -> Result<(), InitdbError> {
let initdb_bin_path = conf
@@ -5804,10 +5853,10 @@ pub(crate) mod harness {
use once_cell::sync::OnceCell;
use pageserver_api::key::Key;
use pageserver_api::models::ShardParameters;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::shard::ShardIndex;
use utils::id::TenantId;
use utils::logging;
use wal_decoder::models::record::NeonWalRecord;
use super::*;
use crate::deletion_queue::mock::MockDeletionQueue;
@@ -5951,7 +6000,7 @@ pub(crate) mod harness {
) -> anyhow::Result<Arc<TenantShard>> {
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
let tenant = Arc::new(TenantShard::new(
TenantState::Attaching,
@@ -5969,7 +6018,7 @@ pub(crate) mod harness {
self.deletion_queue.new_client(),
// TODO: ideally we should run all unit tests with both configs
L0FlushGlobalState::new(L0FlushConfig::default()),
basebackup_requst_sender,
basebackup_cache,
FeatureResolver::new_disabled(),
));
@@ -6003,7 +6052,7 @@ pub(crate) mod harness {
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
_pg_version: u32,
_pg_version: PgMajorVersion,
_redo_attempt_type: RedoAttemptType,
) -> Result<Bytes, walredo::Error> {
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
@@ -6062,9 +6111,6 @@ mod tests {
#[cfg(feature = "testing")]
use pageserver_api::keyspace::KeySpaceRandomAccum;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
#[cfg(feature = "testing")]
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
use pageserver_compaction::helpers::overlaps_with;
#[cfg(feature = "testing")]
use rand::SeedableRng;
@@ -6085,6 +6131,9 @@ mod tests {
use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
use utils::id::TenantId;
use utils::shard::{ShardCount, ShardNumber};
#[cfg(feature = "testing")]
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::value::Value;
use super::*;
use crate::DEFAULT_PG_VERSION;
@@ -6175,7 +6224,7 @@ mod tests {
async fn randomize_timeline(
tenant: &Arc<TenantShard>,
new_timeline_id: TimelineId,
pg_version: u32,
pg_version: PgMajorVersion,
spec: TestTimelineSpecification,
random: &mut rand::rngs::StdRng,
ctx: &RequestContext,
@@ -6568,7 +6617,7 @@ mod tests {
.put(
*TEST_KEY,
lsn,
&Value::Image(test_img(&format!("foo at {}", lsn))),
&Value::Image(test_img(&format!("foo at {lsn}"))),
ctx,
)
.await?;
@@ -6578,7 +6627,7 @@ mod tests {
.put(
*TEST_KEY,
lsn,
&Value::Image(test_img(&format!("foo at {}", lsn))),
&Value::Image(test_img(&format!("foo at {lsn}"))),
ctx,
)
.await?;
@@ -6592,7 +6641,7 @@ mod tests {
.put(
*TEST_KEY,
lsn,
&Value::Image(test_img(&format!("foo at {}", lsn))),
&Value::Image(test_img(&format!("foo at {lsn}"))),
ctx,
)
.await?;
@@ -6602,7 +6651,7 @@ mod tests {
.put(
*TEST_KEY,
lsn,
&Value::Image(test_img(&format!("foo at {}", lsn))),
&Value::Image(test_img(&format!("foo at {lsn}"))),
ctx,
)
.await?;
@@ -6944,7 +6993,7 @@ mod tests {
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
let layer_map = tline.layers.read().await;
let layer_map = tline.layers.read(LayerManagerLockHolder::Testing).await;
let level0_deltas = layer_map
.layer_map()?
.level0_deltas()
@@ -7101,7 +7150,7 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
ctx,
)
.await?;
@@ -7180,7 +7229,7 @@ mod tests {
let lsn = Lsn(0x10);
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
let guard = tline.layers.read().await;
let guard = tline.layers.read(LayerManagerLockHolder::Testing).await;
let lm = guard.layer_map()?;
lm.dump(true, &ctx).await?;
@@ -7389,7 +7438,7 @@ mod tests {
.put(
gap_at_key,
current_lsn,
&Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))),
&Value::Image(test_img(&format!("{gap_at_key} at {current_lsn}"))),
&ctx,
)
.await?;
@@ -7428,7 +7477,7 @@ mod tests {
.put(
current_key,
current_lsn,
&Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))),
&Value::Image(test_img(&format!("{current_key} at {current_lsn}"))),
&ctx,
)
.await?;
@@ -7536,7 +7585,7 @@ mod tests {
while key < end_key {
current_lsn += 0x10;
let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
let image_value = format!("{child_gap_at_key} at {current_lsn}");
let mut writer = parent_timeline.writer().await;
writer
@@ -7579,7 +7628,7 @@ mod tests {
.put(
key,
current_lsn,
&Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
&Value::Image(test_img(&format!("{key} at {current_lsn}"))),
&ctx,
)
.await?;
@@ -7700,7 +7749,7 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
&ctx,
)
.await?;
@@ -7721,7 +7770,7 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
&ctx,
)
.await?;
@@ -7735,7 +7784,7 @@ mod tests {
test_key.field6 = blknum as u32;
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", blknum, last_lsn))
test_img(&format!("{blknum} at {last_lsn}"))
);
}
@@ -7781,7 +7830,7 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
&ctx,
)
.await?;
@@ -7810,11 +7859,11 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
&ctx,
)
.await?;
println!("updating {} at {}", blknum, lsn);
println!("updating {blknum} at {lsn}");
writer.finish_write(lsn);
drop(writer);
updated[blknum] = lsn;
@@ -7825,7 +7874,7 @@ mod tests {
test_key.field6 = blknum as u32;
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", blknum, last_lsn))
test_img(&format!("{blknum} at {last_lsn}"))
);
}
@@ -7878,11 +7927,11 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))),
&Value::Image(test_img(&format!("{idx} {blknum} at {lsn}"))),
&ctx,
)
.await?;
println!("updating [{}][{}] at {}", idx, blknum, lsn);
println!("updating [{idx}][{blknum}] at {lsn}");
writer.finish_write(lsn);
drop(writer);
updated[idx][blknum] = lsn;
@@ -8088,7 +8137,7 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
&ctx,
)
.await?;
@@ -8105,7 +8154,7 @@ mod tests {
test_key.field6 = (blknum * STEP) as u32;
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", blknum, last_lsn))
test_img(&format!("{blknum} at {last_lsn}"))
);
}
@@ -8142,7 +8191,7 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
&ctx,
)
.await?;
@@ -8208,12 +8257,23 @@ mod tests {
tline.freeze_and_flush().await?; // force create a delta layer
}
let before_num_l0_delta_files =
tline.layers.read().await.layer_map()?.level0_deltas().len();
let before_num_l0_delta_files = tline
.layers
.read(LayerManagerLockHolder::Testing)
.await
.layer_map()?
.level0_deltas()
.len();
tline.compact(&cancel, EnumSet::default(), &ctx).await?;
let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
let after_num_l0_delta_files = tline
.layers
.read(LayerManagerLockHolder::Testing)
.await
.layer_map()?
.level0_deltas()
.len();
assert!(
after_num_l0_delta_files < before_num_l0_delta_files,
@@ -8384,7 +8444,7 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
&ctx,
)
.await?;
@@ -8404,7 +8464,7 @@ mod tests {
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&Value::Image(test_img(&format!("{blknum} at {lsn}"))),
&ctx,
)
.await?;
@@ -9325,12 +9385,7 @@ mod tests {
let end_lsn = Lsn(0x100);
let image_layers = (0x20..=0x90)
.step_by(0x10)
.map(|n| {
(
Lsn(n),
vec![(key, test_img(&format!("data key at {:x}", n)))],
)
})
.map(|n| (Lsn(n), vec![(key, test_img(&format!("data key at {n:x}")))]))
.collect();
let timeline = tenant

View File

@@ -63,8 +63,7 @@ pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
&& overlaps_with(&layer.key_range, &other_layer.key_range)
{
let err = format!(
"layer violates the layer map LSN split assumption: layer {} intersects with layer {}",
layer, other_layer
"layer violates the layer map LSN split assumption: layer {layer} intersects with layer {other_layer}"
);
return Some(err);
}

View File

@@ -61,8 +61,10 @@ pub(crate) struct LocationConf {
/// The detailed shard identity. This structure is already scoped within
/// a TenantShardId, but we need the full ShardIdentity to enable calculating
/// key->shard mappings.
#[serde(default = "ShardIdentity::unsharded")]
#[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
///
/// NB: we store this even for unsharded tenants, so that we agree with storcon on the intended
/// stripe size. Otherwise, a split request that does not specify a stripe size may use a
/// different default than storcon, which can lead to incorrect stripe sizes and corruption.
pub(crate) shard: ShardIdentity,
/// The pan-cluster tenant configuration, the same on all locations
@@ -149,7 +151,12 @@ impl LocationConf {
/// For use when attaching/re-attaching: update the generation stored in this
/// structure. If we were in a secondary state, promote to attached (posession
/// of a fresh generation implies this).
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
pub(crate) fn attach_in_generation(
&mut self,
mode: AttachmentMode,
generation: Generation,
stripe_size: ShardStripeSize,
) {
match &mut self.mode {
LocationMode::Attached(attach_conf) => {
attach_conf.generation = generation;
@@ -163,6 +170,8 @@ impl LocationConf {
})
}
}
self.shard.stripe_size = stripe_size;
}
pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {

View File

@@ -3,7 +3,7 @@
use std::io;
use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::{AtomicU64, Ordering};
use camino::Utf8PathBuf;
use num_traits::Num;
@@ -18,6 +18,7 @@ use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::tenant::storage_layer::inmemory_layer::GlobalResourceUnits;
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
@@ -30,9 +31,13 @@ pub struct EphemeralFile {
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
page_cache_file_id: page_cache::FileId,
bytes_written: u64,
file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter,
buffered_writer: BufferedWriter,
buffered_writer: tokio::sync::RwLock<BufferedWriter>,
bytes_written: AtomicU64,
resource_units: std::sync::Mutex<GlobalResourceUnits>,
}
type BufferedWriter = owned_buffers_io::write::BufferedWriter<
@@ -94,9 +99,8 @@ impl EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
page_cache_file_id,
bytes_written: 0,
file: file.clone(),
buffered_writer: BufferedWriter::new(
buffered_writer: tokio::sync::RwLock::new(BufferedWriter::new(
file,
0,
|| IoBufferMut::with_capacity(TAIL_SZ),
@@ -104,7 +108,9 @@ impl EphemeralFile {
cancel.child_token(),
ctx,
info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename),
),
)),
bytes_written: AtomicU64::new(0),
resource_units: std::sync::Mutex::new(GlobalResourceUnits::new()),
})
}
}
@@ -151,15 +157,17 @@ impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter
#[derive(Debug, thiserror::Error)]
pub(crate) enum EphemeralFileWriteError {
#[error("{0}")]
TooLong(String),
#[error("cancelled")]
Cancelled,
}
impl EphemeralFile {
pub(crate) fn len(&self) -> u64 {
self.bytes_written
// TODO(vlad): The value returned here is not always correct if
// we have more than one concurrent writer. Writes are always
// sequenced, but we could grab the buffered writer lock if we wanted
// to.
self.bytes_written.load(Ordering::Acquire)
}
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
@@ -186,7 +194,7 @@ impl EphemeralFile {
/// Panics if the write is short because there's no way we can recover from that.
/// TODO: make upstack handle this as an error.
pub(crate) async fn write_raw(
&mut self,
&self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<u64, EphemeralFileWriteError> {
@@ -198,22 +206,13 @@ impl EphemeralFile {
}
async fn write_raw_controlled(
&mut self,
&self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<(u64, Option<owned_buffers_io::write::FlushControl>), EphemeralFileWriteError> {
let pos = self.bytes_written;
let mut writer = self.buffered_writer.write().await;
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
EphemeralFileWriteError::TooLong(format!(
"write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
srcbuf_len = srcbuf.len(),
))
})?;
// Write the payload
let (nwritten, control) = self
.buffered_writer
let (nwritten, control) = writer
.write_buffered_borrowed_controlled(srcbuf, ctx)
.await
.map_err(|e| match e {
@@ -225,43 +224,69 @@ impl EphemeralFile {
"buffered writer has no short writes"
);
self.bytes_written = new_bytes_written;
// There's no realistic risk of overflow here. We won't have exabytes sized files on disk.
let pos = self
.bytes_written
.fetch_add(srcbuf.len().into_u64(), Ordering::AcqRel);
let mut resource_units = self.resource_units.lock().unwrap();
resource_units.maybe_publish_size(self.bytes_written.load(Ordering::Relaxed));
Ok((pos, control))
}
pub(crate) fn tick(&self) -> Option<u64> {
let mut resource_units = self.resource_units.lock().unwrap();
let len = self.bytes_written.load(Ordering::Relaxed);
resource_units.publish_size(len)
}
}
impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
async fn read_exact_at_eof_ok<B: IoBufAlignedMut + Send>(
&self,
start: u64,
dst: tokio_epoll_uring::Slice<B>,
mut dst: tokio_epoll_uring::Slice<B>,
ctx: &RequestContext,
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
let submitted_offset = self.buffered_writer.bytes_submitted();
// We will fill the slice in back to front. Hence, we need
// the slice to be fully initialized.
// TODO(vlad): Is there a nicer way of doing this?
dst.as_mut_rust_slice_full_zeroed();
let mutable = match self.buffered_writer.inspect_mutable() {
Some(mutable) => &mutable[0..mutable.pending()],
None => {
// Timeline::cancel and hence buffered writer flush was cancelled.
// Remain read-available while timeline is shutting down.
&[]
}
};
let writer = self.buffered_writer.read().await;
let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();
// Read bytes written while under lock. This is a hack to deal with concurrent
// writes updating the number of bytes written. `bytes_written` is not DIO alligned
// but we may end the read there.
//
// TODO(vlad): Feels like there's a nicer path where we align the end if it
// shoots over the end of the file.
let bytes_written = self.bytes_written.load(Ordering::Acquire);
let dst_cap = dst.bytes_total().into_u64();
let end = {
// saturating_add is correct here because the max file size is u64::MAX, so,
// if start + dst.len() > u64::MAX, then we know it will be a short read
let mut end: u64 = start.saturating_add(dst_cap);
if end > self.bytes_written {
end = self.bytes_written;
if end > bytes_written {
end = bytes_written;
}
end
};
let submitted_offset = writer.bytes_submitted();
let maybe_flushed = writer.inspect_maybe_flushed();
let mutable = match writer.inspect_mutable() {
Some(mutable) => &mutable[0..mutable.pending()],
None => {
// Timeline::cancel and hence buffered writer flush was cancelled.
// Remain read-available while timeline is shutting down.
&[]
}
};
// inclusive, exclusive
#[derive(Debug)]
struct Range<N>(N, N);
@@ -306,13 +331,33 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
let mutable_range = Range(std::cmp::max(start, submitted_offset), end);
let dst = if written_range.len() > 0 {
// There are three sources from which we might have to read data:
// 1. The file itself
// 2. The buffer which contains changes currently being flushed
// 3. The buffer which contains chnages yet to be flushed
//
// For better concurrency, we do them in reverse order: perform the in-memory
// reads while holding the writer lock, drop the writer lock and read from the
// file if required.
let dst = if mutable_range.len() > 0 {
let offset_in_buffer = mutable_range
.0
.checked_sub(submitted_offset)
.unwrap()
.into_usize();
let to_copy =
&mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
let bounds = dst.bounds();
let slice = self
.file
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
.await?;
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
let mut view = dst.slice({
let start =
written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
start..end
});
view.as_mut_rust_slice_full_zeroed()
.copy_from_slice(to_copy);
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
} else {
dst
};
@@ -342,24 +387,15 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
dst
};
let dst = if mutable_range.len() > 0 {
let offset_in_buffer = mutable_range
.0
.checked_sub(submitted_offset)
.unwrap()
.into_usize();
let to_copy =
&mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
drop(writer);
let dst = if written_range.len() > 0 {
let bounds = dst.bounds();
let mut view = dst.slice({
let start =
written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
start..end
});
view.as_mut_rust_slice_full_zeroed()
.copy_from_slice(to_copy);
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
let slice = self
.file
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
.await?;
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
} else {
dst
};
@@ -460,13 +496,15 @@ mod tests {
let gate = utils::sync::gate::Gate::default();
let cancel = CancellationToken::new();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
.await
.unwrap();
let mutable = file.buffered_writer.mutable();
let writer = file.buffered_writer.read().await;
let mutable = writer.mutable();
let cap = mutable.capacity();
let align = mutable.align();
drop(writer);
let write_nbytes = cap * 2 + cap / 2;
@@ -504,10 +542,11 @@ mod tests {
let file_contents = std::fs::read(file.file.path()).unwrap();
assert!(file_contents == content[0..cap * 2]);
let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
let writer = file.buffered_writer.read().await;
let maybe_flushed_buffer_contents = writer.inspect_maybe_flushed().unwrap();
assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);
let mutable_buffer_contents = file.buffered_writer.mutable();
let mutable_buffer_contents = writer.mutable();
assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
}
@@ -517,12 +556,14 @@ mod tests {
let gate = utils::sync::gate::Gate::default();
let cancel = CancellationToken::new();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
.await
.unwrap();
// mutable buffer and maybe_flushed buffer each has `cap` bytes.
let cap = file.buffered_writer.mutable().capacity();
let writer = file.buffered_writer.read().await;
let cap = writer.mutable().capacity();
drop(writer);
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
@@ -540,12 +581,13 @@ mod tests {
2 * cap.into_u64(),
"buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
);
let writer = file.buffered_writer.read().await;
assert_eq!(
&file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
&writer.inspect_maybe_flushed().unwrap()[0..cap],
&content[cap..cap * 2]
);
assert_eq!(
&file.buffered_writer.mutable()[0..cap / 2],
&writer.mutable()[0..cap / 2],
&content[cap * 2..cap * 2 + cap / 2]
);
}
@@ -563,13 +605,15 @@ mod tests {
let gate = utils::sync::gate::Gate::default();
let cancel = CancellationToken::new();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
.await
.unwrap();
let mutable = file.buffered_writer.mutable();
let writer = file.buffered_writer.read().await;
let mutable = writer.mutable();
let cap = mutable.capacity();
let align = mutable.align();
drop(writer);
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(cap * 2 + cap / 2)

View File

@@ -18,6 +18,7 @@
//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart
use anyhow::ensure;
use postgres_ffi::PgMajorVersion;
use serde::{Deserialize, Serialize};
use utils::bin_ser::{BeSer, SerializeError};
use utils::id::TimelineId;
@@ -136,7 +137,7 @@ struct TimelineMetadataBodyV2 {
latest_gc_cutoff_lsn: Lsn,
initdb_lsn: Lsn,
pg_version: u32,
pg_version: PgMajorVersion,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -167,7 +168,7 @@ impl TimelineMetadata {
ancestor_lsn: Lsn,
latest_gc_cutoff_lsn: Lsn,
initdb_lsn: Lsn,
pg_version: u32,
pg_version: PgMajorVersion,
) -> Self {
Self {
hdr: TimelineMetadataHeader {
@@ -215,7 +216,7 @@ impl TimelineMetadata {
ancestor_lsn: body.ancestor_lsn,
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
initdb_lsn: body.initdb_lsn,
pg_version: 14, // All timelines created before this version had pg_version 14
pg_version: PgMajorVersion::PG14, // All timelines created before this version had pg_version 14
};
hdr.format_version = METADATA_FORMAT_VERSION;
@@ -317,7 +318,7 @@ impl TimelineMetadata {
self.body.initdb_lsn
}
pub fn pg_version(&self) -> u32 {
pub fn pg_version(&self) -> PgMajorVersion {
self.body.pg_version
}
@@ -331,7 +332,7 @@ impl TimelineMetadata {
Lsn::from_hex("00000000").unwrap(),
Lsn::from_hex("00000000").unwrap(),
Lsn::from_hex("00000000").unwrap(),
0,
PgMajorVersion::PG14,
);
let bytes = instance.to_bytes().unwrap();
Self::from_bytes(&bytes).unwrap()
@@ -545,13 +546,12 @@ mod tests {
Lsn(0),
Lsn(0),
Lsn(0),
14, // All timelines created before this version had pg_version 14
PgMajorVersion::PG14, // All timelines created before this version had pg_version 14
);
assert_eq!(
deserialized_metadata.body, expected_metadata.body,
"Metadata of the old version {} should be upgraded to the latest version {}",
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
"Metadata of the old version {METADATA_OLD_FORMAT_VERSION} should be upgraded to the latest version {METADATA_FORMAT_VERSION}"
);
}
@@ -566,7 +566,7 @@ mod tests {
Lsn(0),
// Updating this version to 17 will cause the test to fail at the
// next assert_eq!().
16,
PgMajorVersion::PG16,
);
let expected_bytes = vec![
/* TimelineMetadataHeader */

File diff suppressed because it is too large Load Diff

View File

@@ -427,8 +427,8 @@ impl GcBlocking {
#[cfg(test)]
mod tests {
use postgres_ffi::PgMajorVersion;
use std::str::FromStr;
use utils::id::TimelineId;
use super::*;
@@ -831,7 +831,7 @@ mod tests {
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
PgMajorVersion::PG14,
).with_recalculated_checksum().unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
@@ -893,7 +893,7 @@ mod tests {
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
PgMajorVersion::PG14,
).with_recalculated_checksum().unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
@@ -957,7 +957,7 @@ mod tests {
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
PgMajorVersion::PG14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
@@ -1033,7 +1033,7 @@ mod tests {
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
PgMajorVersion::PG14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
@@ -1114,7 +1114,7 @@ mod tests {
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
PgMajorVersion::PG14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
@@ -1199,7 +1199,7 @@ mod tests {
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
PgMajorVersion::PG14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
@@ -1287,7 +1287,7 @@ mod tests {
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
PgMajorVersion::PG14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),

View File

@@ -1,6 +1,7 @@
//! Helper functions to upload files to remote storage with a RemoteStorage
use std::io::{ErrorKind, SeekFrom};
use std::num::NonZeroU32;
use std::time::SystemTime;
use anyhow::{Context, bail};
@@ -228,11 +229,25 @@ pub(crate) async fn time_travel_recover_tenant(
let timelines_path = super::remote_timelines_path(tenant_shard_id);
prefixes.push(timelines_path);
}
// Limit the number of versions deletions, mostly so that we don't
// keep requesting forever if the list is too long, as we'd put the
// list in RAM.
// Building a list of 100k entries that reaches the limit roughly takes
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
for prefix in &prefixes {
backoff::retry(
|| async {
storage
.time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
.time_travel_recover(
Some(prefix),
timestamp,
done_if_after,
cancel,
COMPLEXITY_LIMIT,
)
.await
},
|e| !matches!(e, TimeTravelError::Other(_)),

View File

@@ -1427,7 +1427,7 @@ async fn init_timeline_state(
let local_meta = dentry
.metadata()
.await
.fatal_err(&format!("Read metadata on {}", file_path));
.fatal_err(&format!("Read metadata on {file_path}"));
let file_name = file_path.file_name().expect("created it from the dentry");
if crate::is_temporary(&file_path)

View File

@@ -34,11 +34,11 @@ pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
use pageserver_api::config::GetVectoredConcurrentIo;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
use tracing::{Instrument, info_span, trace};
use utils::lsn::Lsn;
use utils::sync::gate::GateGuard;
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::value::Value;
use self::inmemory_layer::InMemoryLayerFileId;
use super::PageReconstructError;
@@ -109,7 +109,7 @@ pub(crate) enum OnDiskValue {
/// Reconstruct data accumulated for a single key during a vectored get
#[derive(Debug, Default)]
pub(crate) struct VectoredValueReconstructState {
pub struct VectoredValueReconstructState {
pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>,
pub(crate) situation: ValueReconstructSituation,
@@ -244,13 +244,60 @@ impl VectoredValueReconstructState {
res
}
/// Benchmarking utility to await for the completion of all pending ios
///
/// # Cancel-Safety
///
/// Technically fine to stop polling this future, but, the IOs will still
/// be executed to completion by the sidecar task and hold on to / consume resources.
/// Better not do it to make reasonsing about the system easier.
#[cfg(feature = "benchmarking")]
pub async fn sink_pending_ios(self) -> Result<(), std::io::Error> {
let mut res = Ok(());
// We should try hard not to bail early, so that by the time we return from this
// function, all IO for this value is done. It's not required -- we could totally
// stop polling the IO futures in the sidecar task, they need to support that,
// but just stopping to poll doesn't reduce the IO load on the disk. It's easier
// to reason about the system if we just wait for all IO to complete, even if
// we're no longer interested in the result.
//
// Revisit this when IO futures are replaced with a more sophisticated IO system
// and an IO scheduler, where we know which IOs were submitted and which ones
// just queued. Cf the comment on IoConcurrency::spawn_io.
for (_lsn, waiter) in self.on_disk_values {
let value_recv_res = waiter
.wait_completion()
// we rely on the caller to poll us to completion, so this is not a bail point
.await;
match (&mut res, value_recv_res) {
(Err(_), _) => {
// We've already failed, no need to process more.
}
(Ok(_), Err(_wait_err)) => {
// This shouldn't happen - likely the sidecar task panicked.
unreachable!();
}
(Ok(_), Ok(Err(err))) => {
let err: std::io::Error = err;
res = Err(err);
}
(Ok(_ok), Ok(Ok(OnDiskValue::RawImage(_img)))) => {}
(Ok(_ok), Ok(Ok(OnDiskValue::WalRecordOrImage(_buf)))) => {}
}
}
res
}
}
/// Bag of data accumulated during a vectored get..
pub(crate) struct ValuesReconstructState {
pub struct ValuesReconstructState {
/// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
/// should not expect to get anything from this hashmap.
pub(crate) keys: HashMap<Key, VectoredValueReconstructState>,
pub keys: HashMap<Key, VectoredValueReconstructState>,
/// The keys which are already retrieved
keys_done: KeySpaceRandomAccum,
@@ -272,7 +319,7 @@ pub(crate) struct ValuesReconstructState {
/// The desired end state is that we always do parallel IO.
/// This struct and the dispatching in the impl will be removed once
/// we've built enough confidence.
pub(crate) enum IoConcurrency {
pub enum IoConcurrency {
Sequential,
SidecarTask {
task_id: usize,
@@ -317,10 +364,7 @@ impl IoConcurrency {
Self::spawn(SelectedIoConcurrency::Sequential)
}
pub(crate) fn spawn_from_conf(
conf: GetVectoredConcurrentIo,
gate_guard: GateGuard,
) -> IoConcurrency {
pub fn spawn_from_conf(conf: GetVectoredConcurrentIo, gate_guard: GateGuard) -> IoConcurrency {
let selected = match conf {
GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
@@ -425,16 +469,6 @@ impl IoConcurrency {
}
}
pub(crate) fn clone(&self) -> Self {
match self {
IoConcurrency::Sequential => IoConcurrency::Sequential,
IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
task_id: *task_id,
ios_tx: ios_tx.clone(),
},
}
}
/// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string.
///
/// The IO is represented as an opaque future.
@@ -573,6 +607,18 @@ impl IoConcurrency {
}
}
impl Clone for IoConcurrency {
fn clone(&self) -> Self {
match self {
IoConcurrency::Sequential => IoConcurrency::Sequential,
IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
task_id: *task_id,
ios_tx: ios_tx.clone(),
},
}
}
}
/// Make noise in case the [`ValuesReconstructState`] gets dropped while
/// there are still IOs in flight.
/// Refer to `collect_pending_ios` for why we prefer not to do that.
@@ -603,7 +649,7 @@ impl Drop for ValuesReconstructState {
}
impl ValuesReconstructState {
pub(crate) fn new(io_concurrency: IoConcurrency) -> Self {
pub fn new(io_concurrency: IoConcurrency) -> Self {
Self {
keys: HashMap::new(),
keys_done: KeySpaceRandomAccum::new(),

View File

@@ -4,11 +4,11 @@ use std::sync::Arc;
use bytes::Bytes;
use pageserver_api::key::{KEY_SIZE, Key};
use pageserver_api::value::Value;
use tokio_util::sync::CancellationToken;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::shard::TenantShardId;
use wal_decoder::models::value::Value;
use super::errors::PutError;
use super::layer::S3_UPLOAD_LIMIT;

View File

@@ -44,7 +44,6 @@ use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::ImageCompressionAlgorithm;
use pageserver_api::shard::TenantShardId;
use pageserver_api::value::Value;
use serde::{Deserialize, Serialize};
use tokio::sync::OnceCell;
use tokio_epoll_uring::IoBuf;
@@ -54,6 +53,7 @@ use utils::bin_ser::BeSer;
use utils::bin_ser::SerializeError;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use wal_decoder::models::value::Value;
use super::errors::PutError;
use super::{
@@ -783,7 +783,7 @@ impl DeltaLayer {
ctx,
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
.with_context(|| format!("Failed to open file '{path}'"))?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;
@@ -1306,7 +1306,7 @@ impl DeltaLayerInner {
// is it an image or will_init walrecord?
// FIXME: this could be handled by threading the BlobRef to the
// VectoredReadBuilder
let will_init = pageserver_api::value::ValueBytes::will_init(&data)
let will_init = wal_decoder::models::value::ValueBytes::will_init(&data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
@@ -1369,7 +1369,7 @@ impl DeltaLayerInner {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
let wal_desc = wal_decoder::models::record::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
@@ -1401,7 +1401,7 @@ impl DeltaLayerInner {
match val {
Value::Image(img) => {
let checkpoint = CheckPoint::decode(&img)?;
println!(" CHECKPOINT: {:?}", checkpoint);
println!(" CHECKPOINT: {checkpoint:?}");
}
Value::WalRecord(_rec) => {
println!(" unexpected walrecord value for checkpoint key");
@@ -1622,12 +1622,6 @@ impl DeltaLayerIterator<'_> {
pub(crate) mod test {
use std::collections::BTreeMap;
use bytes::Bytes;
use itertools::MinMaxResult;
use pageserver_api::value::Value;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use rand::{Rng, RngCore};
use super::*;
use crate::DEFAULT_PG_VERSION;
use crate::context::DownloadBehavior;
@@ -1635,7 +1629,13 @@ pub(crate) mod test {
use crate::tenant::disk_btree::tests::TestDisk;
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
use crate::tenant::storage_layer::{Layer, ResidentLayer};
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
use crate::tenant::{TenantShard, Timeline};
use bytes::Bytes;
use itertools::MinMaxResult;
use postgres_ffi::PgMajorVersion;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use rand::{Rng, RngCore};
/// Construct an index for a fictional delta layer and and then
/// traverse in order to plan vectored reads for a query. Finally,
@@ -1987,7 +1987,7 @@ pub(crate) mod test {
#[tokio::test]
async fn copy_delta_prefix_smoke() {
use bytes::Bytes;
use pageserver_api::record::NeonWalRecord;
use wal_decoder::models::record::NeonWalRecord;
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
.await
@@ -1995,14 +1995,14 @@ pub(crate) mod test {
let (tenant, ctx) = h.load().await;
let ctx = &ctx;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
.create_test_timeline(TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, ctx)
.await
.unwrap();
let ctx = &ctx.with_scope_timeline(&timeline);
let initdb_layer = timeline
.layers
.read()
.read(crate::tenant::timeline::layer_manager::LayerManagerLockHolder::Testing)
.await
.likely_resident_layers()
.next()
@@ -2078,7 +2078,7 @@ pub(crate) mod test {
let new_layer = timeline
.layers
.read()
.read(LayerManagerLockHolder::Testing)
.await
.likely_resident_layers()
.find(|&x| x != &initdb_layer)

View File

@@ -4,8 +4,8 @@ use std::sync::Arc;
use anyhow::bail;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, SparseKeySpace};
use pageserver_api::value::Value;
use utils::lsn::Lsn;
use wal_decoder::models::value::Value;
use super::PersistentLayerKey;
use super::merge_iterator::{MergeIterator, MergeIteratorItem};
@@ -126,7 +126,6 @@ mod tests {
#[tokio::test]
async fn filter_keyspace_iterator() {
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
.await

View File

@@ -42,7 +42,6 @@ use pageserver_api::config::MaxVectoredReadBytes;
use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_api::value::Value;
use serde::{Deserialize, Serialize};
use tokio::sync::OnceCell;
use tokio_stream::StreamExt;
@@ -52,6 +51,7 @@ use utils::bin_ser::BeSer;
use utils::bin_ser::SerializeError;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use wal_decoder::models::value::Value;
use super::errors::PutError;
use super::layer_name::ImageLayerName;
@@ -272,8 +272,7 @@ impl ImageLayer {
conf.timeline_path(&tenant_shard_id, &timeline_id)
.join(format!(
"{fname}.{:x}.{TEMP_FILE_SUFFIX}",
filename_disambiguator
"{fname}.{filename_disambiguator:x}.{TEMP_FILE_SUFFIX}"
))
}
@@ -370,7 +369,7 @@ impl ImageLayer {
ctx,
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
.with_context(|| format!("Failed to open file '{path}'"))?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;
@@ -1232,10 +1231,10 @@ mod test {
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use pageserver_api::value::Value;
use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use wal_decoder::models::value::Value;
use super::{ImageLayerIterator, ImageLayerWriter};
use crate::DEFAULT_PG_VERSION;
@@ -1475,7 +1474,7 @@ mod test {
assert_eq!(l1, expect_lsn);
assert_eq!(&i1, i2);
}
(o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2),
(o1, o2) => panic!("iterators length mismatch: {o1:?}, {o2:?}"),
}
}
}

View File

@@ -70,23 +70,15 @@ pub struct InMemoryLayer {
/// We use a separate lock for the index to reduce the critical section
/// during which reads cannot be planned.
///
/// If you need access to both the index and the underlying file at the same time,
/// respect the following locking order to avoid deadlocks:
/// 1. [`InMemoryLayer::inner`]
/// 2. [`InMemoryLayer::index`]
///
/// Note that the file backing [`InMemoryLayer::inner`] is append-only,
/// so it is not necessary to hold simultaneous locks on index.
/// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
/// Note that the file backing [`InMemoryLayer::file`] is append-only,
/// so it is not necessary to hold a lock on the index while reading or writing from the file.
/// In particular:
/// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
/// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
/// 1. It is safe to read and release [`InMemoryLayer::index`] before reading from [`InMemoryLayer::file`].
/// 2. It is safe to write to [`InMemoryLayer::file`] before locking and updating [`InMemoryLayer::index`].
index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
/// The above fields never change, except for `end_lsn`, which is only set once,
/// and `index` (see rationale there).
/// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>,
/// Wrapper for the actual on-disk file. Uses interior mutability for concurrent reads/writes.
file: EphemeralFile,
estimated_in_mem_size: AtomicU64,
}
@@ -96,20 +88,10 @@ impl std::fmt::Debug for InMemoryLayer {
f.debug_struct("InMemoryLayer")
.field("start_lsn", &self.start_lsn)
.field("end_lsn", &self.end_lsn)
.field("inner", &self.inner)
.finish()
}
}
pub struct InMemoryLayerInner {
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
/// PerSeg::page_versions map stores offsets into this file.
file: EphemeralFile,
resource_units: GlobalResourceUnits,
}
/// Support the same max blob length as blob_io, because ultimately
/// all the InMemoryLayer contents end up being written into a delta layer,
/// using the [`crate::tenant::blob_io`].
@@ -258,12 +240,6 @@ struct IndexEntryUnpacked {
pos: u64,
}
impl std::fmt::Debug for InMemoryLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("InMemoryLayerInner").finish()
}
}
/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
/// to minimize contention.
///
@@ -280,7 +256,7 @@ pub(crate) struct GlobalResources {
}
// Per-timeline RAII struct for its contribution to [`GlobalResources`]
struct GlobalResourceUnits {
pub(crate) struct GlobalResourceUnits {
// How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
// for decrementing the global counter by this many bytes when dropped.
dirty_bytes: u64,
@@ -292,7 +268,7 @@ impl GlobalResourceUnits {
// updated when the Timeline "ticks" in the background.
const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
fn new() -> Self {
pub(crate) fn new() -> Self {
GLOBAL_RESOURCES
.dirty_layers
.fetch_add(1, AtomicOrdering::Relaxed);
@@ -304,7 +280,7 @@ impl GlobalResourceUnits {
///
/// Returns the effective layer size limit that should be applied, if any, to keep
/// the total number of dirty bytes below the configured maximum.
fn publish_size(&mut self, size: u64) -> Option<u64> {
pub(crate) fn publish_size(&mut self, size: u64) -> Option<u64> {
let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
Ordering::Greater => {
@@ -349,7 +325,7 @@ impl GlobalResourceUnits {
// Call publish_size if the input size differs from last published size by more than
// the drift limit
fn maybe_publish_size(&mut self, size: u64) {
pub(crate) fn maybe_publish_size(&mut self, size: u64) {
let publish = match size.cmp(&self.dirty_bytes) {
Ordering::Equal => false,
Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
@@ -398,8 +374,8 @@ impl InMemoryLayer {
}
}
pub(crate) fn try_len(&self) -> Option<u64> {
self.inner.try_read().map(|i| i.file.len()).ok()
pub(crate) fn len(&self) -> u64 {
self.file.len()
}
pub(crate) fn assert_writable(&self) {
@@ -430,7 +406,7 @@ impl InMemoryLayer {
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
pub(crate) async fn get_values_reconstruct_data(
pub async fn get_values_reconstruct_data(
self: &Arc<InMemoryLayer>,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
@@ -479,14 +455,13 @@ impl InMemoryLayer {
}
}
}
drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
drop(index); // release the lock before we spawn the IO
let read_from = Arc::clone(self);
let read_ctx = ctx.attached_child();
reconstruct_state
.spawn_io(async move {
let inner = read_from.inner.read().await;
let f = vectored_dio_read::execute(
&inner.file,
&read_from.file,
reads
.iter()
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
@@ -518,7 +493,6 @@ impl InMemoryLayer {
// This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
// but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
// drop for consistency among all three layer types.
drop(inner);
drop(read_from);
})
.await;
@@ -537,7 +511,7 @@ fn inmem_layer_log_display(
start_lsn: Lsn,
end_lsn: Lsn,
) -> std::fmt::Result {
write!(f, "timeline {} in-memory ", timeline)?;
write!(f, "timeline {timeline} in-memory ")?;
inmem_layer_display(f, start_lsn, end_lsn)
}
@@ -549,12 +523,6 @@ impl std::fmt::Display for InMemoryLayer {
}
impl InMemoryLayer {
/// Get layer size.
pub async fn size(&self) -> Result<u64> {
let inner = self.inner.read().await;
Ok(inner.file.len())
}
pub fn estimated_in_mem_size(&self) -> u64 {
self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
}
@@ -587,10 +555,7 @@ impl InMemoryLayer {
end_lsn: OnceLock::new(),
opened_at: Instant::now(),
index: RwLock::new(BTreeMap::new()),
inner: RwLock::new(InMemoryLayerInner {
file,
resource_units: GlobalResourceUnits::new(),
}),
file,
estimated_in_mem_size: AtomicU64::new(0),
})
}
@@ -599,41 +564,37 @@ impl InMemoryLayer {
///
/// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
/// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
///
/// This method shall not be called concurrently. We enforce this property via [`crate::tenant::Timeline::write_lock`].
///
/// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
pub async fn put_batch(
&self,
serialized_batch: SerializedValueBatch,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let (base_offset, metadata) = {
let mut inner = self.inner.write().await;
self.assert_writable();
self.assert_writable();
let base_offset = inner.file.len();
let base_offset = self.file.len();
let SerializedValueBatch {
raw,
metadata,
max_lsn: _,
len: _,
} = serialized_batch;
let SerializedValueBatch {
raw,
metadata,
max_lsn: _,
len: _,
} = serialized_batch;
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
// Write the batch to the file
self.file.write_raw(&raw, ctx).await?;
let new_size = self.file.len();
let expected_new_len = base_offset
.checked_add(raw.len().into_u64())
// write_raw would error if we were to overflow u64.
// also IndexEntry and higher levels in
//the code don't allow the file to grow that large
.unwrap();
assert_eq!(new_size, expected_new_len);
inner.resource_units.maybe_publish_size(new_size);
(base_offset, metadata)
};
let expected_new_len = base_offset
.checked_add(raw.len().into_u64())
// write_raw would error if we were to overflow u64.
// also IndexEntry and higher levels in
//the code don't allow the file to grow that large
.unwrap();
assert_eq!(new_size, expected_new_len);
// Update the index with the new entries
let mut index = self.index.write().await;
@@ -686,10 +647,8 @@ impl InMemoryLayer {
self.opened_at
}
pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await;
let size = inner.file.len();
inner.resource_units.publish_size(size)
pub(crate) fn tick(&self) -> Option<u64> {
self.file.tick()
}
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
@@ -753,12 +712,6 @@ impl InMemoryLayer {
gate: &utils::sync::gate::Gate,
cancel: CancellationToken,
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. See the comment on
// [`InMemoryLayer::freeze`] to understand how locking between the append path
// and layer flushing works.
let inner = self.inner.read().await;
let index = self.index.read().await;
use l0_flush::Inner;
@@ -793,7 +746,7 @@ impl InMemoryLayer {
match l0_flush_global_state {
l0_flush::Inner::Direct { .. } => {
let file_contents = inner.file.load_to_io_buf(ctx).await?;
let file_contents = self.file.load_to_io_buf(ctx).await?;
let file_contents = file_contents.freeze();
for (key, vec_map) in index.iter() {

View File

@@ -380,7 +380,7 @@ impl<B: Buffer> std::fmt::Debug for LogicalReadState<B> {
write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer))
}
LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)),
LogicalReadState::Error(e) => write!(f, "Error({:?})", e),
LogicalReadState::Error(e) => write!(f, "Error({e:?})"),
LogicalReadState::Undefined => write!(f, "Undefined"),
}
}

View File

@@ -105,7 +105,7 @@ impl std::fmt::Display for Layer {
impl std::fmt::Debug for Layer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self)
write!(f, "{self}")
}
}

View File

@@ -1,6 +1,7 @@
use std::time::UNIX_EPOCH;
use pageserver_api::key::{CONTROLFILE_KEY, Key};
use postgres_ffi::PgMajorVersion;
use tokio::task::JoinSet;
use utils::completion::{self, Completion};
use utils::id::TimelineId;
@@ -10,6 +11,7 @@ use super::*;
use crate::context::DownloadBehavior;
use crate::tenant::harness::{TenantHarness, test_img};
use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
/// Used in tests to advance a future to wanted await point, and not futher.
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
@@ -44,7 +46,7 @@ async fn smoke_test() {
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
PgMajorVersion::PG14,
&ctx,
Default::default(), // in-memory layers
Default::default(),
@@ -59,7 +61,7 @@ async fn smoke_test() {
// there to avoid the timeline being illegally empty
let (layer, dummy_layer) = {
let mut layers = {
let layers = timeline.layers.read().await;
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
@@ -215,7 +217,7 @@ async fn smoke_test() {
// Simulate GC removing our test layer.
{
let mut g = timeline.layers.write().await;
let mut g = timeline.layers.write(LayerManagerLockHolder::Testing).await;
let layers = &[layer];
g.open_mut().unwrap().finish_gc_timeline(layers);
@@ -255,13 +257,18 @@ async fn evict_and_wait_on_wanted_deleted() {
let (tenant, ctx) = h.load().await;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.create_test_timeline(
TimelineId::generate(),
Lsn(0x10),
PgMajorVersion::PG14,
&ctx,
)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
@@ -305,7 +312,7 @@ async fn evict_and_wait_on_wanted_deleted() {
// assert that once we remove the `layer` from the layer map and drop our reference,
// the deletion of the layer in remote_storage happens.
{
let mut layers = timeline.layers.write().await;
let mut layers = timeline.layers.write(LayerManagerLockHolder::Testing).await;
layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
}
@@ -340,14 +347,19 @@ fn read_wins_pending_eviction() {
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.create_test_timeline(
TimelineId::generate(),
Lsn(0x10),
PgMajorVersion::PG14,
&ctx,
)
.await
.unwrap();
let ctx = ctx.with_scope_timeline(&timeline);
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
@@ -473,14 +485,19 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.create_test_timeline(
TimelineId::generate(),
Lsn(0x10),
PgMajorVersion::PG14,
&ctx,
)
.await
.unwrap();
let ctx = ctx.with_scope_timeline(&timeline);
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
@@ -643,7 +660,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let (tenant, ctx) = h.load().await;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.create_test_timeline(
TimelineId::generate(),
Lsn(0x10),
PgMajorVersion::PG14,
&ctx,
)
.await
.unwrap();
let ctx = ctx.with_scope_timeline(&timeline);
@@ -655,7 +677,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
@@ -729,7 +751,12 @@ async fn evict_and_wait_does_not_wait_for_download() {
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.create_test_timeline(
TimelineId::generate(),
Lsn(0x10),
PgMajorVersion::PG14,
&ctx,
)
.await
.unwrap();
let ctx = ctx.with_scope_timeline(&timeline);
@@ -741,7 +768,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
@@ -823,7 +850,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
#[tokio::test(start_paused = true)]
async fn eviction_cancellation_on_drop() {
use bytes::Bytes;
use pageserver_api::value::Value;
use wal_decoder::models::value::Value;
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
@@ -835,7 +862,12 @@ async fn eviction_cancellation_on_drop() {
let (tenant, ctx) = h.load().await;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.create_test_timeline(
TimelineId::generate(),
Lsn(0x10),
PgMajorVersion::PG14,
&ctx,
)
.await
.unwrap();
@@ -862,7 +894,7 @@ async fn eviction_cancellation_on_drop() {
let (evicted_layer, not_evicted) = {
let mut layers = {
let mut guard = timeline.layers.write().await;
let mut guard = timeline.layers.write(LayerManagerLockHolder::Testing).await;
let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
// remove the layers from layermap
guard.open_mut().unwrap().finish_gc_timeline(&layers);

View File

@@ -4,8 +4,8 @@ use std::sync::Arc;
use anyhow::bail;
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use utils::lsn::Lsn;
use wal_decoder::models::value::Value;
use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator};
use super::image_layer::{ImageLayerInner, ImageLayerIterator};
@@ -402,9 +402,9 @@ impl<'a> MergeIterator<'a> {
mod tests {
use itertools::Itertools;
use pageserver_api::key::Key;
#[cfg(feature = "testing")]
use pageserver_api::record::NeonWalRecord;
use utils::lsn::Lsn;
#[cfg(feature = "testing")]
use wal_decoder::models::record::NeonWalRecord;
use super::*;
use crate::DEFAULT_PG_VERSION;
@@ -436,7 +436,6 @@ mod tests {
#[tokio::test]
async fn merge_in_between() {
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("merge_iterator_merge_in_between")
.await
@@ -501,7 +500,6 @@ mod tests {
#[tokio::test]
async fn delta_merge() {
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("merge_iterator_delta_merge")
.await
@@ -578,7 +576,6 @@ mod tests {
#[tokio::test]
async fn delta_image_mixed_merge() {
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
.await

View File

@@ -35,7 +35,11 @@ use fail::fail_point;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, StreamExt};
use handle::ShardTimelineId;
use layer_manager::Shutdown;
use layer_manager::{
LayerManagerLockHolder, LayerManagerReadGuard, LayerManagerWriteGuard, LockedLayerManager,
Shutdown,
};
use offload::OffloadError;
use once_cell::sync::Lazy;
use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
@@ -52,11 +56,9 @@ use pageserver_api::models::{
};
use pageserver_api::reltag::{BlockNumber, RelTag};
use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
#[cfg(test)]
use pageserver_api::value::Value;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::v14::xlog_utils;
use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp};
use postgres_ffi::{PgMajorVersion, WAL_SEGMENT_SIZE, to_pg_timestamp};
use rand::Rng;
use remote_storage::DownloadError;
use serde_with::serde_as;
@@ -77,12 +79,13 @@ use utils::seqwait::SeqWait;
use utils::simple_rcu::{Rcu, RcuReadGuard};
use utils::sync::gate::{Gate, GateGuard};
use utils::{completion, critical, fs_ext, pausable_failpoint};
#[cfg(test)]
use wal_decoder::models::value::Value;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
use self::eviction_task::EvictionTaskTimelineState;
use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::remote_timeline_client::RemoteTimelineClient;
@@ -92,12 +95,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
use super::tasks::log_compaction_error;
use super::upload_queue::NotInitialized;
use super::{
AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
debug_assert_current_span_has_tenant_and_timeline_id,
};
use crate::PERF_TRACE_TARGET;
use crate::aux_file::AuxFileSizeEstimator;
use crate::basebackup_cache::BasebackupPrepareRequest;
use crate::basebackup_cache::BasebackupCache;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -175,19 +178,19 @@ pub enum LastImageLayerCreationStatus {
impl std::fmt::Display for ImageLayerCreationMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self)
write!(f, "{self:?}")
}
}
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
fn drop_layer_manager_rlock(rlock: LayerManagerReadGuard<'_>) {
drop(rlock)
}
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
fn drop_layer_manager_wlock(rlock: LayerManagerWriteGuard<'_>) {
drop(rlock)
}
@@ -198,7 +201,7 @@ pub struct TimelineResources {
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
pub l0_compaction_trigger: Arc<Notify>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
pub basebackup_prepare_sender: BasebackupPrepareSender,
pub basebackup_cache: Arc<BasebackupCache>,
pub feature_resolver: FeatureResolver,
}
@@ -222,7 +225,7 @@ pub struct Timeline {
/// to shards, and is constant through the lifetime of this Timeline.
shard_identity: ShardIdentity,
pub pg_version: u32,
pub pg_version: PgMajorVersion,
/// The tuple has two elements.
/// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote).
@@ -241,7 +244,7 @@ pub struct Timeline {
///
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
pub(crate) layers: LockedLayerManager,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
@@ -445,7 +448,7 @@ pub struct Timeline {
wait_lsn_log_slow: tokio::sync::Semaphore,
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
basebackup_prepare_sender: BasebackupPrepareSender,
basebackup_cache: Arc<BasebackupCache>,
feature_resolver: FeatureResolver,
}
@@ -629,7 +632,7 @@ pub enum ReadPathLayerId {
impl std::fmt::Display for ReadPathLayerId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key),
ReadPathLayerId::PersistentLayer(key) => write!(f, "{key}"),
ReadPathLayerId::InMemoryLayer(range) => {
write!(f, "in-mem {}..{}", range.start, range.end)
}
@@ -705,7 +708,7 @@ impl MissingKeyError {
impl std::fmt::Debug for MissingKeyError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self)
write!(f, "{self}")
}
}
@@ -718,19 +721,19 @@ impl std::fmt::Display for MissingKeyError {
)?;
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
write!(f, ", ancestor {}", ancestor_lsn)?;
write!(f, ", ancestor {ancestor_lsn}")?;
}
if let Some(ref query) = self.query {
write!(f, ", query {}", query)?;
write!(f, ", query {query}")?;
}
if let Some(ref read_path) = self.read_path {
write!(f, "\n{}", read_path)?;
write!(f, "\n{read_path}")?;
}
if let Some(ref backtrace) = self.backtrace {
write!(f, "\n{}", backtrace)?;
write!(f, "\n{backtrace}")?;
}
Ok(())
@@ -813,7 +816,7 @@ impl From<layer_manager::Shutdown> for FlushLayerError {
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetVectoredError {
pub enum GetVectoredError {
#[error("timeline shutting down")]
Cancelled,
@@ -846,7 +849,7 @@ impl From<GetReadyAncestorError> for GetVectoredError {
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetReadyAncestorError {
pub enum GetReadyAncestorError {
#[error("ancestor LSN wait error")]
AncestorLsnTimeout(#[from] WaitLsnError),
@@ -936,7 +939,7 @@ impl std::fmt::Debug for Timeline {
}
#[derive(thiserror::Error, Debug, Clone)]
pub(crate) enum WaitLsnError {
pub enum WaitLsnError {
// Called on a timeline which is shutting down
#[error("Shutdown")]
Shutdown,
@@ -1055,8 +1058,8 @@ pub(crate) enum WaitLsnWaiter<'a> {
/// Argument to [`Timeline::shutdown`].
#[derive(Debug, Clone, Copy)]
pub(crate) enum ShutdownMode {
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk. This method can
/// take multiple seconds for a busy timeline.
///
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
/// the call to [`Timeline::shutdown`].
@@ -1535,7 +1538,10 @@ impl Timeline {
/// This method makes no distinction between local and remote layers.
/// Hence, the result **does not represent local filesystem usage**.
pub(crate) async fn layer_size_sum(&self) -> u64 {
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
guard.layer_size_sum()
}
@@ -1845,7 +1851,7 @@ impl Timeline {
// time, and this was missed.
// if write_guard.is_none() { return; }
let Ok(layers_guard) = self.layers.try_read() else {
let Ok(layers_guard) = self.layers.try_read(LayerManagerLockHolder::TryFreezeLayer) else {
// Don't block if the layer lock is busy
return;
};
@@ -1896,16 +1902,11 @@ impl Timeline {
return;
};
let Some(current_size) = open_layer.try_len() else {
// Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
// read lock to get size should always succeed.
tracing::warn!("Lock conflict while reading size of open layer");
return;
};
let current_size = open_layer.len();
let current_lsn = self.get_last_record_lsn();
let checkpoint_distance_override = open_layer.tick().await;
let checkpoint_distance_override = open_layer.tick();
if let Some(size_override) = checkpoint_distance_override {
if current_size > size_override {
@@ -2158,7 +2159,7 @@ impl Timeline {
if let ShutdownMode::FreezeAndFlush = mode {
let do_flush = if let Some((open, frozen)) = self
.layers
.read()
.read(LayerManagerLockHolder::Shutdown)
.await
.layer_map()
.map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
@@ -2262,7 +2263,10 @@ impl Timeline {
// Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate
// open.
let mut write_guard = self.write_lock.lock().await;
self.layers.write().await.shutdown(&mut write_guard);
self.layers
.write(LayerManagerLockHolder::Shutdown)
.await
.shutdown(&mut write_guard);
}
// Finally wait until any gate-holders are complete.
@@ -2365,7 +2369,10 @@ impl Timeline {
&self,
reset: LayerAccessStatsReset,
) -> Result<LayerMapInfo, layer_manager::Shutdown> {
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
let layer_map = guard.layer_map()?;
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
if let Some(open_layer) = &layer_map.open_layer {
@@ -2493,6 +2500,13 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
}
/// Try to get a basebackup from the on-disk cache.
pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option<tokio::fs::File> {
self.basebackup_cache
.get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn)
.await
}
/// Prepare basebackup for the given LSN and store it in the basebackup cache.
/// The method is asynchronous and returns immediately.
/// The actual basebackup preparation is performed in the background
@@ -2506,18 +2520,16 @@ impl Timeline {
// Preparing basebackup doesn't make sense for shards other than shard zero.
return;
}
let res = self
.basebackup_prepare_sender
.send(BasebackupPrepareRequest {
tenant_shard_id: self.tenant_shard_id,
timeline_id: self.timeline_id,
lsn,
});
if let Err(e) = res {
// May happen during shutdown, it's not critical.
info!("Failed to send shutdown checkpoint: {e:#}");
if !self.is_active() {
// May happen during initial timeline creation.
// Such timeline is not in the global timeline map yet,
// so basebackup cache will not be able to find it.
// TODO(diko): We can prepare such timelines in finish_creation().
return;
}
self.basebackup_cache
.send_prepare(self.tenant_shard_id, self.timeline_id, lsn);
}
}
@@ -2899,7 +2911,7 @@ impl Timeline {
shard_identity: ShardIdentity,
walredo_mgr: Option<Arc<super::WalRedoManager>>,
resources: TimelineResources,
pg_version: u32,
pg_version: PgMajorVersion,
state: TimelineState,
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
create_idempotency: crate::tenant::CreateTimelineIdempotency,
@@ -3074,7 +3086,7 @@ impl Timeline {
wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
basebackup_prepare_sender: resources.basebackup_prepare_sender,
basebackup_cache: resources.basebackup_cache,
feature_resolver: resources.feature_resolver,
};
@@ -3225,7 +3237,7 @@ impl Timeline {
/// Initialize with an empty layer map. Used when creating a new timeline.
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
let mut layers = self.layers.try_write().expect(
let mut layers = self.layers.try_write(LayerManagerLockHolder::Init).expect(
"in the context where we call this function, no other task has access to the object",
);
layers
@@ -3245,7 +3257,10 @@ impl Timeline {
use init::Decision::*;
use init::{Discovered, DismissedLayer};
let mut guard = self.layers.write().await;
let mut guard = self
.layers
.write(LayerManagerLockHolder::LoadLayerMap)
.await;
let timer = self.metrics.load_layer_map_histo.start_timer();
@@ -3400,10 +3415,6 @@ impl Timeline {
// TenantShard::create_timeline will wait for these uploads to happen before returning, or
// on retry.
// Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
drop(guard); // drop write lock, update_layer_visibility will take a read lock.
self.update_layer_visibility().await?;
info!(
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
@@ -3862,7 +3873,10 @@ impl Timeline {
&self,
layer_name: &LayerName,
) -> Result<Option<Layer>, layer_manager::Shutdown> {
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
let layer = guard
.layer_map()?
.iter_historic_layers()
@@ -3895,7 +3909,10 @@ impl Timeline {
return None;
}
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GenerateHeatmap)
.await;
// Firstly, if there's any heatmap left over from when this location
// was a secondary, take that into account. Keep layers that are:
@@ -3993,7 +4010,10 @@ impl Timeline {
}
pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap {
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GenerateHeatmap)
.await;
let now = SystemTime::now();
let mut heatmap_layers = Vec::default();
@@ -4335,7 +4355,7 @@ impl Timeline {
query: &VersionedKeySpaceQuery,
) -> Result<LayerFringe, GetVectoredError> {
let mut fringe = LayerFringe::new();
let guard = self.layers.read().await;
let guard = self.layers.read(LayerManagerLockHolder::GetPage).await;
match query {
VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
@@ -4438,7 +4458,7 @@ impl Timeline {
// required for correctness, but avoids visiting extra layers
// which turns out to be a perf bottleneck in some cases.
if !unmapped_keyspace.is_empty() {
let guard = timeline.layers.read().await;
let guard = timeline.layers.read(LayerManagerLockHolder::GetPage).await;
guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;
// It's safe to drop the layer map lock after planning the next round of reads.
@@ -4548,7 +4568,10 @@ impl Timeline {
_guard: &tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut guard = self.layers.write().await;
let mut guard = self
.layers
.write(LayerManagerLockHolder::GetLayerForWrite)
.await;
let last_record_lsn = self.get_last_record_lsn();
ensure!(
@@ -4590,7 +4613,10 @@ impl Timeline {
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
) -> Result<u64, FlushLayerError> {
let frozen = {
let mut guard = self.layers.write().await;
let mut guard = self
.layers
.write(LayerManagerLockHolder::TryFreezeLayer)
.await;
guard
.open_mut()?
.try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics)
@@ -4631,7 +4657,12 @@ impl Timeline {
ctx: &RequestContext,
) {
// Subscribe to L0 delta layer updates, for compaction backpressure.
let mut watch_l0 = match self.layers.read().await.layer_map() {
let mut watch_l0 = match self
.layers
.read(LayerManagerLockHolder::FlushLoop)
.await
.layer_map()
{
Ok(lm) => lm.watch_level0_deltas(),
Err(Shutdown) => return,
};
@@ -4668,7 +4699,7 @@ impl Timeline {
// Fetch the next layer to flush, if any.
let (layer, l0_count, frozen_count, frozen_size) = {
let layers = self.layers.read().await;
let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await;
let Ok(lm) = layers.layer_map() else {
info!("dropping out of flush loop for timeline shutdown");
return;
@@ -4964,7 +4995,10 @@ impl Timeline {
// in-memory layer from the map now. The flushed layer is stored in
// the mapping in `create_delta_layer`.
{
let mut guard = self.layers.write().await;
let mut guard = self
.layers
.write(LayerManagerLockHolder::FlushFrozenLayer)
.await;
guard.open_mut()?.finish_flush_l0_layer(
delta_layer_to_add.as_ref(),
@@ -5166,7 +5200,11 @@ impl Timeline {
}
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
let dense_partitioning = dense_ks.partition(
&self.shard_identity,
partition_size,
postgres_ffi::BLCKSZ as u64,
);
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
}; // no partitioning for metadata keys for now
@@ -5179,7 +5217,7 @@ impl Timeline {
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
let threshold = self.get_image_creation_threshold();
let guard = self.layers.read().await;
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
let Ok(layers) = guard.layer_map() else {
return false;
};
@@ -5597,7 +5635,7 @@ impl Timeline {
if let ImageLayerCreationMode::Force = mode {
// When forced to create image layers, we might try and create them where they already
// exist. This mode is only used in tests/debug.
let layers = self.layers.read().await;
let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
if layers.contains_key(&PersistentLayerKey {
key_range: img_range.clone(),
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
@@ -5722,7 +5760,7 @@ impl Timeline {
let image_layers = batch_image_writer.finish(self, ctx).await?;
let mut guard = self.layers.write().await;
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;
// FIXME: we could add the images to be uploaded *before* returning from here, but right
// now they are being scheduled outside of write lock; current way is inconsistent with
@@ -5730,7 +5768,7 @@ impl Timeline {
guard
.open_mut()?
.track_new_image_layers(&image_layers, &self.metrics);
drop_wlock(guard);
drop_layer_manager_wlock(guard);
let duration = timer.stop_and_record();
// Creating image layers may have caused some previously visible layers to be covered
@@ -5894,7 +5932,7 @@ impl Drop for Timeline {
if let Ok(mut gc_info) = ancestor.gc_info.write() {
if !gc_info.remove_child_not_offloaded(self.timeline_id) {
tracing::error!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id,
"Couldn't remove retain_lsn entry from offloaded timeline's parent: already removed");
"Couldn't remove retain_lsn entry from timeline's parent on drop: already removed");
}
}
}
@@ -6100,7 +6138,7 @@ impl Timeline {
layers_to_remove: &[Layer],
) -> Result<(), CompactionError> {
let mut guard = tokio::select! {
guard = self.layers.write() => guard,
guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
_ = self.cancel.cancelled() => {
return Err(CompactionError::ShuttingDown);
}
@@ -6149,7 +6187,7 @@ impl Timeline {
self.remote_client
.schedule_compaction_update(&remove_layers, new_deltas)?;
drop_wlock(guard);
drop_layer_manager_wlock(guard);
Ok(())
}
@@ -6159,7 +6197,7 @@ impl Timeline {
mut replace_layers: Vec<(Layer, ResidentLayer)>,
mut drop_layers: Vec<Layer>,
) -> Result<(), CompactionError> {
let mut guard = self.layers.write().await;
let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;
// Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
// to avoid double-removing, and avoid rewriting something that was removed.
@@ -6498,7 +6536,7 @@ impl Timeline {
debug!("retain_lsns: {:?}", retain_lsns);
let mut layers_to_remove = Vec::new();
let max_retain_lsn = retain_lsns.iter().max();
// Scan all layers in the timeline (remote or on-disk).
//
@@ -6508,105 +6546,110 @@ impl Timeline {
// 3. it doesn't need to be retained for 'retain_lsns';
// 4. it does not need to be kept for LSNs holding valid leases.
// 5. newer on-disk image layers cover the layer's whole key range
//
// TODO holding a write lock is too agressive and avoidable
let mut guard = self.layers.write().await;
let layers = guard.layer_map()?;
'outer: for l in layers.iter_historic_layers() {
result.layers_total += 1;
let layers_to_remove = {
let mut layers_to_remove = Vec::new();
// 1. Is it newer than GC horizon cutoff point?
if l.get_lsn_range().end > space_cutoff {
info!(
"keeping {} because it's newer than space_cutoff {}",
l.layer_name(),
space_cutoff,
);
result.layers_needed_by_cutoff += 1;
continue 'outer;
}
let guard = self
.layers
.read(LayerManagerLockHolder::GarbageCollection)
.await;
let layers = guard.layer_map()?;
'outer: for l in layers.iter_historic_layers() {
result.layers_total += 1;
// 2. It is newer than PiTR cutoff point?
if l.get_lsn_range().end > time_cutoff {
info!(
"keeping {} because it's newer than time_cutoff {}",
l.layer_name(),
time_cutoff,
);
result.layers_needed_by_pitr += 1;
continue 'outer;
}
// 3. Is it needed by a child branch?
// NOTE With that we would keep data that
// might be referenced by child branches forever.
// We can track this in child timeline GC and delete parent layers when
// they are no longer needed. This might be complicated with long inheritance chains.
//
// TODO Vec is not a great choice for `retain_lsns`
for retain_lsn in &retain_lsns {
// start_lsn is inclusive
if &l.get_lsn_range().start <= retain_lsn {
info!(
"keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
// 1. Is it newer than GC horizon cutoff point?
if l.get_lsn_range().end > space_cutoff {
debug!(
"keeping {} because it's newer than space_cutoff {}",
l.layer_name(),
retain_lsn,
l.is_incremental(),
space_cutoff,
);
result.layers_needed_by_branches += 1;
result.layers_needed_by_cutoff += 1;
continue 'outer;
}
}
// 4. Is there a valid lease that requires us to keep this layer?
if let Some(lsn) = &max_lsn_with_valid_lease {
// keep if layer start <= any of the lease
if &l.get_lsn_range().start <= lsn {
info!(
"keeping {} because there is a valid lease preventing GC at {}",
// 2. It is newer than PiTR cutoff point?
if l.get_lsn_range().end > time_cutoff {
debug!(
"keeping {} because it's newer than time_cutoff {}",
l.layer_name(),
lsn,
time_cutoff,
);
result.layers_needed_by_leases += 1;
result.layers_needed_by_pitr += 1;
continue 'outer;
}
// 3. Is it needed by a child branch?
// NOTE With that we would keep data that
// might be referenced by child branches forever.
// We can track this in child timeline GC and delete parent layers when
// they are no longer needed. This might be complicated with long inheritance chains.
if let Some(retain_lsn) = max_retain_lsn {
// start_lsn is inclusive
if &l.get_lsn_range().start <= retain_lsn {
debug!(
"keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
l.layer_name(),
retain_lsn,
l.is_incremental(),
);
result.layers_needed_by_branches += 1;
continue 'outer;
}
}
// 4. Is there a valid lease that requires us to keep this layer?
if let Some(lsn) = &max_lsn_with_valid_lease {
// keep if layer start <= any of the lease
if &l.get_lsn_range().start <= lsn {
debug!(
"keeping {} because there is a valid lease preventing GC at {}",
l.layer_name(),
lsn,
);
result.layers_needed_by_leases += 1;
continue 'outer;
}
}
// 5. Is there a later on-disk layer for this relation?
//
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
//
// For example, imagine that the following layers exist:
//
// 1000 - image (A)
// 1000-2000 - delta (B)
// 2000 - image (C)
// 2000-3000 - delta (D)
// 3000 - image (E)
//
// If GC horizon is at 2500, we can remove layers A and B, but
// we cannot remove C, even though it's older than 2500, because
// the delta layer 2000-3000 depends on it.
if !layers
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
{
debug!("keeping {} because it is the latest layer", l.layer_name());
result.layers_not_updated += 1;
continue 'outer;
}
// We didn't find any reason to keep this file, so remove it.
info!(
"garbage collecting {} is_dropped: xx is_incremental: {}",
l.layer_name(),
l.is_incremental(),
);
layers_to_remove.push(l);
}
// 5. Is there a later on-disk layer for this relation?
//
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
//
// For example, imagine that the following layers exist:
//
// 1000 - image (A)
// 1000-2000 - delta (B)
// 2000 - image (C)
// 2000-3000 - delta (D)
// 3000 - image (E)
//
// If GC horizon is at 2500, we can remove layers A and B, but
// we cannot remove C, even though it's older than 2500, because
// the delta layer 2000-3000 depends on it.
if !layers
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
{
info!("keeping {} because it is the latest layer", l.layer_name());
result.layers_not_updated += 1;
continue 'outer;
}
// We didn't find any reason to keep this file, so remove it.
info!(
"garbage collecting {} is_dropped: xx is_incremental: {}",
l.layer_name(),
l.is_incremental(),
);
layers_to_remove.push(l);
}
layers_to_remove
};
if !layers_to_remove.is_empty() {
// Persist the new GC cutoff value before we actually remove anything.
@@ -6622,15 +6665,19 @@ impl Timeline {
}
})?;
let mut guard = self
.layers
.write(LayerManagerLockHolder::GarbageCollection)
.await;
let gc_layers = layers_to_remove
.iter()
.map(|x| guard.get_from_desc(x))
.flat_map(|desc| guard.try_get_from_key(&desc.key()).cloned())
.collect::<Vec<Layer>>();
result.layers_removed = gc_layers.len() as u64;
self.remote_client.schedule_gc_update(&gc_layers)?;
guard.open_mut()?.finish_gc_timeline(&gc_layers);
#[cfg(feature = "testing")]
@@ -6812,7 +6859,10 @@ impl Timeline {
use pageserver_api::models::DownloadRemoteLayersTaskState;
let remaining = {
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
let Ok(lm) = guard.layer_map() else {
// technically here we could look into iterating accessible layers, but downloading
// all layers of a shutdown timeline makes no sense regardless.
@@ -6918,7 +6968,7 @@ impl Timeline {
impl Timeline {
/// Returns non-remote layers for eviction.
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
let guard = self.layers.read().await;
let guard = self.layers.read(LayerManagerLockHolder::Eviction).await;
let mut max_layer_size: Option<u64> = None;
let resident_layers = guard
@@ -7019,7 +7069,7 @@ impl Timeline {
let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!("force created image layer {}", image_layer.local_path());
{
let mut guard = self.layers.write().await;
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
guard
.open_mut()
.unwrap()
@@ -7082,7 +7132,7 @@ impl Timeline {
let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
info!("force created delta layer {}", delta_layer.local_path());
{
let mut guard = self.layers.write().await;
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
guard
.open_mut()
.unwrap()
@@ -7127,9 +7177,7 @@ impl Timeline {
if let Some(end) = layer_end_lsn {
assert!(
end <= last_record_lsn,
"advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
end,
last_record_lsn,
"advance last record lsn before inserting a layer, end_lsn={end}, last_record_lsn={last_record_lsn}",
);
}
@@ -7177,7 +7225,7 @@ impl Timeline {
// Link the layer to the layer map
{
let mut guard = self.layers.write().await;
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
let layer_map = guard.open_mut().unwrap();
layer_map.force_insert_in_memory_layer(Arc::new(layer));
}
@@ -7194,7 +7242,7 @@ impl Timeline {
io_concurrency: IoConcurrency,
) -> anyhow::Result<Vec<(Key, Bytes)>> {
let mut all_data = Vec::new();
let guard = self.layers.read().await;
let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
for layer in guard.layer_map()?.iter_historic_layers() {
if !layer.is_delta() && layer.image_layer_lsn() == lsn {
let layer = guard.get_from_desc(&layer);
@@ -7223,7 +7271,7 @@ impl Timeline {
self: &Arc<Timeline>,
) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
let mut layers = Vec::new();
let guard = self.layers.read().await;
let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
for layer in guard.layer_map()?.iter_historic_layers() {
layers.push(layer.key());
}
@@ -7315,7 +7363,7 @@ impl TimelineWriter<'_> {
.tl
.get_layer_for_write(at, &self.write_guard, ctx)
.await?;
let initial_size = layer.size().await?;
let initial_size = layer.len();
let last_freeze_at = self.last_freeze_at.load();
self.write_guard.replace(TimelineWriterState::new(
@@ -7335,7 +7383,7 @@ impl TimelineWriter<'_> {
let l0_count = self
.tl
.layers
.read()
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await
.layer_map()?
.level0_deltas()
@@ -7543,17 +7591,19 @@ mod tests {
use std::sync::Arc;
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use postgres_ffi::PgMajorVersion;
use std::iter::Iterator;
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use wal_decoder::models::value::Value;
use super::HeatMapTimeline;
use crate::context::RequestContextBuilder;
use crate::tenant::harness::{TenantHarness, test_img};
use crate::tenant::layer_map::LayerMap;
use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint};
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError};
use crate::tenant::{PreviousHeatmap, Timeline};
@@ -7616,7 +7666,7 @@ mod tests {
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
PgMajorVersion::PG14,
&ctx,
Vec::new(), // in-memory layers
delta_layers,
@@ -7661,7 +7711,7 @@ mod tests {
// Evict all the layers and stash the old heatmap in the timeline.
// This simulates a migration to a cold secondary location.
let guard = timeline.layers.read().await;
let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await;
let mut all_layers = Vec::new();
let forever = std::time::Duration::from_secs(120);
for layer in guard.likely_resident_layers() {
@@ -7752,7 +7802,7 @@ mod tests {
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
PgMajorVersion::PG14,
&ctx,
Vec::new(), // in-memory layers
delta_layers,
@@ -7783,7 +7833,7 @@ mod tests {
})));
// Evict all the layers in the previous heatmap
let guard = timeline.layers.read().await;
let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await;
let forever = std::time::Duration::from_secs(120);
for layer in guard.likely_resident_layers() {
layer.evict_and_wait(forever).await.unwrap();
@@ -7812,7 +7862,12 @@ mod tests {
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.create_test_timeline(
TimelineId::generate(),
Lsn(0x10),
PgMajorVersion::PG14,
&ctx,
)
.await
.unwrap();
@@ -7846,7 +7901,10 @@ mod tests {
}
async fn find_some_layer(timeline: &Timeline) -> Layer {
let layers = timeline.layers.read().await;
let layers = timeline
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
let desc = layers
.layer_map()
.unwrap()

View File

@@ -4,6 +4,7 @@ use std::ops::Range;
use utils::lsn::Lsn;
use super::Timeline;
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
#[derive(serde::Serialize)]
pub(crate) struct RangeAnalysis {
@@ -24,7 +25,10 @@ impl Timeline {
let num_of_l0;
let all_layer_files = {
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
num_of_l0 = guard.layer_map().unwrap().level0_deltas().len();
guard.all_persistent_layers()
};

View File

@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
use std::sync::Arc;
use std::time::{Duration, Instant};
use super::layer_manager::LayerManager;
use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard};
use super::{
CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
@@ -29,9 +29,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
use pageserver_api::key::{KEY_SIZE, Key};
use pageserver_api::keyspace::{KeySpace, ShardedRange};
use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use pageserver_api::value::Value;
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
use pageserver_compaction::interface::*;
use serde::Serialize;
@@ -41,6 +39,8 @@ use tracing::{Instrument, debug, error, info, info_span, trace, warn};
use utils::critical;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::value::Value;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
@@ -62,7 +62,7 @@ use crate::tenant::storage_layer::{
use crate::tenant::tasks::log_compaction_error;
use crate::tenant::timeline::{
DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
ResidentLayer, drop_rlock,
ResidentLayer, drop_layer_manager_rlock,
};
use crate::tenant::{DeltaLayer, MaybeOffloaded};
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -314,7 +314,10 @@ impl GcCompactionQueue {
.unwrap_or(Lsn::INVALID);
let layers = {
let guard = timeline.layers.read().await;
let guard = timeline
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
let layer_map = guard.layer_map()?;
layer_map.iter_historic_layers().collect_vec()
};
@@ -408,7 +411,10 @@ impl GcCompactionQueue {
timeline: &Arc<Timeline>,
lsn: Lsn,
) -> Result<u64, CompactionError> {
let guard = timeline.layers.read().await;
let guard = timeline
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
let layer_map = guard.layer_map()?;
let layers = layer_map.iter_historic_layers().collect_vec();
let mut size = 0;
@@ -851,7 +857,7 @@ impl KeyHistoryRetention {
}
let layer_generation;
{
let guard = tline.layers.read().await;
let guard = tline.layers.read(LayerManagerLockHolder::Compaction).await;
if !guard.contains_key(key) {
return false;
}
@@ -971,7 +977,7 @@ impl KeyHistoryRetention {
tline
.reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
.await
.with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
.with_context(|| format!("verification failed for key {key} at lsn {lsn}"))?;
Ok(())
}
@@ -1282,7 +1288,10 @@ impl Timeline {
// We do the repartition on the L0-L1 boundary. All data below the boundary
// are compacted by L0 with low read amplification, thus making the `repartition`
// function run fast.
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
guard
.all_persistent_layers()
.iter()
@@ -1461,7 +1470,7 @@ impl Timeline {
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
let layers = self.layers.read().await;
let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
let layers_iter = layers.layer_map()?.iter_historic_layers();
let (layers_total, mut layers_checked) = (layers_iter.len(), 0);
for layer_desc in layers_iter {
@@ -1722,7 +1731,10 @@ impl Timeline {
// are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
// Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
// they will be subject to L0->L1 compaction in the near future.
let layer_manager = self.layers.read().await;
let layer_manager = self
.layers
.read(LayerManagerLockHolder::GetLayerMapInfo)
.await;
let layer_map = layer_manager.layer_map()?;
let readable_points = {
@@ -1775,7 +1787,7 @@ impl Timeline {
};
let begin = tokio::time::Instant::now();
let phase1_layers_locked = self.layers.read().await;
let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await;
let now = tokio::time::Instant::now();
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
@@ -1803,7 +1815,7 @@ impl Timeline {
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
async fn compact_level0_phase1<'a>(
self: &'a Arc<Self>,
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
guard: LayerManagerReadGuard<'a>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
force_compaction_ignore_threshold: bool,
@@ -2029,7 +2041,7 @@ impl Timeline {
holes
};
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
drop_rlock(guard);
drop_layer_manager_rlock(guard);
if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
@@ -2469,7 +2481,7 @@ impl Timeline {
// Find the top of the historical layers
let end_lsn = {
let guard = self.layers.read().await;
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
let layers = guard.layer_map()?;
let l0_deltas = layers.level0_deltas();
@@ -2635,15 +2647,15 @@ impl Timeline {
use std::fmt::Write;
let mut output = String::new();
if let Some((key, _, _)) = replay_history.first() {
write!(output, "key={} ", key).unwrap();
write!(output, "key={key} ").unwrap();
let mut cnt = 0;
for (_, lsn, val) in replay_history {
if val.is_image() {
write!(output, "i@{} ", lsn).unwrap();
write!(output, "i@{lsn} ").unwrap();
} else if val.will_init() {
write!(output, "di@{} ", lsn).unwrap();
write!(output, "di@{lsn} ").unwrap();
} else {
write!(output, "d@{} ", lsn).unwrap();
write!(output, "d@{lsn} ").unwrap();
}
cnt += 1;
if cnt >= 128 {
@@ -3008,7 +3020,7 @@ impl Timeline {
}
split_key_ranges.sort();
let all_layers = {
let guard = self.layers.read().await;
let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
let layer_map = guard.layer_map()?;
layer_map.iter_historic_layers().collect_vec()
};
@@ -3112,12 +3124,12 @@ impl Timeline {
.await?;
let jobs_len = jobs.len();
for (idx, job) in jobs.into_iter().enumerate() {
info!(
"running enhanced gc bottom-most compaction, sub-compaction {}/{}",
idx + 1,
jobs_len
);
let sub_compaction_progress = format!("{}/{}", idx + 1, jobs_len);
self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
.instrument(info_span!(
"sub_compaction",
sub_compaction_progress = sub_compaction_progress
))
.await?;
}
if jobs_len == 0 {
@@ -3185,7 +3197,10 @@ impl Timeline {
// 1. If a layer is in the selection, all layers below it are in the selection.
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
let job_desc = {
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GarbageCollection)
.await;
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
let mut retain_lsns_below_horizon = Vec::new();
@@ -3956,7 +3971,10 @@ impl Timeline {
// First, do a sanity check to ensure the newly-created layer map does not contain overlaps.
let all_layers = {
let guard = self.layers.read().await;
let guard = self
.layers
.read(LayerManagerLockHolder::GarbageCollection)
.await;
let layer_map = guard.layer_map()?;
layer_map.iter_historic_layers().collect_vec()
};
@@ -4020,7 +4038,10 @@ impl Timeline {
let update_guard = self.gc_compaction_layer_update_lock.write().await;
// Acquiring the update guard ensures current read operations end and new read operations are blocked.
// TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect?
let mut guard = self.layers.write().await;
let mut guard = self
.layers
.write(LayerManagerLockHolder::GarbageCollection)
.await;
guard
.open_mut()?
.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics);
@@ -4088,7 +4109,11 @@ impl TimelineAdaptor {
pub async fn flush_updates(&mut self) -> Result<(), CompactionError> {
let layers_to_delete = {
let guard = self.timeline.layers.read().await;
let guard = self
.timeline
.layers
.read(LayerManagerLockHolder::Compaction)
.await;
self.layers_to_delete
.iter()
.map(|x| guard.get_from_desc(x))
@@ -4133,7 +4158,11 @@ impl CompactionJobExecutor for TimelineAdaptor {
) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
self.flush_updates().await?;
let guard = self.timeline.layers.read().await;
let guard = self
.timeline
.layers
.read(LayerManagerLockHolder::Compaction)
.await;
let layer_map = guard.layer_map()?;
let result = layer_map
@@ -4172,7 +4201,11 @@ impl CompactionJobExecutor for TimelineAdaptor {
// this is a lot more complex than a simple downcast...
if layer.is_delta() {
let l = {
let guard = self.timeline.layers.read().await;
let guard = self
.timeline
.layers
.read(LayerManagerLockHolder::Compaction)
.await;
guard.get_from_desc(layer)
};
let result = l.download_and_keep_resident(ctx).await?;

View File

@@ -19,7 +19,7 @@ use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::gate::GateError;
use super::layer_manager::LayerManager;
use super::layer_manager::{LayerManager, LayerManagerLockHolder};
use super::{FlushLayerError, Timeline};
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::TaskKind;
@@ -199,7 +199,10 @@ pub(crate) async fn generate_tombstone_image_layer(
let image_lsn = ancestor_lsn;
{
let layers = detached.layers.read().await;
let layers = detached
.layers
.read(LayerManagerLockHolder::DetachAncestor)
.await;
for layer in layers.all_persistent_layers() {
if !layer.is_delta
&& layer.lsn_range.start == image_lsn
@@ -423,7 +426,7 @@ pub(super) async fn prepare(
// we do not need to start from our layers, because they can only be layers that come
// *after* ancestor_lsn
let layers = tokio::select! {
guard = ancestor.layers.read() => guard,
guard = ancestor.layers.read(LayerManagerLockHolder::DetachAncestor) => guard,
_ = detached.cancel.cancelled() => {
return Err(ShuttingDown);
}
@@ -869,7 +872,12 @@ async fn remote_copy(
// Double check that the file is orphan (probably from an earlier attempt), then delete it
let key = file_name.clone().into();
if adoptee.layers.read().await.contains_key(&key) {
if adoptee
.layers
.read(LayerManagerLockHolder::DetachAncestor)
.await
.contains_key(&key)
{
// We are supposed to filter out such cases before coming to this function
return Err(Error::Prepare(anyhow::anyhow!(
"layer file {file_name} already present and inside layer map"

View File

@@ -33,6 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError;
use crate::tenant::storage_layer::LayerVisibilityHint;
use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random};
use crate::tenant::timeline::EvictionError;
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
use crate::tenant::{LogicalSizeCalculationCause, TenantShard};
#[derive(Default)]
@@ -208,7 +209,7 @@ impl Timeline {
let mut js = tokio::task::JoinSet::new();
{
let guard = self.layers.read().await;
let guard = self.layers.read(LayerManagerLockHolder::Eviction).await;
guard
.likely_resident_layers()

View File

@@ -15,6 +15,7 @@ use super::{Timeline, TimelineDeleteProgress};
use crate::context::RequestContext;
use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
mod flow;
mod importbucket_client;
@@ -163,7 +164,10 @@ async fn prepare_import(
info!("wipe the slate clean");
{
// TODO: do we need to hold GC lock for this?
let mut guard = timeline.layers.write().await;
let mut guard = timeline
.layers
.write(LayerManagerLockHolder::ImportPgData)
.await;
assert!(
guard.layer_map()?.open_layer.is_none(),
"while importing, there should be no in-memory layer" // this just seems like a good place to assert it

View File

@@ -36,8 +36,8 @@ use pageserver_api::keyspace::{ShardedRange, singleton_range};
use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus};
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::BLCKSZ;
use postgres_ffi::relfile_utils::parse_relfilename;
use postgres_ffi::{BLCKSZ, pg_constants};
use remote_storage::RemotePath;
use tokio::sync::Semaphore;
use tokio_stream::StreamExt;
@@ -56,6 +56,7 @@ use crate::pgdatadir_mapping::{
};
use crate::task_mgr::TaskKind;
use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer};
use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
pub async fn run(
timeline: Arc<Timeline>,
@@ -557,7 +558,7 @@ impl PgDataDir {
PgDataDirDb::new(
storage,
&basedir.join(dboid.to_string()),
pg_constants::DEFAULTTABLESPACE_OID,
postgres_ffi_types::constants::DEFAULTTABLESPACE_OID,
dboid,
&datadir_path,
)
@@ -570,7 +571,7 @@ impl PgDataDir {
PgDataDirDb::new(
storage,
&datadir_path.join("global"),
postgres_ffi::pg_constants::GLOBALTABLESPACE_OID,
postgres_ffi_types::constants::GLOBALTABLESPACE_OID,
0,
&datadir_path,
)
@@ -984,7 +985,10 @@ impl ChunkProcessingJob {
let (desc, path) = writer.finish(ctx).await?;
{
let guard = timeline.layers.read().await;
let guard = timeline
.layers
.read(LayerManagerLockHolder::ImportPgData)
.await;
let existing_layer = guard.try_get_from_key(&desc.key());
if let Some(layer) = existing_layer {
if layer.metadata().generation == timeline.generation {
@@ -1007,7 +1011,10 @@ impl ChunkProcessingJob {
// certain that the existing layer is identical to the new one, so in that case
// we replace the old layer with the one we just generated.
let mut guard = timeline.layers.write().await;
let mut guard = timeline
.layers
.write(LayerManagerLockHolder::ImportPgData)
.await;
let existing_layer = guard
.try_get_from_key(&resident_layer.layer_desc().key())
@@ -1036,7 +1043,7 @@ impl ChunkProcessingJob {
}
}
crate::tenant::timeline::drop_wlock(guard);
crate::tenant::timeline::drop_layer_manager_wlock(guard);
timeline
.remote_client

View File

@@ -3,7 +3,7 @@ use std::sync::Arc;
use anyhow::Context;
use bytes::Bytes;
use postgres_ffi::ControlFileData;
use postgres_ffi::{ControlFileData, PgMajorVersion};
use remote_storage::{
Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing,
ListingObject, RemotePath, RemoteStorageConfig,
@@ -264,7 +264,7 @@ impl ControlFile {
pub(crate) fn base_lsn(&self) -> Lsn {
Lsn(self.control_file_data.checkPoint).align()
}
pub(crate) fn pg_version(&self) -> u32 {
pub(crate) fn pg_version(&self) -> PgMajorVersion {
self.try_pg_version()
.expect("prepare() checks that try_pg_version doesn't error")
}
@@ -274,13 +274,14 @@ impl ControlFile {
pub(crate) fn control_file_buf(&self) -> &Bytes {
&self.control_file_buf
}
fn try_pg_version(&self) -> anyhow::Result<u32> {
fn try_pg_version(&self) -> anyhow::Result<PgMajorVersion> {
Ok(match self.control_file_data.catalog_version_no {
// thesea are from catversion.h
202107181 => 14,
202209061 => 15,
202307071 => 16,
202406281 => 17,
202107181 => PgMajorVersion::PG14,
202209061 => PgMajorVersion::PG15,
202307071 => PgMajorVersion::PG16,
202406281 => PgMajorVersion::PG17,
catversion => {
anyhow::bail!("unrecognized catalog version {catversion}")
}

View File

@@ -1,5 +1,8 @@
use std::collections::HashMap;
use std::mem::ManuallyDrop;
use std::ops::{Deref, DerefMut};
use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, bail, ensure};
use itertools::Itertools;
@@ -20,6 +23,155 @@ use crate::tenant::storage_layer::{
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
};
/// Warn if the lock was held for longer than this threshold.
/// It's very generous and we should bring this value down over time.
const LAYER_MANAGER_LOCK_WARN_THRESHOLD: Duration = Duration::from_secs(5);
const LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD: Duration = Duration::from_secs(30);
/// Describes the operation that is holding the layer manager lock
#[derive(Debug, Clone, Copy, strum_macros::Display)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum LayerManagerLockHolder {
GetLayerMapInfo,
GenerateHeatmap,
GetPage,
Init,
LoadLayerMap,
GetLayerForWrite,
TryFreezeLayer,
FlushFrozenLayer,
FlushLoop,
Compaction,
GarbageCollection,
Shutdown,
ImportPgData,
DetachAncestor,
Eviction,
#[cfg(test)]
Testing,
}
/// Wrapper for the layer manager that tracks the amount of time during which
/// it was held under read or write lock
#[derive(Default)]
pub(crate) struct LockedLayerManager {
locked: tokio::sync::RwLock<LayerManager>,
}
pub(crate) struct LayerManagerReadGuard<'a> {
guard: ManuallyDrop<tokio::sync::RwLockReadGuard<'a, LayerManager>>,
acquired_at: std::time::Instant,
holder: LayerManagerLockHolder,
}
pub(crate) struct LayerManagerWriteGuard<'a> {
guard: ManuallyDrop<tokio::sync::RwLockWriteGuard<'a, LayerManager>>,
acquired_at: std::time::Instant,
holder: LayerManagerLockHolder,
}
impl Drop for LayerManagerReadGuard<'_> {
fn drop(&mut self) {
// Drop the lock first, before potentially warning if it was held for too long.
// SAFETY: ManuallyDrop in Drop implementation
unsafe { ManuallyDrop::drop(&mut self.guard) };
let held_for = self.acquired_at.elapsed();
if held_for >= LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD {
tracing::warn!(
holder=%self.holder,
"Layer manager read lock held for {}s",
held_for.as_secs_f64(),
);
}
}
}
impl Drop for LayerManagerWriteGuard<'_> {
fn drop(&mut self) {
// Drop the lock first, before potentially warning if it was held for too long.
// SAFETY: ManuallyDrop in Drop implementation
unsafe { ManuallyDrop::drop(&mut self.guard) };
let held_for = self.acquired_at.elapsed();
if held_for >= LAYER_MANAGER_LOCK_WARN_THRESHOLD {
tracing::warn!(
holder=%self.holder,
"Layer manager write lock held for {}s",
held_for.as_secs_f64(),
);
}
}
}
impl Deref for LayerManagerReadGuard<'_> {
type Target = LayerManager;
fn deref(&self) -> &Self::Target {
self.guard.deref()
}
}
impl Deref for LayerManagerWriteGuard<'_> {
type Target = LayerManager;
fn deref(&self) -> &Self::Target {
self.guard.deref()
}
}
impl DerefMut for LayerManagerWriteGuard<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.guard.deref_mut()
}
}
impl LockedLayerManager {
pub(crate) async fn read(&self, holder: LayerManagerLockHolder) -> LayerManagerReadGuard {
let guard = ManuallyDrop::new(self.locked.read().await);
LayerManagerReadGuard {
guard,
acquired_at: std::time::Instant::now(),
holder,
}
}
pub(crate) fn try_read(
&self,
holder: LayerManagerLockHolder,
) -> Result<LayerManagerReadGuard, tokio::sync::TryLockError> {
let guard = ManuallyDrop::new(self.locked.try_read()?);
Ok(LayerManagerReadGuard {
guard,
acquired_at: std::time::Instant::now(),
holder,
})
}
pub(crate) async fn write(&self, holder: LayerManagerLockHolder) -> LayerManagerWriteGuard {
let guard = ManuallyDrop::new(self.locked.write().await);
LayerManagerWriteGuard {
guard,
acquired_at: std::time::Instant::now(),
holder,
}
}
pub(crate) fn try_write(
&self,
holder: LayerManagerLockHolder,
) -> Result<LayerManagerWriteGuard, tokio::sync::TryLockError> {
let guard = ManuallyDrop::new(self.locked.try_write()?);
Ok(LayerManagerWriteGuard {
guard,
acquired_at: std::time::Instant::now(),
holder,
})
}
}
/// Provides semantic APIs to manipulate the layer map.
pub(crate) enum LayerManager {
/// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate

View File

@@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection(
let copy_stream = replication_client.copy_both_simple(&query).await?;
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
.await
.map_err(|e| match e.kind {
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
_ => WalReceiverError::Other(e.into()),
})?;
let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx);
let walingest_res = select! {
walingest_res = walingest_future => walingest_res,
_ = cancellation.cancelled() => {
// We are doing reads in WalIngest::new, and those can hang as they come from the network.
// Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
debug!("Connection cancelled");
return Err(WalReceiverError::Cancelled);
},
};
let mut walingest = walingest_res.map_err(|e| match e.kind {
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
_ => WalReceiverError::Other(e.into()),
})?;
let (format, compression) = match protocol {
PostgresClientProtocol::Interpreted {
@@ -360,8 +368,7 @@ pub(super) async fn handle_walreceiver_connection(
match raw_wal_start_lsn.cmp(&expected_wal_start) {
std::cmp::Ordering::Greater => {
let msg = format!(
"Gap in streamed WAL: [{}, {})",
expected_wal_start, raw_wal_start_lsn
"Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn})"
);
critical!("{msg}");
return Err(WalReceiverError::Other(anyhow!(msg)));

View File

@@ -68,16 +68,9 @@ impl<A: Alignment> AlignedBuffer<A> {
assert!(
begin <= end,
"range start must not be greater than end: {:?} <= {:?}",
begin,
end,
);
assert!(
end <= len,
"range end out of bounds: {:?} <= {:?}",
end,
len,
"range start must not be greater than end: {begin:?} <= {end:?}",
);
assert!(end <= len, "range end out of bounds: {end:?} <= {len:?}",);
let begin = self.range.start + begin;
let end = self.range.start + end;

View File

@@ -242,10 +242,7 @@ unsafe impl<A: Alignment> bytes::BufMut for AlignedBufferMut<A> {
/// Panic with a nice error message.
#[cold]
fn panic_advance(idx: usize, len: usize) -> ! {
panic!(
"advance out of bounds: the len is {} but advancing by {}",
len, idx
);
panic!("advance out of bounds: the len is {len} but advancing by {idx}");
}
/// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer,

View File

@@ -28,20 +28,20 @@ use std::time::{Duration, Instant, SystemTime};
use bytes::{Buf, Bytes};
use pageserver_api::key::{Key, rel_block_to_key};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::walrecord::*;
use postgres_ffi::{
TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch,
fsm_logical_to_physical, pg_constants,
PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion,
enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants,
};
use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use tracing::*;
use utils::bin_ser::{DeserializeError, SerializeError};
use utils::lsn::Lsn;
use utils::rate_limit::RateLimit;
use utils::{critical, failpoint_support};
use wal_decoder::models::record::NeonWalRecord;
use wal_decoder::models::*;
use crate::ZERO_PAGE;
@@ -781,7 +781,7 @@ impl WalIngest {
) -> Result<(), WalIngestError> {
let (xact_common, is_commit, is_prepared) = match record {
XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
let xid: u64 = if modification.tline.pg_version >= 17 {
let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
self.adjust_to_full_transaction_id(xl_xid)?
} else {
xl_xid as u64
@@ -886,7 +886,7 @@ impl WalIngest {
xl_xid, parsed.xid, lsn,
);
let xid: u64 = if modification.tline.pg_version >= 17 {
let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
self.adjust_to_full_transaction_id(parsed.xid)?
} else {
parsed.xid as u64
@@ -1241,7 +1241,7 @@ impl WalIngest {
if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
&& info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
let oldest_active_xid = if pg_version >= 17 {
let oldest_active_xid = if pg_version >= PgMajorVersion::PG17 {
let mut oldest_active_full_xid = cp.nextXid.value;
for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
if xid < oldest_active_full_xid {
@@ -1475,10 +1475,11 @@ impl WalIngest {
const fn rate_limiter(
&self,
pg_version: u32,
pg_version: PgMajorVersion,
) -> Option<&Lazy<Mutex<RateLimit>>> {
const MIN_PG_VERSION: u32 = 14;
const MAX_PG_VERSION: u32 = 17;
const MIN_PG_VERSION: u32 = PgMajorVersion::PG14.major_version_num();
const MAX_PG_VERSION: u32 = PgMajorVersion::PG17.major_version_num();
let pg_version = pg_version.major_version_num();
if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION {
return None;
@@ -1603,6 +1604,7 @@ async fn get_relsize(
#[cfg(test)]
mod tests {
use anyhow::Result;
use postgres_ffi::PgMajorVersion;
use postgres_ffi::RELSEG_SIZE;
use super::*;
@@ -1625,7 +1627,7 @@ mod tests {
#[tokio::test]
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
for i in 14..=16 {
for i in PgMajorVersion::ALL {
dispatch_pgversion!(i, {
pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
});
@@ -2108,7 +2110,7 @@ mod tests {
// Check relation content
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
let data = format!("foo blk {blkno} at {lsn}");
assert_eq!(
tline
.get_rel_page_at_lsn(
@@ -2142,7 +2144,7 @@ mod tests {
for blkno in 0..1 {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
let data = format!("foo blk {blkno} at {lsn}");
assert_eq!(
tline
.get_rel_page_at_lsn(
@@ -2167,7 +2169,7 @@ mod tests {
);
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
let data = format!("foo blk {blkno} at {lsn}");
assert_eq!(
tline
.get_rel_page_at_lsn(
@@ -2188,7 +2190,7 @@ mod tests {
let lsn = Lsn(0x80);
let mut m = tline.begin_modification(lsn);
for blkno in 0..relsize {
let data = format!("foo blk {} at {}", blkno, lsn);
let data = format!("foo blk {blkno} at {lsn}");
walingest
.put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
.await?;
@@ -2210,7 +2212,7 @@ mod tests {
// Check relation content
for blkno in 0..relsize {
let lsn = Lsn(0x80);
let data = format!("foo blk {} at {}", blkno, lsn);
let data = format!("foo blk {blkno} at {lsn}");
assert_eq!(
tline
.get_rel_page_at_lsn(
@@ -2335,7 +2337,7 @@ mod tests {
// 5. Grep sk logs for "restart decoder" to get startpoint
// 6. Run just the decoder from this test to get the endpoint.
// It's the last LSN the decoder will output.
let pg_version = 15; // The test data was generated by pg15
let pg_version = PgMajorVersion::PG15; // The test data was generated by pg15
let path = "test_data/sk_wal_segment_from_pgbench";
let wal_segment_path = format!("{path}/000000010000000000000001.zst");
let source_initdb_path = format!("{path}/{INITDB_PATH}");
@@ -2414,6 +2416,6 @@ mod tests {
}
let duration = started_at.elapsed();
println!("done in {:?}", duration);
println!("done in {duration:?}");
}
}

View File

@@ -32,12 +32,13 @@ use anyhow::Context;
use bytes::{Bytes, BytesMut};
use pageserver_api::key::Key;
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::shard::TenantShardId;
use postgres_ffi::PgMajorVersion;
use tracing::*;
use utils::lsn::Lsn;
use utils::sync::gate::GateError;
use utils::sync::heavier_once_cell;
use wal_decoder::models::record::NeonWalRecord;
use crate::config::PageServerConf;
use crate::metrics::{
@@ -165,7 +166,7 @@ impl PostgresRedoManager {
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
pg_version: PgMajorVersion,
redo_attempt_type: RedoAttemptType,
) -> Result<Bytes, Error> {
if records.is_empty() {
@@ -232,7 +233,7 @@ impl PostgresRedoManager {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
pub async fn ping(&self, pg_version: PgMajorVersion) -> Result<(), Error> {
self.do_with_walredo_process(pg_version, |proc| async move {
proc.ping(Duration::from_secs(1))
.await
@@ -342,7 +343,7 @@ impl PostgresRedoManager {
O,
>(
&self,
pg_version: u32,
pg_version: PgMajorVersion,
closure: F,
) -> Result<O, Error> {
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
@@ -442,7 +443,7 @@ impl PostgresRedoManager {
base_img_lsn: Lsn,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
pg_version: u32,
pg_version: PgMajorVersion,
max_retry_attempts: u32,
) -> Result<Bytes, Error> {
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
@@ -571,11 +572,12 @@ mod tests {
use bytes::Bytes;
use pageserver_api::key::Key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::shard::TenantShardId;
use postgres_ffi::PgMajorVersion;
use tracing::Instrument;
use utils::id::TenantId;
use utils::lsn::Lsn;
use wal_decoder::models::record::NeonWalRecord;
use super::PostgresRedoManager;
use crate::config::PageServerConf;
@@ -586,7 +588,7 @@ mod tests {
let h = RedoHarness::new().unwrap();
h.manager
.ping(14)
.ping(PgMajorVersion::PG14)
.instrument(h.span())
.await
.expect("ping should work");
@@ -612,7 +614,7 @@ mod tests {
Lsn::from_str("0/16E2408").unwrap(),
None,
short_records(),
14,
PgMajorVersion::PG14,
RedoAttemptType::ReadPage,
)
.instrument(h.span())
@@ -641,7 +643,7 @@ mod tests {
Lsn::from_str("0/16E2408").unwrap(),
None,
short_records(),
14,
PgMajorVersion::PG14,
RedoAttemptType::ReadPage,
)
.instrument(h.span())
@@ -663,7 +665,7 @@ mod tests {
Lsn::INVALID,
None,
short_records(),
16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
PgMajorVersion::PG16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
RedoAttemptType::ReadPage,
)
.instrument(h.span())

View File

@@ -2,16 +2,16 @@ use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use pageserver_api::key::Key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
use postgres_ffi::v14::nonrelfile_utils::{
mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
transaction_id_set_status,
};
use postgres_ffi::{BLCKSZ, pg_constants};
use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM;
use tracing::*;
use utils::lsn::Lsn;
use wal_decoder::models::record::NeonWalRecord;
/// Can this request be served by neon redo functions
/// or we need to pass it to wal-redo postgres process?
@@ -52,8 +52,7 @@ pub(crate) fn apply_in_neon(
let (rel, _) = key.to_rel_block().context("invalid record")?;
assert!(
rel.forknum == VISIBILITYMAP_FORKNUM,
"TruncateVisibilityMap record on unexpected rel {}",
rel
"TruncateVisibilityMap record on unexpected rel {rel}"
);
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
map[*trunc_byte + 1..].fill(0u8);
@@ -78,8 +77,7 @@ pub(crate) fn apply_in_neon(
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
assert!(
rel.forknum == VISIBILITYMAP_FORKNUM,
"ClearVisibilityMapFlags record on unexpected rel {}",
rel
"ClearVisibilityMapFlags record on unexpected rel {rel}"
);
if let Some(heap_blkno) = *new_heap_blkno {
// Calculate the VM block and offset that corresponds to the heap block.
@@ -124,8 +122,7 @@ pub(crate) fn apply_in_neon(
assert_eq!(
slru_kind,
SlruKind::Clog,
"ClogSetCommitted record with unexpected key {}",
key
"ClogSetCommitted record with unexpected key {key}"
);
for &xid in xids {
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
@@ -135,15 +132,11 @@ pub(crate) fn apply_in_neon(
// Check that we're modifying the correct CLOG block.
assert!(
segno == expected_segno,
"ClogSetCommitted record for XID {} with unexpected key {}",
xid,
key
"ClogSetCommitted record for XID {xid} with unexpected key {key}"
);
assert!(
blknum == expected_blknum,
"ClogSetCommitted record for XID {} with unexpected key {}",
xid,
key
"ClogSetCommitted record for XID {xid} with unexpected key {key}"
);
transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page);
@@ -169,8 +162,7 @@ pub(crate) fn apply_in_neon(
assert_eq!(
slru_kind,
SlruKind::Clog,
"ClogSetAborted record with unexpected key {}",
key
"ClogSetAborted record with unexpected key {key}"
);
for &xid in xids {
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
@@ -180,15 +172,11 @@ pub(crate) fn apply_in_neon(
// Check that we're modifying the correct CLOG block.
assert!(
segno == expected_segno,
"ClogSetAborted record for XID {} with unexpected key {}",
xid,
key
"ClogSetAborted record for XID {xid} with unexpected key {key}"
);
assert!(
blknum == expected_blknum,
"ClogSetAborted record for XID {} with unexpected key {}",
xid,
key
"ClogSetAborted record for XID {xid} with unexpected key {key}"
);
transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
@@ -199,8 +187,7 @@ pub(crate) fn apply_in_neon(
assert_eq!(
slru_kind,
SlruKind::MultiXactOffsets,
"MultixactOffsetCreate record with unexpected key {}",
key
"MultixactOffsetCreate record with unexpected key {key}"
);
// Compute the block and offset to modify.
// See RecordNewMultiXact in PostgreSQL sources.
@@ -213,15 +200,11 @@ pub(crate) fn apply_in_neon(
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
assert!(
segno == expected_segno,
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
mid,
key
"MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}"
);
assert!(
blknum == expected_blknum,
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
mid,
key
"MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}"
);
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
@@ -231,8 +214,7 @@ pub(crate) fn apply_in_neon(
assert_eq!(
slru_kind,
SlruKind::MultiXactMembers,
"MultixactMembersCreate record with unexpected key {}",
key
"MultixactMembersCreate record with unexpected key {key}"
);
for (i, member) in members.iter().enumerate() {
let offset = moff + i as u32;
@@ -249,15 +231,11 @@ pub(crate) fn apply_in_neon(
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
assert!(
segno == expected_segno,
"MultiXactMembersCreate record for offset {} with unexpected key {}",
moff,
key
"MultiXactMembersCreate record for offset {moff} with unexpected key {key}"
);
assert!(
blknum == expected_blknum,
"MultiXactMembersCreate record for offset {} with unexpected key {}",
moff,
key
"MultiXactMembersCreate record for offset {moff} with unexpected key {key}"
);
let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);

View File

@@ -10,14 +10,14 @@ use std::time::Duration;
use anyhow::Context;
use bytes::Bytes;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::TenantShardId;
use postgres_ffi::BLCKSZ;
use postgres_ffi::{BLCKSZ, PgMajorVersion};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tracing::{Instrument, debug, error, instrument};
use utils::lsn::Lsn;
use utils::poison::Poison;
use wal_decoder::models::record::NeonWalRecord;
use self::no_leak_child::NoLeakChild;
use crate::config::PageServerConf;
@@ -54,11 +54,11 @@ impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
#[instrument(skip_all,fields(pg_version=pg_version.major_version_num()))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
pg_version: PgMajorVersion,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();