mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-27 23:30:38 +00:00
Compare commits
12 Commits
basebackup
...
skyzh/immu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1863ae799d | ||
|
|
20fe57d93b | ||
|
|
0fad5e21ce | ||
|
|
a2056666ae | ||
|
|
a3909e03f8 | ||
|
|
fc190a2a19 | ||
|
|
faee3152f3 | ||
|
|
3693d1f431 | ||
|
|
fdf7a67ed2 | ||
|
|
1299df87d2 | ||
|
|
754ceaefac | ||
|
|
143fa0da42 |
30
Cargo.lock
generated
30
Cargo.lock
generated
@@ -110,6 +110,12 @@ dependencies = [
|
||||
"backtrace",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arc-swap"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
|
||||
|
||||
[[package]]
|
||||
name = "archery"
|
||||
version = "0.5.0"
|
||||
@@ -604,7 +610,7 @@ dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide 0.6.2",
|
||||
"miniz_oxide",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
]
|
||||
@@ -917,7 +923,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap 4.3.0",
|
||||
"compute_api",
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper",
|
||||
"notify",
|
||||
@@ -1400,16 +1405,6 @@ version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
@@ -2200,15 +2195,6 @@ dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
|
||||
dependencies = [
|
||||
"adler",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "0.8.6"
|
||||
@@ -2562,6 +2548,7 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -2578,7 +2565,6 @@ dependencies = [
|
||||
"enum-map",
|
||||
"enumset",
|
||||
"fail",
|
||||
"flate2",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
|
||||
@@ -32,7 +32,7 @@ license = "Apache-2.0"
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
flate2 = "1.0.26"
|
||||
arc-swap = "1.6"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
atty = "0.2.14"
|
||||
|
||||
@@ -6,7 +6,6 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
flate2.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
|
||||
@@ -15,7 +15,6 @@ use utils::lsn::Lsn;
|
||||
|
||||
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use crate::config;
|
||||
use crate::pg_helpers::*;
|
||||
@@ -180,21 +179,16 @@ impl ComputeNode {
|
||||
_ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
|
||||
};
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut measured_reader = MeasuredReader::new(copyreader);
|
||||
let mut decoder = flate2::read::GzDecoder::new(&mut measured_reader);
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(&mut decoder);
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
|
||||
// Report metrics
|
||||
self.state.lock().unwrap().metrics.basebackup_bytes =
|
||||
measured_reader.get_byte_count() as u64;
|
||||
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
@@ -376,11 +370,6 @@ impl ComputeNode {
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -433,22 +422,22 @@ impl ComputeNode {
|
||||
#[instrument(skip(self))]
|
||||
pub fn start_compute(&self) -> Result<std::process::Child> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
||||
spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
|
||||
spec.spec.operation_uuid.as_deref().unwrap_or("None"),
|
||||
spec.tenant_id,
|
||||
spec.timeline_id,
|
||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
|
||||
pspec.tenant_id,
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
self.prepare_pgdata(&compute_state)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
|
||||
let pg = self.start_postgres(spec.storage_auth_token.clone())?;
|
||||
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
|
||||
if spec.spec.mode == ComputeMode::Primary && !spec.spec.skip_pg_catalog_updates {
|
||||
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
||||
self.apply_config(&compute_state)?;
|
||||
}
|
||||
|
||||
@@ -468,6 +457,11 @@ impl ComputeNode {
|
||||
}
|
||||
self.set_status(ComputeStatus::Running);
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
||||
);
|
||||
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
|
||||
@@ -71,7 +71,6 @@ pub struct ComputeMetrics {
|
||||
pub wait_for_spec_ms: u64,
|
||||
pub sync_safekeepers_ms: u64,
|
||||
pub basebackup_ms: u64,
|
||||
pub basebackup_bytes: u64,
|
||||
pub config_ms: u64,
|
||||
pub total_startup_ms: u64,
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use pin_project_lite::pin_project;
|
||||
use std::io::Read;
|
||||
use std::pin::Pin;
|
||||
use std::{io, task};
|
||||
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
|
||||
@@ -76,34 +75,3 @@ impl<S: AsyncWrite + Unpin, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S,
|
||||
self.project().stream.poll_shutdown(context)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for a reader that counts bytes read.
|
||||
///
|
||||
/// Similar to MeasuredStream but it's one way and it's sync
|
||||
pub struct MeasuredReader<R: Read> {
|
||||
inner: R,
|
||||
byte_count: usize,
|
||||
}
|
||||
|
||||
impl<R: Read> MeasuredReader<R> {
|
||||
pub fn new(reader: R) -> Self {
|
||||
Self {
|
||||
inner: reader,
|
||||
byte_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_byte_count(&self) -> usize {
|
||||
self.byte_count
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Read for MeasuredReader<R> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
let result = self.inner.read(buf);
|
||||
if let Ok(n_bytes) = result {
|
||||
self.byte_count += n_bytes
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
flate2.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
|
||||
@@ -1,22 +1,23 @@
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
|
||||
use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
|
||||
use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use std::cmp::{max, min};
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
||||
let mut layer_map = LayerMap::<LayerDescriptor>::default();
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
|
||||
let mut min_lsn = Lsn(u64::MAX);
|
||||
let mut max_lsn = Lsn(0);
|
||||
@@ -33,7 +34,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
||||
min_lsn = min(min_lsn, lsn_range.start);
|
||||
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
|
||||
updates.insert_historic(layer.layer_desc().clone());
|
||||
}
|
||||
|
||||
println!("min: {min_lsn}, max: {max_lsn}");
|
||||
@@ -43,7 +44,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
||||
}
|
||||
|
||||
/// Construct a layer map query pattern for benchmarks
|
||||
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
|
||||
fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
|
||||
// For each image layer we query one of the pages contained, at LSN right
|
||||
// before the image layer was created. This gives us a somewhat uniform
|
||||
// coverage of both the lsn and key space because image layers have
|
||||
@@ -69,7 +70,7 @@ fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn
|
||||
|
||||
// Construct a partitioning for testing get_difficulty map when we
|
||||
// don't have an exact result of `collect_keyspace` to work with.
|
||||
fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
|
||||
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
|
||||
let mut parts = Vec::new();
|
||||
|
||||
// We add a partition boundary at the start of each image layer,
|
||||
@@ -209,13 +210,15 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
for i in 0..100_000 {
|
||||
let i32 = (i as u32) % 100;
|
||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||
let layer = LayerDescriptor {
|
||||
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
lsn: Lsn(i)..Lsn(i + 1),
|
||||
is_incremental: false,
|
||||
short_id: format!("Layer {}", i),
|
||||
};
|
||||
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
|
||||
let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
|
||||
TenantId::generate(),
|
||||
TimelineId::generate(),
|
||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
Lsn(i),
|
||||
false,
|
||||
0,
|
||||
));
|
||||
updates.insert_historic(layer.layer_desc().clone());
|
||||
}
|
||||
updates.flush();
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
@@ -516,7 +516,7 @@ async fn collect_eviction_candidates(
|
||||
if !tl.is_active() {
|
||||
continue;
|
||||
}
|
||||
let info = tl.get_local_layers_for_disk_usage_eviction();
|
||||
let info = tl.get_local_layers_for_disk_usage_eviction().await;
|
||||
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
|
||||
tenant_candidates.extend(
|
||||
info.resident_layers
|
||||
|
||||
@@ -215,7 +215,7 @@ async fn build_timeline_info(
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let mut info = build_timeline_info_common(timeline, ctx)?;
|
||||
let mut info = build_timeline_info_common(timeline, ctx).await?;
|
||||
if include_non_incremental_logical_size {
|
||||
// XXX we should be using spawn_ondemand_logical_size_calculation here.
|
||||
// Otherwise, if someone deletes the timeline / detaches the tenant while
|
||||
@@ -233,7 +233,7 @@ async fn build_timeline_info(
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
fn build_timeline_info_common(
|
||||
async fn build_timeline_info_common(
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
@@ -264,7 +264,7 @@ fn build_timeline_info_common(
|
||||
None
|
||||
}
|
||||
};
|
||||
let current_physical_size = Some(timeline.layer_size_sum());
|
||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
@@ -330,6 +330,7 @@ async fn timeline_create_handler(
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
@@ -591,7 +592,7 @@ async fn tenant_status(
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
current_physical_size += timeline.layer_size_sum();
|
||||
current_physical_size += timeline.layer_size_sum().await;
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
@@ -701,7 +702,7 @@ async fn layer_map_info_handler(
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let layer_map_info = timeline.layer_map_info(reset);
|
||||
let layer_map_info = timeline.layer_map_info(reset).await;
|
||||
|
||||
json_response(StatusCode::OK, layer_map_info)
|
||||
}
|
||||
|
||||
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
{
|
||||
pg_control = Some(control_file);
|
||||
}
|
||||
modification.flush()?;
|
||||
modification.flush().await?;
|
||||
}
|
||||
}
|
||||
|
||||
// We're done importing all the data files.
|
||||
modification.commit()?;
|
||||
modification.commit().await?;
|
||||
|
||||
// We expect the Postgres server to be shut down cleanly.
|
||||
let pg_control = pg_control.context("pg_control file not found")?;
|
||||
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
|
||||
// We found the pg_control file.
|
||||
pg_control = Some(res);
|
||||
}
|
||||
modification.flush()?;
|
||||
modification.flush().await?;
|
||||
}
|
||||
tokio_tar::EntryType::Directory => {
|
||||
debug!("directory {:?}", file_path);
|
||||
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
|
||||
// sanity check: ensure that pg_control is loaded
|
||||
let _pg_control = pg_control.context("pg_control file not found")?;
|
||||
|
||||
modification.commit()?;
|
||||
modification.commit().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -594,7 +594,7 @@ async fn import_file(
|
||||
// zenith.signal is not necessarily the last file, that we handle
|
||||
// but it is ok to call `finish_write()`, because final `modification.commit()`
|
||||
// will update lsn once more to the final one.
|
||||
let writer = modification.tline.writer();
|
||||
let writer = modification.tline.writer().await;
|
||||
writer.finish_write(prev_lsn);
|
||||
|
||||
debug!("imported zenith signal {}", prev_lsn);
|
||||
|
||||
@@ -24,14 +24,14 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::io::{self, Write};
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::pin::pin;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -489,7 +489,9 @@ impl PageServerHandler {
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?;
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.await?;
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
// We might have some wal to import as well, and we should prevent compute
|
||||
@@ -773,9 +775,8 @@ impl PageServerHandler {
|
||||
// Send a tarball of the latest layer on the timeline
|
||||
{
|
||||
let mut writer = pgb.copyout_writer();
|
||||
let mut raw_tar = Vec::new();
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut raw_tar,
|
||||
&mut writer,
|
||||
&timeline,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
@@ -783,11 +784,6 @@ impl PageServerHandler {
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
let mut encoder =
|
||||
flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
|
||||
encoder.write_all(&raw_tar)?;
|
||||
let compressed_tar = encoder.finish()?;
|
||||
writer.write(&compressed_tar).await?;
|
||||
}
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||
|
||||
@@ -1122,7 +1122,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||
/// for bulk import, where you are just loading data pages and won't try to
|
||||
/// modify the same pages twice.
|
||||
pub fn flush(&mut self) -> anyhow::Result<()> {
|
||||
pub async fn flush(&mut self) -> anyhow::Result<()> {
|
||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||
// to scan through the pending_updates list.
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
@@ -1130,19 +1130,20 @@ impl<'a> DatadirModification<'a> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let writer = self.tline.writer();
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut result: anyhow::Result<()> = Ok(());
|
||||
self.pending_updates.retain(|&key, value| {
|
||||
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
|
||||
result = writer.put(key, self.lsn, value);
|
||||
false
|
||||
let mut retained_pending_updates = HashMap::new();
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
if is_rel_block_key(key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, self.lsn, &value).await?;
|
||||
} else {
|
||||
true
|
||||
retained_pending_updates.insert(key, value);
|
||||
}
|
||||
});
|
||||
result?;
|
||||
}
|
||||
self.pending_updates.extend(retained_pending_updates);
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1157,17 +1158,17 @@ impl<'a> DatadirModification<'a> {
|
||||
/// underlying timeline.
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub fn commit(&mut self) -> anyhow::Result<()> {
|
||||
let writer = self.tline.writer();
|
||||
pub async fn commit(&mut self) -> anyhow::Result<()> {
|
||||
let writer = self.tline.writer().await;
|
||||
let lsn = self.lsn;
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
writer.put(key, lsn, &value)?;
|
||||
writer.put(key, lsn, &value).await?;
|
||||
}
|
||||
for key_range in self.pending_deletions.drain(..) {
|
||||
writer.delete(key_range, lsn)?;
|
||||
writer.delete(key_range, lsn).await?;
|
||||
}
|
||||
|
||||
writer.finish_write(lsn);
|
||||
|
||||
@@ -85,7 +85,9 @@ pub mod blob_io;
|
||||
pub mod block_io;
|
||||
pub mod disk_btree;
|
||||
pub(crate) mod ephemeral_file;
|
||||
pub mod layer_cache;
|
||||
pub mod layer_map;
|
||||
pub mod layer_map_mgr;
|
||||
pub mod manifest;
|
||||
|
||||
pub mod metadata;
|
||||
@@ -519,6 +521,7 @@ impl Tenant {
|
||||
);
|
||||
timeline
|
||||
.load_layer_map(new_disk_consistent_lsn)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
@@ -554,16 +557,10 @@ impl Tenant {
|
||||
.context("failed to reconcile with remote")?
|
||||
}
|
||||
|
||||
let layers = timeline.layer_mgr.read();
|
||||
// Sanity check: a timeline should have some content.
|
||||
anyhow::ensure!(
|
||||
ancestor.is_some()
|
||||
|| timeline
|
||||
.layers
|
||||
.read()
|
||||
.unwrap()
|
||||
.iter_historic_layers()
|
||||
.next()
|
||||
.is_some(),
|
||||
ancestor.is_some() || layers.iter_historic_layers().next().is_some(),
|
||||
"Timeline has no ancestor and no layer files"
|
||||
);
|
||||
|
||||
@@ -1244,7 +1241,7 @@ impl Tenant {
|
||||
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
|
||||
/// minimum amount of keys required to get a writable timeline.
|
||||
/// (Without it, `put` might fail due to `repartition` failing.)
|
||||
pub fn create_empty_timeline(
|
||||
pub async fn create_empty_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
@@ -1256,9 +1253,11 @@ impl Tenant {
|
||||
"Cannot create empty timelines on inactive tenant"
|
||||
);
|
||||
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?;
|
||||
drop(timelines);
|
||||
let timeline_uninit_mark = {
|
||||
let timelines: MutexGuard<'_, HashMap<TimelineId, Arc<Timeline>>> =
|
||||
self.timelines.lock().unwrap();
|
||||
self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
|
||||
};
|
||||
|
||||
let new_metadata = TimelineMetadata::new(
|
||||
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
||||
@@ -1278,6 +1277,7 @@ impl Tenant {
|
||||
initdb_lsn,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Helper for unit tests to create an emtpy timeline.
|
||||
@@ -1293,7 +1293,9 @@ impl Tenant {
|
||||
pg_version: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?;
|
||||
let uninit_tl = self
|
||||
.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
|
||||
.await?;
|
||||
let tline = uninit_tl.raw_timeline().expect("we just created it");
|
||||
assert_eq!(tline.get_last_record_lsn(), Lsn(0));
|
||||
|
||||
@@ -1304,6 +1306,7 @@ impl Tenant {
|
||||
.context("init_empty_test_timeline")?;
|
||||
modification
|
||||
.commit()
|
||||
.await
|
||||
.context("commit init_empty_test_timeline modification")?;
|
||||
|
||||
// Flush to disk so that uninit_tl's check for valid disk_consistent_lsn passes.
|
||||
@@ -1556,7 +1559,7 @@ impl Tenant {
|
||||
// No timeout here, GC & Compaction should be responsive to the
|
||||
// `TimelineState::Stopping` change.
|
||||
info!("waiting for layer_removal_cs.lock()");
|
||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
let layer_removal_guard = timeline.lcache.delete_guard().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
|
||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||
@@ -2752,13 +2755,15 @@ impl Tenant {
|
||||
src_timeline.pg_version,
|
||||
);
|
||||
|
||||
let uninitialized_timeline = self.prepare_new_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
timeline_uninit_mark,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)?;
|
||||
let uninitialized_timeline = self
|
||||
.prepare_new_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
timeline_uninit_mark,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let new_timeline = uninitialized_timeline.finish_creation()?;
|
||||
|
||||
@@ -2836,13 +2841,15 @@ impl Tenant {
|
||||
pgdata_lsn,
|
||||
pg_version,
|
||||
);
|
||||
let raw_timeline = self.prepare_new_timeline(
|
||||
timeline_id,
|
||||
&new_metadata,
|
||||
timeline_uninit_mark,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
)?;
|
||||
let raw_timeline = self
|
||||
.prepare_new_timeline(
|
||||
timeline_id,
|
||||
&new_metadata,
|
||||
timeline_uninit_mark,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let tenant_id = raw_timeline.owning_tenant.tenant_id;
|
||||
let unfinished_timeline = raw_timeline.raw_timeline()?;
|
||||
@@ -2895,7 +2902,7 @@ impl Tenant {
|
||||
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
|
||||
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
|
||||
/// uninit mark file.
|
||||
fn prepare_new_timeline(
|
||||
async fn prepare_new_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
@@ -2922,7 +2929,7 @@ impl Tenant {
|
||||
.create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
|
||||
timeline_struct.init_empty_layer_map(start_lsn);
|
||||
timeline_struct.init_empty_layer_map(start_lsn).await?;
|
||||
|
||||
if let Err(e) =
|
||||
self.create_timeline_files(&uninit_mark.timeline_path, new_timeline_id, new_metadata)
|
||||
@@ -3580,13 +3587,17 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
|
||||
@@ -3615,7 +3626,10 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) {
|
||||
match tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("duplicate timeline creation should fail"),
|
||||
Err(e) => assert_eq!(
|
||||
e.to_string(),
|
||||
@@ -3647,7 +3661,7 @@ mod tests {
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let writer = tline.writer();
|
||||
let writer = tline.writer().await;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
|
||||
@@ -3655,13 +3669,21 @@ mod tests {
|
||||
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
|
||||
|
||||
// Insert a value on the timeline
|
||||
writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?;
|
||||
writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?;
|
||||
writer
|
||||
.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))
|
||||
.await?;
|
||||
writer
|
||||
.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
|
||||
writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?;
|
||||
writer
|
||||
.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x30));
|
||||
writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?;
|
||||
writer
|
||||
.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x40));
|
||||
|
||||
//assert_current_logical_size(&tline, Lsn(0x40));
|
||||
@@ -3673,8 +3695,10 @@ mod tests {
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
let new_writer = newtline.writer();
|
||||
new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
|
||||
let new_writer = newtline.writer().await;
|
||||
new_writer
|
||||
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))
|
||||
.await?;
|
||||
new_writer.finish_write(Lsn(0x40));
|
||||
|
||||
// Check page contents on both branches
|
||||
@@ -3700,38 +3724,46 @@ mod tests {
|
||||
let mut lsn = start_lsn;
|
||||
#[allow(non_snake_case)]
|
||||
{
|
||||
let writer = tline.writer();
|
||||
let writer = tline.writer().await;
|
||||
// Create a relation on the timeline
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
}
|
||||
tline.freeze_and_flush().await?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
}
|
||||
tline.freeze_and_flush().await
|
||||
@@ -4046,32 +4078,40 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
tline.freeze_and_flush().await?;
|
||||
tline.compact(&ctx).await?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
|
||||
tline.freeze_and_flush().await?;
|
||||
tline.compact(&ctx).await?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x30));
|
||||
drop(writer);
|
||||
|
||||
tline.freeze_and_flush().await?;
|
||||
tline.compact(&ctx).await?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x40));
|
||||
drop(writer);
|
||||
|
||||
@@ -4122,12 +4162,14 @@ mod tests {
|
||||
for _ in 0..50 {
|
||||
for _ in 0..10000 {
|
||||
test_key.field6 = blknum;
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
@@ -4172,12 +4214,14 @@ mod tests {
|
||||
for blknum in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
test_key.field6 = blknum as u32;
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
updated[blknum] = lsn;
|
||||
drop(writer);
|
||||
@@ -4190,12 +4234,14 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
updated[blknum] = lsn;
|
||||
@@ -4247,12 +4293,14 @@ mod tests {
|
||||
for blknum in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
test_key.field6 = blknum as u32;
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(lsn);
|
||||
updated[blknum] = lsn;
|
||||
drop(writer);
|
||||
@@ -4273,12 +4321,14 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)
|
||||
.await?;
|
||||
println!("updating {} at {}", blknum, lsn);
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
@@ -4339,12 +4389,14 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
)?;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
)
|
||||
.await?;
|
||||
println!("updating [{}][{}] at {}", idx, blknum, lsn);
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
@@ -4377,8 +4429,9 @@ mod tests {
|
||||
.await;
|
||||
|
||||
let initdb_lsn = Lsn(0x20);
|
||||
let utline =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)?;
|
||||
let utline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = utline.raw_timeline().unwrap();
|
||||
|
||||
// Spawn flush loop now so that we can set the `expect_initdb_optimization`
|
||||
@@ -4415,6 +4468,7 @@ mod tests {
|
||||
.context("init_empty_test_timeline")?;
|
||||
modification
|
||||
.commit()
|
||||
.await
|
||||
.context("commit init_empty_test_timeline modification")?;
|
||||
|
||||
// Do the flush. The flush code will check the expectations that we set above.
|
||||
|
||||
143
pageserver/src/tenant/layer_cache.rs
Normal file
143
pageserver/src/tenant/layer_cache.rs
Normal file
@@ -0,0 +1,143 @@
|
||||
use super::storage_layer::{PersistentLayer, PersistentLayerDesc, PersistentLayerKey, RemoteLayer};
|
||||
use super::Timeline;
|
||||
use crate::tenant::layer_map::{self, LayerMap};
|
||||
use anyhow::Result;
|
||||
use std::sync::{Mutex, Weak};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
pub struct LayerCache {
|
||||
/// Layer removal lock.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
|
||||
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
|
||||
pub layers_removal_lock: Arc<tokio::sync::Mutex<()>>,
|
||||
|
||||
/// We need this lock b/c we do not have any way to prevent GC/compaction from removing files in-use.
|
||||
/// We need to do reference counting on Arc to prevent this from happening, and we can safely remove this lock.
|
||||
pub layers_operation_lock: Arc<tokio::sync::RwLock<()>>,
|
||||
|
||||
/// Will be useful when we move evict / download to layer cache.
|
||||
#[allow(unused)]
|
||||
timeline: Weak<Timeline>,
|
||||
|
||||
mapping: Mutex<HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>>,
|
||||
}
|
||||
|
||||
pub struct LayerInUseWrite(tokio::sync::OwnedRwLockWriteGuard<()>);
|
||||
|
||||
pub struct LayerInUseRead(tokio::sync::OwnedRwLockReadGuard<()>);
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteGuard(Arc<tokio::sync::OwnedMutexGuard<()>>);
|
||||
|
||||
impl LayerCache {
|
||||
pub fn new(timeline: Weak<Timeline>) -> Self {
|
||||
Self {
|
||||
layers_operation_lock: Arc::new(tokio::sync::RwLock::new(())),
|
||||
layers_removal_lock: Arc::new(tokio::sync::Mutex::new(())),
|
||||
mapping: Mutex::new(HashMap::new()),
|
||||
timeline,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
let guard = self.mapping.lock().unwrap();
|
||||
guard.get(&desc.key()).expect("not found").clone()
|
||||
}
|
||||
|
||||
/// This function is to mock the original behavior of `layers` lock in `Timeline`. Can be removed after we ensure
|
||||
/// we won't delete files that are being read.
|
||||
pub async fn layer_in_use_write(&self) -> LayerInUseWrite {
|
||||
LayerInUseWrite(self.layers_operation_lock.clone().write_owned().await)
|
||||
}
|
||||
|
||||
/// This function is to mock the original behavior of `layers` lock in `Timeline`. Can be removed after we ensure
|
||||
/// we won't delete files that are being read.
|
||||
pub async fn layer_in_use_read(&self) -> LayerInUseRead {
|
||||
LayerInUseRead(self.layers_operation_lock.clone().read_owned().await)
|
||||
}
|
||||
|
||||
/// Ensures only one of compaction / gc can happen at a time.
|
||||
pub async fn delete_guard(&self) -> DeleteGuard {
|
||||
DeleteGuard(Arc::new(
|
||||
self.layers_removal_lock.clone().lock_owned().await,
|
||||
))
|
||||
}
|
||||
|
||||
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
|
||||
pub fn remove_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
|
||||
let mut guard = self.mapping.lock().unwrap();
|
||||
guard.remove(&layer.layer_desc().key());
|
||||
}
|
||||
|
||||
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
|
||||
pub fn populate_remote_when_init(&self, layer: Arc<RemoteLayer>) {
|
||||
let mut guard = self.mapping.lock().unwrap();
|
||||
guard.insert(layer.layer_desc().key(), layer);
|
||||
}
|
||||
|
||||
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
|
||||
pub fn populate_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
|
||||
let mut guard = self.mapping.lock().unwrap();
|
||||
guard.insert(layer.layer_desc().key(), layer);
|
||||
}
|
||||
|
||||
/// Called within read path.
|
||||
pub fn replace_and_verify(
|
||||
&self,
|
||||
expected: Arc<dyn PersistentLayer>,
|
||||
new: Arc<dyn PersistentLayer>,
|
||||
) -> Result<()> {
|
||||
let mut guard = self.mapping.lock().unwrap();
|
||||
|
||||
use super::layer_map::LayerKey;
|
||||
let key = LayerKey::from(&*expected);
|
||||
let other = LayerKey::from(&*new);
|
||||
|
||||
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
|
||||
let new_l0 = LayerMap::is_l0(new.layer_desc());
|
||||
|
||||
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
|
||||
"replacing downloaded layer into layermap failed because layer was not found"
|
||||
));
|
||||
|
||||
anyhow::ensure!(
|
||||
key == other,
|
||||
"replacing downloaded layer into layermap failed because two layers have different keys: {key:?} != {other:?}"
|
||||
);
|
||||
|
||||
anyhow::ensure!(
|
||||
expected_l0 == new_l0,
|
||||
"replacing downloaded layer into layermap failed because one layer is l0 while the other is not: {expected_l0} != {new_l0}"
|
||||
);
|
||||
|
||||
if let Some(layer) = guard.get_mut(&expected.layer_desc().key()) {
|
||||
anyhow::ensure!(
|
||||
layer_map::compare_arced_layers(&expected, layer),
|
||||
"replacing downloaded layer into layermap failed because another layer was found instead of expected, expected={expected:?}, new={new:?}",
|
||||
expected = Arc::as_ptr(&expected),
|
||||
new = Arc::as_ptr(layer),
|
||||
);
|
||||
*layer = new;
|
||||
Ok(())
|
||||
} else {
|
||||
anyhow::bail!(
|
||||
"replacing downloaded layer into layermap failed because layer was not found"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Called within write path. When compaction and image layer creation we will create new layers.
|
||||
pub fn create_new_layer(&self, layer: Arc<dyn PersistentLayer>) {
|
||||
let mut guard = self.mapping.lock().unwrap();
|
||||
guard.insert(layer.layer_desc().key(), layer);
|
||||
}
|
||||
|
||||
/// Called within write path. When GC and compaction we will remove layers and delete them on disk.
|
||||
/// Will move logic to delete files here later.
|
||||
pub fn delete_layer(&self, layer: Arc<dyn PersistentLayer>) {
|
||||
let mut guard = self.mapping.lock().unwrap();
|
||||
guard.remove(&layer.layer_desc().key());
|
||||
}
|
||||
}
|
||||
@@ -51,25 +51,23 @@ use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
pub use historic_layer_coverage::Replacement;
|
||||
pub use historic_layer_coverage::{LayerKey, Replacement};
|
||||
|
||||
use super::storage_layer::range_eq;
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
use super::storage_layer::PersistentLayerKey;
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
///
|
||||
pub struct LayerMap<L: ?Sized> {
|
||||
#[derive(Default, Clone)]
|
||||
pub struct LayerMap {
|
||||
//
|
||||
// 'open_layer' holds the current InMemoryLayer that is accepting new
|
||||
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
|
||||
@@ -95,24 +93,6 @@ pub struct LayerMap<L: ?Sized> {
|
||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
|
||||
|
||||
/// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
|
||||
/// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
|
||||
/// RemoteLayer will be removed.
|
||||
mapping: HashMap<PersistentLayerKey, Arc<L>>,
|
||||
}
|
||||
|
||||
impl<L: ?Sized> Default for LayerMap<L> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
open_layer: None,
|
||||
next_open_layer_at: None,
|
||||
frozen_layers: VecDeque::default(),
|
||||
l0_delta_layers: Vec::default(),
|
||||
historic: BufferedHistoricLayerCoverage::default(),
|
||||
mapping: HashMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The primary update API for the layer map.
|
||||
@@ -120,24 +100,21 @@ impl<L: ?Sized> Default for LayerMap<L> {
|
||||
/// Batching historic layer insertions and removals is good for
|
||||
/// performance and this struct helps us do that correctly.
|
||||
#[must_use]
|
||||
pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
|
||||
pub struct BatchedUpdates<'a> {
|
||||
// While we hold this exclusive reference to the layer map the type checker
|
||||
// will prevent us from accidentally reading any unflushed updates.
|
||||
layer_map: &'a mut LayerMap<L>,
|
||||
layer_map: &'a mut LayerMap,
|
||||
}
|
||||
|
||||
/// Provide ability to batch more updates while hiding the read
|
||||
/// API so we don't accidentally read without flushing.
|
||||
impl<L> BatchedUpdates<'_, L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
impl BatchedUpdates<'_> {
|
||||
///
|
||||
/// Insert an on-disk layer.
|
||||
///
|
||||
// TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
|
||||
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
self.layer_map.insert_historic_noflush(layer_desc, layer)
|
||||
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
self.layer_map.insert_historic_noflush(layer_desc)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -145,31 +122,8 @@ where
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
self.layer_map.remove_historic_noflush(layer_desc, layer)
|
||||
}
|
||||
|
||||
/// Replaces existing layer iff it is the `expected`.
|
||||
///
|
||||
/// If the expected layer has been removed it will not be inserted by this function.
|
||||
///
|
||||
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
|
||||
/// be done.
|
||||
///
|
||||
/// TODO replacement can be done without buffering and rebuilding layer map updates.
|
||||
/// One way to do that is to add a layer of indirection for returned values, so
|
||||
/// that we can replace values only by updating a hashmap.
|
||||
pub fn replace_historic(
|
||||
&mut self,
|
||||
expected_desc: PersistentLayerDesc,
|
||||
expected: &Arc<L>,
|
||||
new_desc: PersistentLayerDesc,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
|
||||
|
||||
self.layer_map
|
||||
.replace_historic_noflush(expected_desc, expected, new_desc, new)
|
||||
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
self.layer_map.remove_historic_noflush(layer_desc)
|
||||
}
|
||||
|
||||
// We will flush on drop anyway, but this method makes it
|
||||
@@ -185,25 +139,19 @@ where
|
||||
// than panic later or read without flushing.
|
||||
//
|
||||
// TODO maybe warn if flush hasn't explicitly been called
|
||||
impl<L> Drop for BatchedUpdates<'_, L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
impl Drop for BatchedUpdates<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.layer_map.flush_updates();
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
pub struct SearchResult<L: ?Sized> {
|
||||
pub layer: Arc<L>,
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<PersistentLayerDesc>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
impl<L> LayerMap<L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
/// 'key', with lsn.start < 'end_lsn'.
|
||||
@@ -235,7 +183,7 @@ where
|
||||
/// NOTE: This only searches the 'historic' layers, *not* the
|
||||
/// 'open' and 'frozen' layers!
|
||||
///
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
|
||||
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
|
||||
let latest_delta = version.delta_coverage.query(key.to_i128());
|
||||
let latest_image = version.image_coverage.query(key.to_i128());
|
||||
@@ -244,7 +192,6 @@ where
|
||||
(None, None) => None,
|
||||
(None, Some(image)) => {
|
||||
let lsn_floor = image.get_lsn_range().start;
|
||||
let image = self.get_layer_from_mapping(&image.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: image,
|
||||
lsn_floor,
|
||||
@@ -252,7 +199,6 @@ where
|
||||
}
|
||||
(Some(delta), None) => {
|
||||
let lsn_floor = delta.get_lsn_range().start;
|
||||
let delta = self.get_layer_from_mapping(&delta.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
@@ -263,7 +209,6 @@ where
|
||||
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
|
||||
let image_exact_match = img_lsn + 1 == end_lsn;
|
||||
if image_is_newer || image_exact_match {
|
||||
let image = self.get_layer_from_mapping(&image.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: image,
|
||||
lsn_floor: img_lsn,
|
||||
@@ -271,7 +216,6 @@ where
|
||||
} else {
|
||||
let lsn_floor =
|
||||
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
|
||||
let delta = self.get_layer_from_mapping(&delta.key()).clone();
|
||||
Some(SearchResult {
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
@@ -282,7 +226,7 @@ where
|
||||
}
|
||||
|
||||
/// Start a batch of updates, applied on drop
|
||||
pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
|
||||
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
|
||||
BatchedUpdates { layer_map: self }
|
||||
}
|
||||
|
||||
@@ -292,48 +236,32 @@ where
|
||||
/// Helper function for BatchedUpdates::insert_historic
|
||||
///
|
||||
/// TODO(chi): remove L generic so that we do not need to pass layer object.
|
||||
pub(self) fn insert_historic_noflush(
|
||||
&mut self,
|
||||
layer_desc: PersistentLayerDesc,
|
||||
layer: Arc<L>,
|
||||
) {
|
||||
self.mapping.insert(layer_desc.key(), layer.clone());
|
||||
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
|
||||
|
||||
if Self::is_l0(&layer) {
|
||||
if Self::is_l0(&layer_desc) {
|
||||
self.l0_delta_layers.push(layer_desc.clone().into());
|
||||
}
|
||||
|
||||
self.historic.insert(
|
||||
historic_layer_coverage::LayerKey::from(&*layer),
|
||||
historic_layer_coverage::LayerKey::from(&layer_desc),
|
||||
layer_desc.into(),
|
||||
);
|
||||
}
|
||||
|
||||
fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
|
||||
let layer = self
|
||||
.mapping
|
||||
.get(key)
|
||||
.with_context(|| format!("{key:?}"))
|
||||
.expect("inconsistent layer mapping");
|
||||
layer
|
||||
}
|
||||
|
||||
///
|
||||
/// Remove an on-disk layer from the map.
|
||||
///
|
||||
/// Helper function for BatchedUpdates::remove_historic
|
||||
///
|
||||
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
|
||||
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(&*layer));
|
||||
if Self::is_l0(&layer) {
|
||||
.remove(historic_layer_coverage::LayerKey::from(&layer_desc));
|
||||
let layer_key = layer_desc.key();
|
||||
if Self::is_l0(&layer_desc) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| {
|
||||
!Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
|
||||
});
|
||||
l0_delta_layers.retain(|other| other.key() != layer_key);
|
||||
self.l0_delta_layers = l0_delta_layers;
|
||||
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
|
||||
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
|
||||
@@ -344,69 +272,6 @@ where
|
||||
"failed to locate removed historic layer from l0_delta_layers"
|
||||
);
|
||||
}
|
||||
self.mapping.remove(&layer_desc.key());
|
||||
}
|
||||
|
||||
pub(self) fn replace_historic_noflush(
|
||||
&mut self,
|
||||
expected_desc: PersistentLayerDesc,
|
||||
expected: &Arc<L>,
|
||||
new_desc: PersistentLayerDesc,
|
||||
new: Arc<L>,
|
||||
) -> anyhow::Result<Replacement<Arc<L>>> {
|
||||
let key = historic_layer_coverage::LayerKey::from(&**expected);
|
||||
let other = historic_layer_coverage::LayerKey::from(&*new);
|
||||
|
||||
let expected_l0 = Self::is_l0(expected);
|
||||
let new_l0 = Self::is_l0(&new);
|
||||
|
||||
anyhow::ensure!(
|
||||
key == other,
|
||||
"expected and new must have equal LayerKeys: {key:?} != {other:?}"
|
||||
);
|
||||
|
||||
anyhow::ensure!(
|
||||
expected_l0 == new_l0,
|
||||
"expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
|
||||
);
|
||||
|
||||
let l0_index = if expected_l0 {
|
||||
// find the index in case replace worked, we need to replace that as well
|
||||
let pos = self.l0_delta_layers.iter().position(|slot| {
|
||||
Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
|
||||
});
|
||||
|
||||
if pos.is_none() {
|
||||
return Ok(Replacement::NotFound);
|
||||
}
|
||||
pos
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let new_desc = Arc::new(new_desc);
|
||||
let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
|
||||
**existing == expected_desc
|
||||
});
|
||||
|
||||
if let Replacement::Replaced { .. } = &replaced {
|
||||
self.mapping.remove(&expected_desc.key());
|
||||
self.mapping.insert(new_desc.key(), new);
|
||||
if let Some(index) = l0_index {
|
||||
self.l0_delta_layers[index] = new_desc;
|
||||
}
|
||||
}
|
||||
|
||||
let replaced = match replaced {
|
||||
Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
|
||||
Replacement::NotFound => Replacement::NotFound,
|
||||
Replacement::RemovalBuffered => Replacement::RemovalBuffered,
|
||||
Replacement::Unexpected(x) => {
|
||||
Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
|
||||
}
|
||||
};
|
||||
|
||||
Ok(replaced)
|
||||
}
|
||||
|
||||
/// Helper function for BatchedUpdates::drop.
|
||||
@@ -454,10 +319,8 @@ where
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
|
||||
self.historic
|
||||
.iter()
|
||||
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
///
|
||||
@@ -472,7 +335,7 @@ where
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.0) {
|
||||
Some(v) => v,
|
||||
None => return Ok(vec![]),
|
||||
@@ -482,36 +345,26 @@ where
|
||||
let end = key_range.end.to_i128();
|
||||
|
||||
// Initialize loop variables
|
||||
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
|
||||
let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
|
||||
let mut current_key = start;
|
||||
let mut current_val = version.image_coverage.query(start);
|
||||
|
||||
// Loop through the change events and push intervals
|
||||
for (change_key, change_val) in version.image_coverage.range(start..end) {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
coverage.push((
|
||||
kr,
|
||||
current_val
|
||||
.take()
|
||||
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
|
||||
));
|
||||
coverage.push((kr, current_val.take()));
|
||||
current_key = change_key;
|
||||
current_val = change_val.clone();
|
||||
}
|
||||
|
||||
// Add the final interval
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||
coverage.push((
|
||||
kr,
|
||||
current_val
|
||||
.take()
|
||||
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
|
||||
));
|
||||
coverage.push((kr, current_val.take()));
|
||||
|
||||
Ok(coverage)
|
||||
}
|
||||
|
||||
pub fn is_l0(layer: &L) -> bool {
|
||||
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
|
||||
range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
|
||||
}
|
||||
|
||||
@@ -537,7 +390,7 @@ where
|
||||
/// TODO The optimal number should probably be slightly higher than 1, but to
|
||||
/// implement that we need to plumb a lot more context into this function
|
||||
/// than just the current partition_range.
|
||||
pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
|
||||
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
|
||||
// Case 1
|
||||
if !Self::is_l0(layer) {
|
||||
return true;
|
||||
@@ -595,9 +448,7 @@ where
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
if !kr.is_empty() {
|
||||
let base_count =
|
||||
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
|
||||
as usize;
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath =
|
||||
self.count_deltas(&kr, &lr, new_limit)?;
|
||||
@@ -620,9 +471,7 @@ where
|
||||
let lr = lsn.start..val.get_lsn_range().start;
|
||||
|
||||
if !kr.is_empty() {
|
||||
let base_count =
|
||||
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
|
||||
as usize;
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
@@ -772,12 +621,8 @@ where
|
||||
}
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
|
||||
Ok(self
|
||||
.l0_delta_layers
|
||||
.iter()
|
||||
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
|
||||
.collect())
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
|
||||
Ok(self.l0_delta_layers.to_vec())
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
@@ -802,97 +647,76 @@ where
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
|
||||
///
|
||||
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
|
||||
#[inline(always)]
|
||||
pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
|
||||
// "dyn Trait" objects are "fat pointers" in that they have two components:
|
||||
// - pointer to the object
|
||||
// - pointer to the vtable
|
||||
//
|
||||
// rust does not provide a guarantee that these vtables are unique, but however
|
||||
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
|
||||
// pointer and the vtable need to be equal.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/issues/103763
|
||||
//
|
||||
// A future version of rust will most likely use this form below, where we cast each
|
||||
// pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
|
||||
// not affect the comparison.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/pull/106450
|
||||
let left = Arc::as_ptr(left) as *const ();
|
||||
let right = Arc::as_ptr(right) as *const ();
|
||||
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
|
||||
///
|
||||
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
|
||||
///
|
||||
/// If comparing persistent layers, ALWAYS compare the layer descriptor key.
|
||||
#[inline(always)]
|
||||
pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
|
||||
// "dyn Trait" objects are "fat pointers" in that they have two components:
|
||||
// - pointer to the object
|
||||
// - pointer to the vtable
|
||||
//
|
||||
// rust does not provide a guarantee that these vtables are unique, but however
|
||||
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
|
||||
// pointer and the vtable need to be equal.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/issues/103763
|
||||
//
|
||||
// A future version of rust will most likely use this form below, where we cast each
|
||||
// pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it
|
||||
// not affect the comparison.
|
||||
//
|
||||
// See: https://github.com/rust-lang/rust/pull/106450
|
||||
let left = Arc::as_ptr(left) as *const ();
|
||||
let right = Arc::as_ptr(right) as *const ();
|
||||
|
||||
left == right
|
||||
}
|
||||
left == right
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{LayerMap, Replacement};
|
||||
use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
|
||||
use super::LayerMap;
|
||||
use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
mod l0_delta_layers_updated {
|
||||
|
||||
use crate::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn for_full_range_delta() {
|
||||
// l0_delta_layers are used by compaction, and should observe all buffered updates
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
|
||||
true
|
||||
)
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
|
||||
true
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn for_non_full_range_delta() {
|
||||
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
|
||||
// because not full range
|
||||
false
|
||||
)
|
||||
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
|
||||
// because not full range
|
||||
false
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn for_image() {
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
|
||||
// code only checks if it is a full range layer, doesn't care about images, which must
|
||||
// mean we should in practice never have full range images
|
||||
false
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replacing_missing_l0_is_notfound() {
|
||||
// original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
|
||||
// however only happen for precondition failures.
|
||||
|
||||
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
|
||||
let layer = LayerFileName::from_str(layer).unwrap();
|
||||
let layer = LayerDescriptor::from(layer);
|
||||
|
||||
// same skeletan construction; see scenario below
|
||||
let not_found = Arc::new(layer.clone());
|
||||
let new_version = Arc::new(layer);
|
||||
|
||||
let mut map = LayerMap::default();
|
||||
|
||||
let res = map.batch_update().replace_historic(
|
||||
not_found.get_persistent_layer_desc(),
|
||||
¬_found,
|
||||
new_version.get_persistent_layer_desc(),
|
||||
new_version,
|
||||
);
|
||||
|
||||
assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
|
||||
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
|
||||
// code only checks if it is a full range layer, doesn't care about images, which must
|
||||
// mean we should in practice never have full range images
|
||||
false
|
||||
)
|
||||
}
|
||||
|
||||
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
|
||||
@@ -906,46 +730,31 @@ mod tests {
|
||||
|
||||
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
|
||||
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
|
||||
assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));
|
||||
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
|
||||
|
||||
let expected_in_counts = (1, usize::from(expected_l0));
|
||||
|
||||
map.batch_update()
|
||||
.insert_historic(remote.get_persistent_layer_desc(), remote.clone());
|
||||
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
|
||||
|
||||
let replaced = map
|
||||
.batch_update()
|
||||
.replace_historic(
|
||||
remote.get_persistent_layer_desc(),
|
||||
&remote,
|
||||
downloaded.get_persistent_layer_desc(),
|
||||
downloaded.clone(),
|
||||
)
|
||||
.expect("name derived attributes are the same");
|
||||
assert!(
|
||||
matches!(replaced, Replacement::Replaced { .. }),
|
||||
"{replaced:?}"
|
||||
.insert_historic(remote.layer_desc().clone());
|
||||
assert_eq!(
|
||||
count_layer_in(&map, remote.layer_desc()),
|
||||
expected_in_counts
|
||||
);
|
||||
assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
|
||||
|
||||
map.batch_update()
|
||||
.remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
|
||||
assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
|
||||
.remove_historic(downloaded.layer_desc().clone());
|
||||
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
|
||||
}
|
||||
|
||||
fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
|
||||
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
|
||||
let historic = map
|
||||
.iter_historic_layers()
|
||||
.filter(|x| LayerMap::compare_arced_layers(x, layer))
|
||||
.filter(|x| x.key() == layer.key())
|
||||
.count();
|
||||
let l0s = map
|
||||
.get_level0_deltas()
|
||||
.expect("why does this return a result");
|
||||
let l0 = l0s
|
||||
.iter()
|
||||
.filter(|x| LayerMap::compare_arced_layers(x, layer))
|
||||
.count();
|
||||
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
|
||||
|
||||
(historic, l0)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@ use std::ops::Range;
|
||||
|
||||
use tracing::info;
|
||||
|
||||
use crate::tenant::storage_layer::PersistentLayerDesc;
|
||||
|
||||
use super::layer_coverage::LayerCoverageTuple;
|
||||
|
||||
/// Layers in this module are identified and indexed by this data.
|
||||
@@ -53,11 +55,24 @@ impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerK
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&PersistentLayerDesc> for LayerKey {
|
||||
fn from(layer: &PersistentLayerDesc) -> Self {
|
||||
let kr = layer.get_key_range();
|
||||
let lr = layer.get_lsn_range();
|
||||
LayerKey {
|
||||
key: kr.start.to_i128()..kr.end.to_i128(),
|
||||
lsn: lr.start.0..lr.end.0,
|
||||
is_image: !layer.is_incremental(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Efficiently queryable layer coverage for each LSN.
|
||||
///
|
||||
/// Allows answering layer map queries very efficiently,
|
||||
/// but doesn't allow retroactive insertion, which is
|
||||
/// sometimes necessary. See BufferedHistoricLayerCoverage.
|
||||
#[derive(Clone)]
|
||||
pub struct HistoricLayerCoverage<Value> {
|
||||
/// The latest state
|
||||
head: LayerCoverageTuple<Value>,
|
||||
@@ -411,6 +426,7 @@ fn test_persistent_overlapping() {
|
||||
///
|
||||
/// See this for more on persistent and retroactive techniques:
|
||||
/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
||||
#[derive(Clone)]
|
||||
pub struct BufferedHistoricLayerCoverage<Value> {
|
||||
/// A persistent layer map that we rebuild when we need to retroactively update
|
||||
historic_coverage: HistoricLayerCoverage<Value>,
|
||||
@@ -467,6 +483,11 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
///
|
||||
/// Returns a `Replacement` value describing the outcome; only the case of
|
||||
/// `Replacement::Replaced` modifies the map and requires a rebuild.
|
||||
///
|
||||
/// This function is unlikely to be used in the future because LayerMap now only records the
|
||||
/// layer descriptors. Therefore, anything added to the layer map will only be removed or
|
||||
/// added, and never replaced.
|
||||
#[allow(dead_code)]
|
||||
pub fn replace<F>(
|
||||
&mut self,
|
||||
layer_key: &LayerKey,
|
||||
|
||||
@@ -15,6 +15,7 @@ use rpds::RedBlackTreeMapSync;
|
||||
///
|
||||
/// NOTE The struct is parameterized over Value for easier
|
||||
/// testing, but in practice it's some sort of layer.
|
||||
#[derive(Clone)]
|
||||
pub struct LayerCoverage<Value> {
|
||||
/// For every change in coverage (as we sweep the key space)
|
||||
/// we store (lsn.end, value).
|
||||
@@ -139,6 +140,7 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
}
|
||||
|
||||
/// Image and delta coverage at a specific LSN.
|
||||
#[derive(Clone)]
|
||||
pub struct LayerCoverageTuple<Value> {
|
||||
pub image_coverage: LayerCoverage<Value>,
|
||||
pub delta_coverage: LayerCoverage<Value>,
|
||||
|
||||
146
pageserver/src/tenant/layer_map_mgr.rs
Normal file
146
pageserver/src/tenant/layer_map_mgr.rs
Normal file
@@ -0,0 +1,146 @@
|
||||
//! This module implements `LayerMapMgr`, which manages a layer map object and provides lock-free access to the state.
|
||||
//!
|
||||
//! A common usage pattern is as follows:
|
||||
//!
|
||||
//! ```ignore
|
||||
//! async fn compaction(&self) {
|
||||
//! // Get the current state.
|
||||
//! let state = self.layer_map_mgr.read();
|
||||
//! // No lock held at this point. Do compaction based on the state. This part usually incurs I/O operations and may
|
||||
//! // take a long time.
|
||||
//! let compaction_result = self.do_compaction(&state).await?;
|
||||
//! // Update the state.
|
||||
//! self.layer_map_mgr.update(|mut state| async move {
|
||||
//! // do updates to the state, return it.
|
||||
//! Ok(state)
|
||||
//! }).await?;
|
||||
//! }
|
||||
//! ```
|
||||
use anyhow::Result;
|
||||
use arc_swap::ArcSwap;
|
||||
use futures::Future;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::layer_map::LayerMap;
|
||||
|
||||
/// Manages the storage state. Provide utility functions to modify the layer map and get an immutable reference to the
|
||||
/// layer map.
|
||||
pub struct LayerMapMgr {
|
||||
layer_map: ArcSwap<LayerMap>,
|
||||
state_lock: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
impl LayerMapMgr {
|
||||
/// Get the current state of the layer map.
|
||||
pub fn read(&self) -> Arc<LayerMap> {
|
||||
// TODO: it is possible to use `load` to reduce the overhead of cloning the Arc, but read path usually involves
|
||||
// disk reads and layer mapping fetching, and therefore it's not a big deal to use a more optimized version
|
||||
// here.
|
||||
self.layer_map.load_full()
|
||||
}
|
||||
|
||||
/// Clone the layer map for modification.
|
||||
fn clone_for_write(&self, _state_lock_witness: &tokio::sync::MutexGuard<'_, ()>) -> LayerMap {
|
||||
(**self.layer_map.load()).clone()
|
||||
}
|
||||
|
||||
pub fn new(layer_map: LayerMap) -> Self {
|
||||
Self {
|
||||
layer_map: ArcSwap::new(Arc::new(layer_map)),
|
||||
state_lock: tokio::sync::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the layer map.
|
||||
pub async fn update<O, F>(&self, operation: O) -> Result<()>
|
||||
where
|
||||
O: FnOnce(LayerMap) -> F,
|
||||
F: Future<Output = Result<LayerMap>>,
|
||||
{
|
||||
let state_lock = self.state_lock.lock().await;
|
||||
let state = self.clone_for_write(&state_lock);
|
||||
let new_state = operation(state).await?;
|
||||
self.layer_map.store(Arc::new(new_state));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{repository::Key, tenant::storage_layer::PersistentLayerDesc};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_layer_map_manage() -> Result<()> {
|
||||
let mgr = LayerMapMgr::new(Default::default());
|
||||
mgr.update(|mut map| async move {
|
||||
let mut updates = map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_img(
|
||||
TenantId::generate(),
|
||||
TimelineId::generate(),
|
||||
Key::from_i128(0)..Key::from_i128(1),
|
||||
Lsn(0),
|
||||
false,
|
||||
0,
|
||||
));
|
||||
updates.flush();
|
||||
Ok(map)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let ref_1 = mgr.read();
|
||||
|
||||
mgr.update(|mut map| async move {
|
||||
let mut updates = map.batch_update();
|
||||
updates.insert_historic(PersistentLayerDesc::new_img(
|
||||
TenantId::generate(),
|
||||
TimelineId::generate(),
|
||||
Key::from_i128(1)..Key::from_i128(2),
|
||||
Lsn(0),
|
||||
false,
|
||||
0,
|
||||
));
|
||||
updates.flush();
|
||||
Ok(map)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let ref_2 = mgr.read();
|
||||
|
||||
// Modification should not be visible to the old reference.
|
||||
assert_eq!(
|
||||
ref_1
|
||||
.search(Key::from_i128(0), Lsn(1))
|
||||
.unwrap()
|
||||
.layer
|
||||
.key_range,
|
||||
Key::from_i128(0)..Key::from_i128(1)
|
||||
);
|
||||
assert!(ref_1.search(Key::from_i128(1), Lsn(1)).is_none());
|
||||
|
||||
// Modification should be visible to the new reference.
|
||||
assert_eq!(
|
||||
ref_2
|
||||
.search(Key::from_i128(0), Lsn(1))
|
||||
.unwrap()
|
||||
.layer
|
||||
.key_range,
|
||||
Key::from_i128(0)..Key::from_i128(1)
|
||||
);
|
||||
assert_eq!(
|
||||
ref_2
|
||||
.search(Key::from_i128(1), Lsn(1))
|
||||
.unwrap()
|
||||
.layer
|
||||
.key_range,
|
||||
Key::from_i128(1)..Key::from_i128(2)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -41,8 +41,6 @@ pub use inmemory_layer::InMemoryLayer;
|
||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
pub use remote_layer::RemoteLayer;
|
||||
|
||||
use super::layer_map::BatchedUpdates;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
@@ -176,19 +174,9 @@ impl LayerAccessStats {
|
||||
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
pub(crate) fn for_loading_layer<L>(
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
|
||||
status: LayerResidenceStatus,
|
||||
) -> Self
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
|
||||
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
|
||||
new.record_residence_event(
|
||||
layer_map_lock_held_witness,
|
||||
status,
|
||||
LayerResidenceEventReason::LayerLoad,
|
||||
);
|
||||
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
|
||||
new
|
||||
}
|
||||
|
||||
@@ -197,24 +185,16 @@ impl LayerAccessStats {
|
||||
/// The `new_status` is not recorded in `self`.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
pub(crate) fn clone_for_residence_change<L>(
|
||||
pub(crate) fn clone_for_residence_change(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
|
||||
new_status: LayerResidenceStatus,
|
||||
) -> LayerAccessStats
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
) -> LayerAccessStats {
|
||||
let clone = {
|
||||
let inner = self.0.lock().unwrap();
|
||||
inner.clone()
|
||||
};
|
||||
let new = LayerAccessStats(Mutex::new(clone));
|
||||
new.record_residence_event(
|
||||
layer_map_lock_held_witness,
|
||||
new_status,
|
||||
LayerResidenceEventReason::ResidenceChange,
|
||||
);
|
||||
new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
|
||||
new
|
||||
}
|
||||
|
||||
@@ -232,14 +212,11 @@ impl LayerAccessStats {
|
||||
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
|
||||
/// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
|
||||
///
|
||||
pub(crate) fn record_residence_event<L>(
|
||||
pub(crate) fn record_residence_event(
|
||||
&self,
|
||||
_layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
|
||||
status: LayerResidenceStatus,
|
||||
reason: LayerResidenceEventReason,
|
||||
) where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
) {
|
||||
let mut locked = self.0.lock().unwrap();
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner
|
||||
@@ -389,10 +366,10 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
|
||||
}
|
||||
|
||||
/// Returned by [`Layer::iter`]
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
||||
|
||||
/// Returned by [`Layer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
@@ -473,94 +450,125 @@ pub fn downcast_remote_layer(
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds metadata about a layer without any content. Used mostly for testing.
|
||||
///
|
||||
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
|
||||
/// LayerDescriptor.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LayerDescriptor {
|
||||
pub key: Range<Key>,
|
||||
pub lsn: Range<Lsn>,
|
||||
pub is_incremental: bool,
|
||||
pub short_id: String,
|
||||
}
|
||||
pub mod tests {
|
||||
use super::*;
|
||||
|
||||
impl LayerDescriptor {
|
||||
/// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
|
||||
/// and the tenant / timeline id does not matter.
|
||||
pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
|
||||
PersistentLayerDesc::new_delta(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
self.key.clone(),
|
||||
self.lsn.clone(),
|
||||
233,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for LayerDescriptor {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key.clone()
|
||||
/// Holds metadata about a layer without any content. Used mostly for testing.
|
||||
///
|
||||
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
|
||||
/// LayerDescriptor.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LayerDescriptor {
|
||||
base: PersistentLayerDesc,
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn.clone()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.is_incremental
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
todo!("This method shouldn't be part of the Layer trait")
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
self.short_id.clone()
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DeltaFileName> for LayerDescriptor {
|
||||
fn from(value: DeltaFileName) -> Self {
|
||||
let short_id = value.to_string();
|
||||
LayerDescriptor {
|
||||
key: value.key_range,
|
||||
lsn: value.lsn_range,
|
||||
is_incremental: true,
|
||||
short_id,
|
||||
impl From<PersistentLayerDesc> for LayerDescriptor {
|
||||
fn from(base: PersistentLayerDesc) -> Self {
|
||||
Self { base }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImageFileName> for LayerDescriptor {
|
||||
fn from(value: ImageFileName) -> Self {
|
||||
let short_id = value.to_string();
|
||||
let lsn = value.lsn_as_range();
|
||||
LayerDescriptor {
|
||||
key: value.key_range,
|
||||
lsn,
|
||||
is_incremental: false,
|
||||
short_id,
|
||||
impl Layer for LayerDescriptor {
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
todo!("This method shouldn't be part of the Layer trait")
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn short_id(&self) -> String {
|
||||
self.layer_desc().short_id()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LayerFileName> for LayerDescriptor {
|
||||
fn from(value: LayerFileName) -> Self {
|
||||
match value {
|
||||
LayerFileName::Delta(d) => Self::from(d),
|
||||
LayerFileName::Image(i) => Self::from(i),
|
||||
impl PersistentLayer for LayerDescriptor {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.base
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DeltaFileName> for LayerDescriptor {
|
||||
fn from(value: DeltaFileName) -> Self {
|
||||
LayerDescriptor {
|
||||
base: PersistentLayerDesc::new_delta(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn_range,
|
||||
233,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImageFileName> for LayerDescriptor {
|
||||
fn from(value: ImageFileName) -> Self {
|
||||
LayerDescriptor {
|
||||
base: PersistentLayerDesc::new_img(
|
||||
TenantId::from_array([0; 16]),
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn,
|
||||
false,
|
||||
233,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LayerFileName> for LayerDescriptor {
|
||||
fn from(value: LayerFileName) -> Self {
|
||||
match value {
|
||||
LayerFileName::Delta(d) => Self::from(d),
|
||||
LayerFileName::Image(i) => Self::from(i),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -304,7 +304,7 @@ impl InMemoryLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
|
||||
pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
|
||||
// TODO: Currently, we just leak the storage for any deleted keys
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::layer_map::BatchedUpdates;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use anyhow::{bail, Result};
|
||||
@@ -218,15 +217,11 @@ impl RemoteLayer {
|
||||
}
|
||||
|
||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||
pub fn create_downloaded_layer<L>(
|
||||
pub fn create_downloaded_layer(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
|
||||
conf: &'static PageServerConf,
|
||||
file_size: u64,
|
||||
) -> Arc<dyn PersistentLayer>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
) -> Arc<dyn PersistentLayer> {
|
||||
if self.desc.is_delta {
|
||||
let fname = self.desc.delta_file_name();
|
||||
Arc::new(DeltaLayer::new(
|
||||
@@ -235,10 +230,8 @@ impl RemoteLayer {
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats.clone_for_residence_change(
|
||||
layer_map_lock_held_witness,
|
||||
LayerResidenceStatus::Resident,
|
||||
),
|
||||
self.access_stats
|
||||
.clone_for_residence_change(LayerResidenceStatus::Resident),
|
||||
))
|
||||
} else {
|
||||
let fname = self.desc.image_file_name();
|
||||
@@ -248,10 +241,8 @@ impl RemoteLayer {
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats.clone_for_residence_change(
|
||||
layer_map_lock_held_witness,
|
||||
LayerResidenceStatus::Resident,
|
||||
),
|
||||
self.access_stats
|
||||
.clone_for_residence_change(LayerResidenceStatus::Resident),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -197,9 +197,11 @@ impl Timeline {
|
||||
// We don't want to hold the layer map lock during eviction.
|
||||
// So, we just need to deal with this.
|
||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
||||
let layers = self.layers.read().unwrap();
|
||||
let guard = self.lcache.layer_in_use_read().await;
|
||||
let layers = self.layer_mgr.read();
|
||||
let mut candidates = Vec::new();
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
let hist_layer = self.lcache.get_from_desc(&hist_layer);
|
||||
if hist_layer.is_remote_layer() {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -304,12 +304,15 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
}
|
||||
|
||||
timeline.check_checkpoint_distance().with_context(|| {
|
||||
format!(
|
||||
"Failed to check checkpoint distance for timeline {}",
|
||||
timeline.timeline_id
|
||||
)
|
||||
})?;
|
||||
timeline
|
||||
.check_checkpoint_distance()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to check checkpoint distance for timeline {}",
|
||||
timeline.timeline_id
|
||||
)
|
||||
})?;
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
let timeline_remote_consistent_lsn =
|
||||
|
||||
@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {
|
||||
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
modification.commit()?;
|
||||
modification.commit().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1199,7 +1199,7 @@ mod tests {
|
||||
let mut m = tline.begin_modification(Lsn(0x10));
|
||||
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
|
||||
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
|
||||
|
||||
Ok(walingest)
|
||||
@@ -1218,22 +1218,22 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x50));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(0x50));
|
||||
|
||||
@@ -1319,7 +1319,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_current_logical_size(&tline, Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
@@ -1361,7 +1361,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
|
||||
@@ -1374,7 +1374,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
|
||||
@@ -1399,7 +1399,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
|
||||
@@ -1438,7 +1438,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(
|
||||
@@ -1457,7 +1457,7 @@ mod tests {
|
||||
// Drop rel
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// Check that rel is not visible anymore
|
||||
assert_eq!(
|
||||
@@ -1475,7 +1475,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(
|
||||
@@ -1514,7 +1514,7 @@ mod tests {
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
|
||||
.await?;
|
||||
}
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// The relation was created at LSN 20, not visible at LSN 1 yet.
|
||||
assert_eq!(
|
||||
@@ -1559,7 +1559,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(
|
||||
@@ -1608,7 +1608,7 @@ mod tests {
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
|
||||
.await?;
|
||||
}
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
@@ -1655,7 +1655,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
}
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
@@ -1671,7 +1671,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
|
||||
RELSEG_SIZE
|
||||
@@ -1684,7 +1684,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
|
||||
RELSEG_SIZE - 1
|
||||
@@ -1700,7 +1700,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
|
||||
size as BlockNumber
|
||||
|
||||
@@ -4,49 +4,6 @@ import pytest
|
||||
import requests
|
||||
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.utils import get_dir_size
|
||||
|
||||
|
||||
# @pytest.mark.xfail # We currently pass a 16MB pg_wal dir instead of creating it client-side
|
||||
def test_basebackup_size(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Start
|
||||
env.neon_cli.create_branch("test_startup")
|
||||
endpoint = env.endpoints.create_start("test_startup")
|
||||
|
||||
# Get metrics
|
||||
metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
|
||||
basebackup_bytes = metrics["basebackup_bytes"]
|
||||
zenbenchmark.record(
|
||||
"basebackup_size", basebackup_bytes / 1024, "KB", report=MetricReport.LOWER_IS_BETTER
|
||||
)
|
||||
|
||||
# Stop so we force flush of any files and we can measure datadir sizes
|
||||
# NOTE the order of this line is important in relation to get_dir_size
|
||||
datadir = endpoint.pgdata_dir
|
||||
assert datadir is not None # for mypy
|
||||
endpoint.stop()
|
||||
|
||||
# Even though we don't insert any data, this nuber could be larger than basebackup
|
||||
# size because there could theoretically be compression, or postgres could create
|
||||
# or download data during startup. Currently if we don't send any pg_wal in the
|
||||
# basebackup, postgres will start up just fine, but during sync-safekeepers,
|
||||
# walproposer will try to recover the missing wal from safekeepers and cause the
|
||||
# same amount of network IO. We want to notice that if it happens.
|
||||
datadir_bytes = get_dir_size(datadir)
|
||||
zenbenchmark.record(
|
||||
"datadir_size", datadir_bytes / 1024, "KB", report=MetricReport.LOWER_IS_BETTER
|
||||
)
|
||||
|
||||
wal_bytes = get_dir_size(datadir + "/pg_wal")
|
||||
zenbenchmark.record("wal_size", wal_bytes / 1024, "KB", report=MetricReport.LOWER_IS_BETTER)
|
||||
|
||||
# Seems like a reasonable limit, but increase it if it becomes impossible to meet
|
||||
# assert basebackup_bytes < 70 * 1024
|
||||
# assert datadir_bytes < 70 * 1024
|
||||
# assert wal_bytes < 1 * 1024
|
||||
|
||||
|
||||
# Just start and measure duration.
|
||||
|
||||
@@ -163,7 +163,6 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
|
||||
assert conn.get_parameter_status(name) == value
|
||||
|
||||
|
||||
@pytest.mark.timeout(5)
|
||||
def test_close_on_connections_exit(static_proxy: NeonProxy):
|
||||
# Open two connections, send SIGTERM, then ensure that proxy doesn't exit
|
||||
# until after connections close.
|
||||
|
||||
Reference in New Issue
Block a user