mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 21:20:37 +00:00
Compare commits
12 Commits
releases/2
...
alek/remot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
595baa386e | ||
|
|
bb8ca7c7fd | ||
|
|
d597e6d42b | ||
|
|
71ccb07a43 | ||
|
|
ad8d777c1c | ||
|
|
2f97b43315 | ||
|
|
533a92636c | ||
|
|
bf303a6575 | ||
|
|
8cd20485f8 | ||
|
|
933a869f00 | ||
|
|
8c6541fea9 | ||
|
|
5cf75d92d8 |
42
.github/workflows/build_and_test.yml
vendored
42
.github/workflows/build_and_test.yml
vendored
@@ -933,6 +933,46 @@ jobs:
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
build-private-extensions:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
needs: [ promote-images, tag ]
|
||||
steps:
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
run: |
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"build-and-upload-extensions\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/build_and_upload_extensions.yml/dispatches \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"build-and-upload-extensions\",
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"remote_branch_name\": \"${{ github.ref_name }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
upload-postgres-extensions-to-s3:
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
@@ -945,7 +985,7 @@ jobs:
|
||||
version: [ v14, v15 ]
|
||||
|
||||
env:
|
||||
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ github.ref_name == 'release' && 'latest' || needs.tag.outputs.build-tag }}
|
||||
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
|
||||
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
|
||||
|
||||
21
Cargo.lock
generated
21
Cargo.lock
generated
@@ -3569,9 +3569,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.100.1"
|
||||
version = "0.100.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b"
|
||||
checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
|
||||
dependencies = [
|
||||
"ring",
|
||||
"untrusted",
|
||||
@@ -4140,13 +4140,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tar"
|
||||
version = "0.4.38"
|
||||
version = "0.4.40"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
|
||||
checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"libc",
|
||||
"xattr 0.2.3",
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4437,7 +4437,7 @@ dependencies = [
|
||||
"redox_syscall 0.3.5",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"xattr 1.0.0",
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5404,15 +5404,6 @@ dependencies = [
|
||||
"time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xattr"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xattr"
|
||||
version = "1.0.0"
|
||||
|
||||
@@ -108,10 +108,12 @@ pub fn get_pg_version(pgbin: &str) -> String {
|
||||
// pg_config --version returns a (platform specific) human readable string
|
||||
// such as "PostgreSQL 15.4". We parse this to v14/v15
|
||||
let human_version = get_pg_config("--version", pgbin);
|
||||
if human_version.contains("15") {
|
||||
return "v15".to_string();
|
||||
} else if human_version.contains("14") {
|
||||
if human_version.contains("14") {
|
||||
return "v14".to_string();
|
||||
} else if human_version.contains("15") {
|
||||
return "v15".to_string();
|
||||
} else if human_version.contains("16") {
|
||||
return "v16".to_string();
|
||||
}
|
||||
panic!("Unsuported postgres version {human_version}");
|
||||
}
|
||||
|
||||
@@ -4,7 +4,12 @@
|
||||
# to your expectations and requirements.
|
||||
|
||||
# Root options
|
||||
targets = []
|
||||
targets = [
|
||||
{ triple = "x86_64-unknown-linux-gnu" },
|
||||
{ triple = "aarch64-unknown-linux-gnu" },
|
||||
{ triple = "aarch64-apple-darwin" },
|
||||
{ triple = "x86_64-apple-darwin" },
|
||||
]
|
||||
all-features = false
|
||||
no-default-features = false
|
||||
feature-depth = 1
|
||||
@@ -18,7 +23,7 @@ vulnerability = "deny"
|
||||
unmaintained = "warn"
|
||||
yanked = "warn"
|
||||
notice = "warn"
|
||||
ignore = []
|
||||
ignore = ["RUSTSEC-2023-0052"]
|
||||
|
||||
# This section is considered when running `cargo deny check licenses`
|
||||
# More documentation for the licenses section can be found here:
|
||||
|
||||
@@ -215,7 +215,6 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
TimelineId::generate(),
|
||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
Lsn(i),
|
||||
false,
|
||||
0,
|
||||
);
|
||||
updates.insert_historic(layer);
|
||||
|
||||
@@ -6,7 +6,7 @@ use metrics::{
|
||||
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum::VariantNames;
|
||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -570,23 +570,160 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
|
||||
"get_rel_exists",
|
||||
"get_rel_size",
|
||||
"get_page_at_lsn",
|
||||
"get_db_size",
|
||||
];
|
||||
#[derive(Debug)]
|
||||
struct GlobalAndPerTimelineHistogram {
|
||||
global: Histogram,
|
||||
per_tenant_timeline: Histogram,
|
||||
}
|
||||
|
||||
pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
impl GlobalAndPerTimelineHistogram {
|
||||
fn observe(&self, value: f64) {
|
||||
self.global.observe(value);
|
||||
self.per_tenant_timeline.observe(value);
|
||||
}
|
||||
}
|
||||
|
||||
struct GlobalAndPerTimelineHistogramTimer<'a> {
|
||||
h: &'a GlobalAndPerTimelineHistogram,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
self.h.observe(elapsed.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
IntoStaticStr,
|
||||
strum_macros::EnumCount,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum SmgrQueryType {
|
||||
GetRelExists,
|
||||
GetRelSize,
|
||||
GetPageAtLsn,
|
||||
GetDbSize,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct SmgrQueryTimePerTimeline {
|
||||
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
|
||||
}
|
||||
|
||||
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"Time spent on smgr query handling",
|
||||
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds_global",
|
||||
"Time spent on smgr query handling, aggregated by query type.",
|
||||
&["smgr_query_type"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let metrics = std::array::from_fn(|i| {
|
||||
let op = SmgrQueryType::from_repr(i).unwrap();
|
||||
let global = SMGR_QUERY_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[op.into()])
|
||||
.unwrap();
|
||||
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
GlobalAndPerTimelineHistogram {
|
||||
global,
|
||||
per_tenant_timeline,
|
||||
}
|
||||
});
|
||||
Self { metrics }
|
||||
}
|
||||
pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
|
||||
let metric = &self.metrics[op as usize];
|
||||
GlobalAndPerTimelineHistogramTimer {
|
||||
h: metric,
|
||||
start: std::time::Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod smgr_query_time_tests {
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// Regression test, we used hard-coded string constants before using an enum.
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
use super::SmgrQueryType::*;
|
||||
let expect: [(super::SmgrQueryType, &'static str); 4] = [
|
||||
(GetRelExists, "get_rel_exists"),
|
||||
(GetRelSize, "get_rel_size"),
|
||||
(GetPageAtLsn, "get_page_at_lsn"),
|
||||
(GetDbSize, "get_db_size"),
|
||||
];
|
||||
for (op, expect) in expect {
|
||||
let actual: &'static str = op.into();
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() {
|
||||
let ops: Vec<_> = super::SmgrQueryType::iter().collect();
|
||||
|
||||
for op in &ops {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
.iter()
|
||||
.map(|op| metrics.metrics[*op as usize].global.get_sample_count())
|
||||
.sum();
|
||||
let per_tenant_timeline: u64 = ops
|
||||
.iter()
|
||||
.map(|op| {
|
||||
metrics.metrics[*op as usize]
|
||||
.per_tenant_timeline
|
||||
.get_sample_count()
|
||||
})
|
||||
.sum();
|
||||
(global, per_tenant_timeline)
|
||||
};
|
||||
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
let timer = metrics.start_timer(*op);
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(post_per_tenant_timeline, 1);
|
||||
assert!(post_global > pre_global);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
@@ -1045,6 +1182,12 @@ impl Drop for TimelineMetrics {
|
||||
.write()
|
||||
.unwrap()
|
||||
.remove(tenant_id, timeline_id);
|
||||
|
||||
// The following metrics are born outside of the TimelineMetrics lifecycle but still
|
||||
// removed at the end of it. The idea is to have the metrics outlive the
|
||||
// entity during which they're observed, e.g., the smgr metrics shall
|
||||
// outlive an individual smgr connection, but not the timeline.
|
||||
|
||||
for op in StorageTimeOperation::VARIANTS {
|
||||
let _ =
|
||||
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -1056,8 +1199,12 @@ impl Drop for TimelineMetrics {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in SMGR_QUERY_TIME_OPERATIONS {
|
||||
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
for op in SmgrQueryType::iter() {
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
op.into(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,7 +50,8 @@ use crate::basebackup;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant;
|
||||
@@ -306,39 +307,6 @@ async fn page_service_conn_main(
|
||||
}
|
||||
}
|
||||
|
||||
struct PageRequestMetrics {
|
||||
get_rel_exists: metrics::Histogram,
|
||||
get_rel_size: metrics::Histogram,
|
||||
get_page_at_lsn: metrics::Histogram,
|
||||
get_db_size: metrics::Histogram,
|
||||
}
|
||||
|
||||
impl PageRequestMetrics {
|
||||
fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
|
||||
let get_rel_exists =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_rel_size =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_page_at_lsn =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_db_size =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]);
|
||||
|
||||
Self {
|
||||
get_rel_exists,
|
||||
get_rel_size,
|
||||
get_page_at_lsn,
|
||||
get_db_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct PageServerHandler {
|
||||
_conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
@@ -406,7 +374,7 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
|
||||
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
@@ -446,21 +414,21 @@ impl PageServerHandler {
|
||||
|
||||
let response = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let _timer = metrics.get_rel_exists.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
|
||||
self.handle_get_rel_exists_request(&timeline, &req, &ctx)
|
||||
.await
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let _timer = metrics.get_rel_size.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
|
||||
self.handle_get_nblocks_request(&timeline, &req, &ctx).await
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
let _timer = metrics.get_page_at_lsn.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
|
||||
self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
|
||||
.await
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let _timer = metrics.get_db_size.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
|
||||
self.handle_db_size_request(&timeline, &req, &ctx).await
|
||||
}
|
||||
};
|
||||
@@ -984,8 +952,8 @@ where
|
||||
false
|
||||
};
|
||||
|
||||
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
||||
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
|
||||
@@ -72,7 +72,6 @@ use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
use crate::tenant::storage_layer::DeltaLayer;
|
||||
use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::InitializationOrder;
|
||||
|
||||
use crate::tenant::timeline::delete::DeleteTimelineFlow;
|
||||
|
||||
@@ -50,7 +50,6 @@ use crate::context::RequestContext;
|
||||
use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use anyhow::Result;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
|
||||
@@ -344,23 +344,6 @@ impl LayerAccessStats {
|
||||
/// are used in (timeline).
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
/// Inclusive start bound of the LSN range that this layer holds
|
||||
/// Exclusive end bound of the LSN range that this layer holds.
|
||||
///
|
||||
/// - For an open in-memory layer, this is MAX_LSN.
|
||||
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
fn get_lsn_range(&self) -> Range<Lsn>;
|
||||
|
||||
/// Does this layer only contain some data for the key-range (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
/// for garbage collecting old layers: an incremental layer depends on
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
|
||||
///
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
///
|
||||
@@ -380,9 +363,6 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
@@ -467,7 +447,6 @@ pub mod tests {
|
||||
TimelineId::from_array([0; 16]),
|
||||
value.key_range,
|
||||
value.lsn,
|
||||
false,
|
||||
233,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -239,8 +239,54 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for DeltaLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
self.local_path()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
self.delete_resident_layer_file()
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.info(reset)
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
self.access_stats()
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -312,7 +358,7 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -331,52 +377,19 @@ impl Layer for DeltaLayer {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for DeltaLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
pub(crate) fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_range = self.layer_desc().lsn_range.clone();
|
||||
|
||||
let access_stats = self.access_stats.as_api_model(reset);
|
||||
|
||||
@@ -390,12 +403,10 @@ impl PersistentLayer for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.access_stats
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
tenant_id: &TenantId,
|
||||
|
||||
@@ -169,8 +169,52 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
/// Look up given page in the file
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
self.local_path()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
self.delete_resident_layer_file()
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.info(reset)
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
self.access_stats()
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -178,7 +222,7 @@ impl Layer for ImageLayer {
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.lsn,
|
||||
self.desc.is_incremental,
|
||||
self.desc.is_incremental(),
|
||||
self.desc.file_size
|
||||
);
|
||||
|
||||
@@ -203,8 +247,7 @@ impl Layer for ImageLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
async fn get_value_reconstruct_data(
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -225,65 +268,33 @@ impl Layer for ImageLayer {
|
||||
.with_context(|| format!("read {}", self.path().display()))
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
pub(crate) fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_start = self.layer_desc().image_layer_lsn();
|
||||
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name,
|
||||
layer_file_size: self.desc.file_size,
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_start,
|
||||
remote: false,
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.access_stats
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -371,7 +382,6 @@ impl ImageLayer {
|
||||
timeline_id,
|
||||
filename.key_range.clone(),
|
||||
filename.lsn,
|
||||
false,
|
||||
file_size,
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: filename.lsn,
|
||||
@@ -398,7 +408,6 @@ impl ImageLayer {
|
||||
summary.timeline_id,
|
||||
summary.key_range,
|
||||
summary.lsn,
|
||||
false,
|
||||
metadata.len(),
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
@@ -500,7 +509,6 @@ struct ImageLayerWriterInner {
|
||||
tenant_id: TenantId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
|
||||
blob_writer: WriteBlobWriter<VirtualFile>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
@@ -516,7 +524,6 @@ impl ImageLayerWriterInner {
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
@@ -551,7 +558,6 @@ impl ImageLayerWriterInner {
|
||||
lsn,
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
is_incremental,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
@@ -612,7 +618,6 @@ impl ImageLayerWriterInner {
|
||||
self.timeline_id,
|
||||
self.key_range.clone(),
|
||||
self.lsn,
|
||||
self.is_incremental, // for now, image layer ALWAYS covers the full range
|
||||
metadata.len(),
|
||||
);
|
||||
|
||||
@@ -687,7 +692,6 @@ impl ImageLayerWriter {
|
||||
tenant_id: TenantId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(ImageLayerWriterInner::new(
|
||||
@@ -696,7 +700,6 @@ impl ImageLayerWriter {
|
||||
tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
is_incremental,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -85,11 +85,11 @@ impl std::fmt::Debug for InMemoryLayerInner {
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
pub fn get_timeline_id(&self) -> TimelineId {
|
||||
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
pub fn info(&self) -> InMemoryLayerInfo {
|
||||
pub(crate) fn info(&self) -> InMemoryLayerInfo {
|
||||
let lsn_start = self.start_lsn;
|
||||
|
||||
if let Some(&lsn_end) = self.end_lsn.get() {
|
||||
@@ -99,32 +99,22 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_writable(&self) {
|
||||
pub(crate) fn assert_writable(&self) {
|
||||
assert!(self.end_lsn.get().is_none());
|
||||
}
|
||||
|
||||
fn end_lsn_or_max(&self) -> Lsn {
|
||||
pub(crate) fn end_lsn_or_max(&self) -> Lsn {
|
||||
self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.start_lsn..self.end_lsn_or_max()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
// in-memory layer is always considered incremental.
|
||||
true
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
///
|
||||
/// this is likely completly unused
|
||||
pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let end_str = self.end_lsn_or_max();
|
||||
@@ -171,7 +161,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
async fn get_value_reconstruct_data(
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -221,6 +211,20 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let end_lsn = self.end_lsn_or_max();
|
||||
@@ -317,7 +321,7 @@ impl InMemoryLayer {
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
pub(crate) async fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
|
||||
@@ -19,16 +19,17 @@ use serde::{Deserialize, Serialize};
|
||||
pub struct PersistentLayerDesc {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
/// Range of keys that this layer covers
|
||||
pub key_range: Range<Key>,
|
||||
/// For image layer, this is `[lsn, lsn+1)`.
|
||||
/// Inclusive start, exclusive end of the LSN range that this layer holds.
|
||||
///
|
||||
/// - For an open in-memory layer, the end bound is MAX_LSN
|
||||
/// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
|
||||
/// range start
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
pub lsn_range: Range<Lsn>,
|
||||
/// Whether this is a delta layer.
|
||||
/// Whether this is a delta layer, and also, is this incremental.
|
||||
pub is_delta: bool,
|
||||
/// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
|
||||
/// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
|
||||
/// incremental.
|
||||
pub is_incremental: bool,
|
||||
/// File size
|
||||
pub file_size: u64,
|
||||
}
|
||||
|
||||
@@ -61,7 +62,6 @@ impl PersistentLayerDesc {
|
||||
key_range,
|
||||
lsn_range: Lsn(0)..Lsn(1),
|
||||
is_delta: false,
|
||||
is_incremental: false,
|
||||
file_size: 0,
|
||||
}
|
||||
}
|
||||
@@ -71,7 +71,6 @@ impl PersistentLayerDesc {
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
is_incremental: bool,
|
||||
file_size: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
@@ -80,7 +79,6 @@ impl PersistentLayerDesc {
|
||||
key_range,
|
||||
lsn_range: Self::image_layer_lsn_range(lsn),
|
||||
is_delta: false,
|
||||
is_incremental,
|
||||
file_size,
|
||||
}
|
||||
}
|
||||
@@ -98,7 +96,6 @@ impl PersistentLayerDesc {
|
||||
key_range,
|
||||
lsn_range,
|
||||
is_delta: true,
|
||||
is_incremental: true,
|
||||
file_size,
|
||||
}
|
||||
}
|
||||
@@ -164,8 +161,12 @@ impl PersistentLayerDesc {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
/// Does this layer only contain some data for the key-range (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
/// for garbage collecting old layers: an incremental layer depends on
|
||||
/// the previous non-incremental layer.
|
||||
pub fn is_incremental(&self) -> bool {
|
||||
self.is_incremental
|
||||
self.is_delta
|
||||
}
|
||||
|
||||
pub fn is_delta(&self) -> bool {
|
||||
@@ -182,7 +183,7 @@ impl PersistentLayerDesc {
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end,
|
||||
self.is_delta,
|
||||
self.is_incremental,
|
||||
self.is_incremental(),
|
||||
self.file_size,
|
||||
);
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ impl std::fmt::Debug for RemoteLayer {
|
||||
f.debug_struct("RemoteLayer")
|
||||
.field("file_name", &self.desc.filename())
|
||||
.field("layer_metadata", &self.layer_metadata)
|
||||
.field("is_incremental", &self.desc.is_incremental)
|
||||
.field("is_incremental", &self.desc.is_incremental())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -76,39 +76,6 @@ impl Layer for RemoteLayer {
|
||||
) -> Result<ValueReconstructResult> {
|
||||
bail!("layer {self} needs to be downloaded");
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.desc.lsn_range.start,
|
||||
self.desc.lsn_range.end,
|
||||
self.desc.is_delta,
|
||||
self.desc.is_incremental,
|
||||
self.desc.file_size,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.layer_desc().key_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.layer_desc().lsn_range.clone()
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.layer_desc().is_incremental
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
@@ -142,8 +109,8 @@ impl PersistentLayer for RemoteLayer {
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.filename().file_name();
|
||||
let lsn_range = self.get_lsn_range();
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_range = self.layer_desc().lsn_range.clone();
|
||||
|
||||
if self.desc.is_delta {
|
||||
HistoricLayerInfo::Delta {
|
||||
@@ -184,7 +151,6 @@ impl RemoteLayer {
|
||||
timelineid,
|
||||
fname.key_range.clone(),
|
||||
fname.lsn,
|
||||
false,
|
||||
layer_metadata.file_size(),
|
||||
),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
@@ -217,7 +183,7 @@ impl RemoteLayer {
|
||||
}
|
||||
|
||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||
pub fn create_downloaded_layer(
|
||||
pub(crate) fn create_downloaded_layer(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
@@ -95,7 +95,7 @@ use super::config::TenantConf;
|
||||
use super::remote_timeline_client::index::IndexPart;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::storage_layer::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStatsReset, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
@@ -3149,7 +3149,6 @@ impl Timeline {
|
||||
self.tenant_id,
|
||||
&img_range,
|
||||
lsn,
|
||||
false, // image layer always covers the full range
|
||||
)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
@@ -3410,8 +3409,8 @@ impl Timeline {
|
||||
/// start of level0 files compaction, the on-demand download should be revisited as well.
|
||||
///
|
||||
/// [`compact_inner`]: Self::compact_inner
|
||||
fn compact_level0_phase1(
|
||||
self: Arc<Self>,
|
||||
async fn compact_level0_phase1(
|
||||
self: &Arc<Self>,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
|
||||
mut stats: CompactLevel0Phase1StatsBuilder,
|
||||
@@ -3562,7 +3561,7 @@ impl Timeline {
|
||||
.collect();
|
||||
for dl in downcast_deltas.iter() {
|
||||
// TODO: replace this with an await once we fully go async
|
||||
all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
|
||||
all_keys.extend(DeltaLayer::load_keys(dl, ctx).await?);
|
||||
}
|
||||
|
||||
// The current stdlib sorting implementation is designed in a way where it is
|
||||
@@ -3676,107 +3675,103 @@ impl Timeline {
|
||||
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
|
||||
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
|
||||
|
||||
// TODO remove this block_on wrapper once we fully go async
|
||||
Handle::current().block_on(async {
|
||||
for &DeltaEntry {
|
||||
key, lsn, ref val, ..
|
||||
} in all_values_iter
|
||||
{
|
||||
let value = val.load().await?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
let mut next_key_size = 0u64;
|
||||
let is_dup_layer = dup_end_lsn.is_valid();
|
||||
dup_start_lsn = Lsn::INVALID;
|
||||
if !same_key {
|
||||
dup_end_lsn = Lsn::INVALID;
|
||||
}
|
||||
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
|
||||
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
|
||||
next_key_size = next_size;
|
||||
if key != next_key {
|
||||
if dup_end_lsn.is_valid() {
|
||||
// We are writting segment with duplicates:
|
||||
// place all remaining values of this key in separate segment
|
||||
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
|
||||
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
|
||||
}
|
||||
break;
|
||||
}
|
||||
key_values_total_size += next_size;
|
||||
// Check if it is time to split segment: if total keys size is larger than target file size.
|
||||
// We need to avoid generation of empty segments if next_size > target_file_size.
|
||||
if key_values_total_size > target_file_size && lsn != next_lsn {
|
||||
// Split key between multiple layers: such layer can contain only single key
|
||||
dup_start_lsn = if dup_end_lsn.is_valid() {
|
||||
dup_end_lsn // new segment with duplicates starts where old one stops
|
||||
} else {
|
||||
lsn // start with the first LSN for this key
|
||||
};
|
||||
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
|
||||
break;
|
||||
}
|
||||
}
|
||||
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
|
||||
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
|
||||
dup_start_lsn = dup_end_lsn;
|
||||
dup_end_lsn = lsn_range.end;
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if key cause layer overflow or contains hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember size of key value because at next iteration we will access next item
|
||||
key_values_total_size = next_key_size;
|
||||
for &DeltaEntry {
|
||||
key, lsn, ref val, ..
|
||||
} in all_values_iter
|
||||
{
|
||||
let value = val.load().await?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
let mut next_key_size = 0u64;
|
||||
let is_dup_layer = dup_end_lsn.is_valid();
|
||||
dup_start_lsn = Lsn::INVALID;
|
||||
if !same_key {
|
||||
dup_end_lsn = Lsn::INVALID;
|
||||
}
|
||||
if writer.is_none() {
|
||||
// Create writer if not initiaized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
|
||||
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
|
||||
next_key_size = next_size;
|
||||
if key != next_key {
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
// We are writting segment with duplicates:
|
||||
// place all remaining values of this key in separate segment
|
||||
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
|
||||
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
|
||||
}
|
||||
break;
|
||||
}
|
||||
key_values_total_size += next_size;
|
||||
// Check if it is time to split segment: if total keys size is larger than target file size.
|
||||
// We need to avoid generation of empty segments if next_size > target_file_size.
|
||||
if key_values_total_size > target_file_size && lsn != next_lsn {
|
||||
// Split key between multiple layers: such layer can contain only single key
|
||||
dup_start_lsn = if dup_end_lsn.is_valid() {
|
||||
dup_end_lsn // new segment with duplicates starts where old one stops
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
lsn // start with the first LSN for this key
|
||||
};
|
||||
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
|
||||
break;
|
||||
}
|
||||
}
|
||||
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
|
||||
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
|
||||
dup_start_lsn = dup_end_lsn;
|
||||
dup_end_lsn = lsn_range.end;
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if key cause layer overflow or contains hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Result::<_>::Err(anyhow::anyhow!(
|
||||
"failpoint delta-layer-writer-fail-before-finish"
|
||||
))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember size of key value because at next iteration we will access next item
|
||||
key_values_total_size = next_key_size;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
if writer.is_none() {
|
||||
// Create writer if not initiaized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Err(CompactionError::Other(anyhow::anyhow!(
|
||||
"failpoint delta-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
|
||||
}
|
||||
@@ -3789,10 +3784,10 @@ impl Timeline {
|
||||
// we still might easily hit the limit otherwise.
|
||||
let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
|
||||
for layer in new_layers.iter() {
|
||||
if layer.desc.file_size > warn_limit {
|
||||
if layer.layer_desc().file_size > warn_limit {
|
||||
warn!(
|
||||
%layer,
|
||||
"created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
|
||||
"created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -3810,7 +3805,7 @@ impl Timeline {
|
||||
|
||||
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
|
||||
stats.new_deltas_count = Some(new_layers.len());
|
||||
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());
|
||||
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
|
||||
|
||||
match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
|
||||
.and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
|
||||
@@ -3850,8 +3845,7 @@ impl Timeline {
|
||||
deltas_to_compact,
|
||||
} = {
|
||||
let phase1_span = info_span!("compact_level0_phase1");
|
||||
let myself = Arc::clone(self);
|
||||
let ctx = ctx.attached_child(); // technically, the spawn_blocking can outlive this future
|
||||
let ctx = ctx.attached_child();
|
||||
let mut stats = CompactLevel0Phase1StatsBuilder {
|
||||
version: Some(2),
|
||||
tenant_id: Some(self.tenant_id),
|
||||
@@ -3865,18 +3859,15 @@ impl Timeline {
|
||||
stats.read_lock_acquisition_micros =
|
||||
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
|
||||
let layer_removal_cs = layer_removal_cs.clone();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _entered = phase1_span.enter();
|
||||
myself.compact_level0_phase1(
|
||||
layer_removal_cs,
|
||||
phase1_layers_locked,
|
||||
stats,
|
||||
target_file_size,
|
||||
&ctx,
|
||||
)
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")??
|
||||
self.compact_level0_phase1(
|
||||
layer_removal_cs,
|
||||
phase1_layers_locked,
|
||||
stats,
|
||||
target_file_size,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(phase1_span)
|
||||
.await?
|
||||
};
|
||||
|
||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
tenant::{
|
||||
layer_map::{BatchedUpdates, LayerMap},
|
||||
storage_layer::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, Layer, PersistentLayer,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
|
||||
PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
|
||||
},
|
||||
timeline::compare_arced_layers,
|
||||
@@ -20,30 +20,30 @@ use crate::{
|
||||
};
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
pub struct LayerManager {
|
||||
pub(crate) struct LayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager,
|
||||
}
|
||||
|
||||
/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
|
||||
/// scheduling deletes in remote client.
|
||||
pub struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
|
||||
pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
|
||||
|
||||
impl ApplyGcResultGuard<'_> {
|
||||
pub fn flush(self) {
|
||||
pub(crate) fn flush(self) {
|
||||
self.0.flush();
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub fn create() -> Self {
|
||||
pub(crate) fn create() -> Self {
|
||||
Self {
|
||||
layer_map: LayerMap::default(),
|
||||
layer_fmgr: LayerFileManager::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
}
|
||||
|
||||
@@ -51,18 +51,12 @@ impl LayerManager {
|
||||
///
|
||||
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
|
||||
/// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
|
||||
pub fn layer_map(&self) -> &LayerMap {
|
||||
pub(crate) fn layer_map(&self) -> &LayerMap {
|
||||
&self.layer_map
|
||||
}
|
||||
|
||||
/// Get a mutable reference to the layer map. This function will be removed once `flush_frozen_layer`
|
||||
/// gets a refactor.
|
||||
pub fn layer_map_mut(&mut self) -> &mut LayerMap {
|
||||
&mut self.layer_map
|
||||
}
|
||||
|
||||
/// Replace layers in the layer file manager, used in evictions and layer downloads.
|
||||
pub fn replace_and_verify(
|
||||
pub(crate) fn replace_and_verify(
|
||||
&mut self,
|
||||
expected: Arc<dyn PersistentLayer>,
|
||||
new: Arc<dyn PersistentLayer>,
|
||||
@@ -73,7 +67,7 @@ impl LayerManager {
|
||||
/// Called from `load_layer_map`. Initialize the layer manager with:
|
||||
/// 1. all on-disk layers
|
||||
/// 2. next open layer (with disk disk_consistent_lsn LSN)
|
||||
pub fn initialize_local_layers(
|
||||
pub(crate) fn initialize_local_layers(
|
||||
&mut self,
|
||||
on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
next_open_layer_at: Lsn,
|
||||
@@ -87,11 +81,11 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Initialize when creating a new timeline, called in `init_empty_layer_map`.
|
||||
pub fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
|
||||
pub(crate) fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
|
||||
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||
}
|
||||
|
||||
pub fn initialize_remote_layers(
|
||||
pub(crate) fn initialize_remote_layers(
|
||||
&mut self,
|
||||
corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
remote_layers: Vec<Arc<RemoteLayer>>,
|
||||
@@ -108,7 +102,7 @@ impl LayerManager {
|
||||
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||
/// called within `get_layer_for_write`.
|
||||
pub fn get_layer_for_write(
|
||||
pub(crate) fn get_layer_for_write(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_record_lsn: Lsn,
|
||||
@@ -163,7 +157,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||
pub async fn try_freeze_in_memory_layer(
|
||||
pub(crate) async fn try_freeze_in_memory_layer(
|
||||
&mut self,
|
||||
Lsn(last_record_lsn): Lsn,
|
||||
last_freeze_at: &AtomicLsn,
|
||||
@@ -185,7 +179,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
pub fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
|
||||
pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in image_layers {
|
||||
Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
|
||||
@@ -194,7 +188,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Flush a frozen layer and add the written delta layer to the layer map.
|
||||
pub fn finish_flush_l0_layer(
|
||||
pub(crate) fn finish_flush_l0_layer(
|
||||
&mut self,
|
||||
delta_layer: Option<DeltaLayer>,
|
||||
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
||||
@@ -214,7 +208,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
pub fn finish_compact_l0(
|
||||
pub(crate) fn finish_compact_l0(
|
||||
&mut self,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
compact_from: Vec<Arc<dyn PersistentLayer>>,
|
||||
@@ -242,7 +236,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
|
||||
pub fn finish_gc_timeline(
|
||||
pub(crate) fn finish_gc_timeline(
|
||||
&mut self,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
gc_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
@@ -313,7 +307,7 @@ impl LayerManager {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||
pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||
HashMap<PersistentLayerKey, Arc<T>>,
|
||||
);
|
||||
|
||||
|
||||
@@ -15,8 +15,10 @@ use tokio::fs::File;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use utils::http::endpoint::request_span;
|
||||
|
||||
use crate::receive_wal::WalReceiverState;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::safekeeper::Term;
|
||||
use crate::send_wal::WalSenderState;
|
||||
use crate::{debug_dump, pull_timeline};
|
||||
|
||||
use crate::timelines_global_map::TimelineDeleteForceResult;
|
||||
@@ -99,6 +101,8 @@ pub struct TimelineStatus {
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
pub walsenders: Vec<WalSenderState>,
|
||||
pub walreceivers: Vec<WalReceiverState>,
|
||||
}
|
||||
|
||||
fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
|
||||
@@ -149,6 +153,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
backup_lsn: inmem.backup_lsn,
|
||||
peer_horizon_lsn: inmem.peer_horizon_lsn,
|
||||
remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
|
||||
walsenders: tli.get_walsenders().get_all(),
|
||||
walreceivers: tli.get_walreceivers().get_all(),
|
||||
};
|
||||
json_response(StatusCode::OK, status)
|
||||
}
|
||||
|
||||
@@ -11,11 +11,16 @@ use crate::wal_service::ConnectionId;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::BytesMut;
|
||||
use parking_lot::MappedMutexGuard;
|
||||
use parking_lot::Mutex;
|
||||
use parking_lot::MutexGuard;
|
||||
use postgres_backend::CopyStreamHandlerEnd;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::PostgresBackendReader;
|
||||
use postgres_backend::QueryError;
|
||||
use pq_proto::BeMessage;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::AsyncRead;
|
||||
@@ -32,6 +37,105 @@ use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Registry of WalReceivers (compute connections). Timeline holds it (wrapped
|
||||
/// in Arc).
|
||||
pub struct WalReceivers {
|
||||
mutex: Mutex<WalReceiversShared>,
|
||||
}
|
||||
|
||||
/// Id under which walreceiver is registered in shmem.
|
||||
type WalReceiverId = usize;
|
||||
|
||||
impl WalReceivers {
|
||||
pub fn new() -> Arc<WalReceivers> {
|
||||
Arc::new(WalReceivers {
|
||||
mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
|
||||
})
|
||||
}
|
||||
|
||||
/// Register new walreceiver. Returned guard provides access to the slot and
|
||||
/// automatically deregisters in Drop.
|
||||
pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
|
||||
let slots = &mut self.mutex.lock().slots;
|
||||
let walreceiver = WalReceiverState::Voting;
|
||||
// find empty slot or create new one
|
||||
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
|
||||
slots[pos] = Some(walreceiver);
|
||||
pos
|
||||
} else {
|
||||
let pos = slots.len();
|
||||
slots.push(Some(walreceiver));
|
||||
pos
|
||||
};
|
||||
WalReceiverGuard {
|
||||
id: pos,
|
||||
walreceivers: self.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get reference to locked slot contents. Slot must exist (registered
|
||||
/// earlier).
|
||||
fn get_slot<'a>(
|
||||
self: &'a Arc<WalReceivers>,
|
||||
id: WalReceiverId,
|
||||
) -> MappedMutexGuard<'a, WalReceiverState> {
|
||||
MutexGuard::map(self.mutex.lock(), |locked| {
|
||||
locked.slots[id]
|
||||
.as_mut()
|
||||
.expect("walreceiver doesn't exist")
|
||||
})
|
||||
}
|
||||
|
||||
/// Get number of walreceivers (compute connections).
|
||||
pub fn get_num(self: &Arc<WalReceivers>) -> usize {
|
||||
self.mutex.lock().slots.iter().flatten().count()
|
||||
}
|
||||
|
||||
/// Get state of all walreceivers.
|
||||
pub fn get_all(self: &Arc<WalReceivers>) -> Vec<WalReceiverState> {
|
||||
self.mutex.lock().slots.iter().flatten().cloned().collect()
|
||||
}
|
||||
|
||||
/// Unregister walsender.
|
||||
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
|
||||
let mut shared = self.mutex.lock();
|
||||
shared.slots[id] = None;
|
||||
}
|
||||
}
|
||||
|
||||
/// Only a few connections are expected (normally one), so store in Vec.
|
||||
struct WalReceiversShared {
|
||||
slots: Vec<Option<WalReceiverState>>,
|
||||
}
|
||||
|
||||
/// Walreceiver status. Currently only whether it passed voting stage and
|
||||
/// started receiving the stream, but it is easy to add more if needed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum WalReceiverState {
|
||||
Voting,
|
||||
Streaming,
|
||||
}
|
||||
|
||||
/// Scope guard to access slot in WalSenders registry and unregister from it in
|
||||
/// Drop.
|
||||
pub struct WalReceiverGuard {
|
||||
id: WalReceiverId,
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
}
|
||||
|
||||
impl WalReceiverGuard {
|
||||
/// Get reference to locked shared state contents.
|
||||
fn get(&self) -> MappedMutexGuard<WalReceiverState> {
|
||||
self.walreceivers.get_slot(self.id)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalReceiverGuard {
|
||||
fn drop(&mut self) {
|
||||
self.walreceivers.unregister(self.id);
|
||||
}
|
||||
}
|
||||
|
||||
const MSG_QUEUE_SIZE: usize = 256;
|
||||
const REPLY_QUEUE_SIZE: usize = 16;
|
||||
|
||||
@@ -246,10 +350,13 @@ impl WalAcceptor {
|
||||
/// it must mean that network thread terminated.
|
||||
async fn run(&mut self) -> anyhow::Result<()> {
|
||||
// Register the connection and defer unregister.
|
||||
self.tli.on_compute_connect().await?;
|
||||
let _guard = ComputeConnectionGuard {
|
||||
// Order of the next two lines is important: we want first to remove our entry and then
|
||||
// update status which depends on registered connections.
|
||||
let _compute_conn_guard = ComputeConnectionGuard {
|
||||
timeline: Arc::clone(&self.tli),
|
||||
};
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register();
|
||||
self.tli.update_status_notify().await?;
|
||||
|
||||
// After this timestamp we will stop processing AppendRequests and send a response
|
||||
// to the walproposer. walproposer sends at least one AppendRequest per second,
|
||||
@@ -263,6 +370,11 @@ impl WalAcceptor {
|
||||
}
|
||||
let mut next_msg = opt_msg.unwrap();
|
||||
|
||||
// Update walreceiver state in shmem for reporting.
|
||||
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
|
||||
*walreceiver_guard.get() = WalReceiverState::Streaming;
|
||||
}
|
||||
|
||||
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
|
||||
// loop through AppendRequest's while it's readily available to
|
||||
// write as many WAL as possible without fsyncing
|
||||
@@ -311,6 +423,7 @@ impl WalAcceptor {
|
||||
}
|
||||
}
|
||||
|
||||
/// Calls update_status_notify in drop to update timeline status.
|
||||
struct ComputeConnectionGuard {
|
||||
timeline: Arc<Timeline>,
|
||||
}
|
||||
@@ -318,11 +431,9 @@ struct ComputeConnectionGuard {
|
||||
impl Drop for ComputeConnectionGuard {
|
||||
fn drop(&mut self) {
|
||||
let tli = self.timeline.clone();
|
||||
// tokio forbids to call blocking_send inside the runtime, and see
|
||||
// comments in on_compute_disconnect why we call blocking_send.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = tli.on_compute_disconnect().await {
|
||||
error!("failed to unregister compute connection: {}", e);
|
||||
if let Err(e) = tli.update_status_notify().await {
|
||||
error!("failed to update timeline status: {}", e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use utils::{
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
|
||||
SafekeeperMemState, ServerInfo, Term,
|
||||
@@ -164,8 +165,8 @@ impl SharedState {
|
||||
})
|
||||
}
|
||||
|
||||
fn is_active(&self, remote_consistent_lsn: Lsn) -> bool {
|
||||
self.is_wal_backup_required()
|
||||
fn is_active(&self, num_computes: usize, remote_consistent_lsn: Lsn) -> bool {
|
||||
self.is_wal_backup_required(num_computes)
|
||||
// FIXME: add tracking of relevant pageservers and check them here individually,
|
||||
// otherwise migration won't work (we suspend too early).
|
||||
|| remote_consistent_lsn < self.sk.inmem.commit_lsn
|
||||
@@ -173,29 +174,34 @@ impl SharedState {
|
||||
|
||||
/// Mark timeline active/inactive and return whether s3 offloading requires
|
||||
/// start/stop action.
|
||||
fn update_status(&mut self, remote_consistent_lsn: Lsn, ttid: TenantTimelineId) -> bool {
|
||||
let is_active = self.is_active(remote_consistent_lsn);
|
||||
fn update_status(
|
||||
&mut self,
|
||||
num_computes: usize,
|
||||
remote_consistent_lsn: Lsn,
|
||||
ttid: TenantTimelineId,
|
||||
) -> bool {
|
||||
let is_active = self.is_active(num_computes, remote_consistent_lsn);
|
||||
if self.active != is_active {
|
||||
info!("timeline {} active={} now", ttid, is_active);
|
||||
}
|
||||
self.active = is_active;
|
||||
self.is_wal_backup_action_pending()
|
||||
self.is_wal_backup_action_pending(num_computes)
|
||||
}
|
||||
|
||||
/// Should we run s3 offloading in current state?
|
||||
fn is_wal_backup_required(&self) -> bool {
|
||||
fn is_wal_backup_required(&self, num_computes: usize) -> bool {
|
||||
let seg_size = self.get_wal_seg_size();
|
||||
self.num_computes > 0 ||
|
||||
num_computes > 0 ||
|
||||
// Currently only the whole segment is offloaded, so compare segment numbers.
|
||||
(self.sk.inmem.commit_lsn.segment_number(seg_size) >
|
||||
self.sk.inmem.backup_lsn.segment_number(seg_size))
|
||||
(self.sk.inmem.commit_lsn.segment_number(seg_size) >
|
||||
self.sk.inmem.backup_lsn.segment_number(seg_size))
|
||||
}
|
||||
|
||||
/// Is current state of s3 offloading is not what it ought to be?
|
||||
fn is_wal_backup_action_pending(&self) -> bool {
|
||||
let res = self.wal_backup_active != self.is_wal_backup_required();
|
||||
fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
|
||||
let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
|
||||
if res {
|
||||
let action_pending = if self.is_wal_backup_required() {
|
||||
let action_pending = if self.is_wal_backup_required(num_computes) {
|
||||
"start"
|
||||
} else {
|
||||
"stop"
|
||||
@@ -210,8 +216,8 @@ impl SharedState {
|
||||
|
||||
/// Returns whether s3 offloading is required and sets current status as
|
||||
/// matching.
|
||||
fn wal_backup_attend(&mut self) -> bool {
|
||||
self.wal_backup_active = self.is_wal_backup_required();
|
||||
fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
|
||||
self.wal_backup_active = self.is_wal_backup_required(num_computes);
|
||||
self.wal_backup_active
|
||||
}
|
||||
|
||||
@@ -295,6 +301,7 @@ pub struct Timeline {
|
||||
/// while holding it, ensuring that consensus checks are in order.
|
||||
mutex: Mutex<SharedState>,
|
||||
walsenders: Arc<WalSenders>,
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
|
||||
/// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
|
||||
cancellation_tx: watch::Sender<bool>,
|
||||
@@ -329,6 +336,7 @@ impl Timeline {
|
||||
commit_lsn_watch_rx,
|
||||
mutex: Mutex::new(shared_state),
|
||||
walsenders: WalSenders::new(rcl),
|
||||
walreceivers: WalReceivers::new(),
|
||||
cancellation_rx,
|
||||
cancellation_tx,
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
@@ -355,6 +363,7 @@ impl Timeline {
|
||||
commit_lsn_watch_rx,
|
||||
mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
|
||||
walsenders: WalSenders::new(Lsn(0)),
|
||||
walreceivers: WalReceivers::new(),
|
||||
cancellation_rx,
|
||||
cancellation_tx,
|
||||
timeline_dir: conf.timeline_dir(&ttid),
|
||||
@@ -441,40 +450,22 @@ impl Timeline {
|
||||
}
|
||||
|
||||
fn update_status(&self, shared_state: &mut SharedState) -> bool {
|
||||
shared_state.update_status(self.get_walsenders().get_remote_consistent_lsn(), self.ttid)
|
||||
shared_state.update_status(
|
||||
self.walreceivers.get_num(),
|
||||
self.get_walsenders().get_remote_consistent_lsn(),
|
||||
self.ttid,
|
||||
)
|
||||
}
|
||||
|
||||
/// Register compute connection, starting timeline-related activity if it is
|
||||
/// not running yet.
|
||||
pub async fn on_compute_connect(&self) -> Result<()> {
|
||||
/// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
|
||||
pub async fn update_status_notify(&self) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
let is_wal_backup_action_pending: bool;
|
||||
{
|
||||
let is_wal_backup_action_pending: bool = {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.num_computes += 1;
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state);
|
||||
}
|
||||
// Wake up wal backup launcher, if offloading not started yet.
|
||||
if is_wal_backup_action_pending {
|
||||
// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
self.wal_backup_launcher_tx.send(self.ttid).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// De-register compute connection, shutting down timeline activity if
|
||||
/// pageserver doesn't need catchup.
|
||||
pub async fn on_compute_disconnect(&self) -> Result<()> {
|
||||
let is_wal_backup_action_pending: bool;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.num_computes -= 1;
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state);
|
||||
}
|
||||
// Wake up wal backup launcher, if it is time to stop the offloading.
|
||||
self.update_status(&mut shared_state)
|
||||
};
|
||||
if is_wal_backup_action_pending {
|
||||
// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
self.wal_backup_launcher_tx.send(self.ttid).await?;
|
||||
@@ -519,7 +510,9 @@ impl Timeline {
|
||||
return false;
|
||||
}
|
||||
|
||||
self.write_shared_state().await.wal_backup_attend()
|
||||
self.write_shared_state()
|
||||
.await
|
||||
.wal_backup_attend(self.walreceivers.get_num())
|
||||
}
|
||||
|
||||
/// Returns commit_lsn watch channel.
|
||||
@@ -650,6 +643,10 @@ impl Timeline {
|
||||
&self.walsenders
|
||||
}
|
||||
|
||||
pub fn get_walreceivers(&self) -> &Arc<WalReceivers> {
|
||||
&self.walreceivers
|
||||
}
|
||||
|
||||
/// Returns flush_lsn.
|
||||
pub async fn get_flush_lsn(&self) -> Lsn {
|
||||
self.write_shared_state().await.sk.wal_store.flush_lsn()
|
||||
|
||||
@@ -70,6 +70,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_getpage_reconstruct_seconds_count",
|
||||
"pageserver_getpage_reconstruct_seconds_sum",
|
||||
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
||||
*histogram("pageserver_smgr_query_seconds_global"),
|
||||
*histogram("pageserver_read_num_fs_layers"),
|
||||
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
|
||||
*histogram("pageserver_wait_lsn_seconds"),
|
||||
|
||||
Reference in New Issue
Block a user