mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-19 06:00:38 +00:00
Compare commits
13 Commits
vlad/try-b
...
proxy-leak
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef7e96fb4e | ||
|
|
54c5196f75 | ||
|
|
2416da337e | ||
|
|
6cad0455b0 | ||
|
|
b5e95f68b5 | ||
|
|
dd40b19db4 | ||
|
|
68241f5a3e | ||
|
|
8154e88732 | ||
|
|
240ba7e10c | ||
|
|
7a796a9963 | ||
|
|
eddfd62333 | ||
|
|
cdaa2816e7 | ||
|
|
3cecbfc04d |
@@ -131,8 +131,8 @@ runs:
|
||||
exit 1
|
||||
fi
|
||||
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
|
||||
# -n16 uses sixteen processes to run tests via pytest-xdist
|
||||
EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
|
||||
# -n sets the number of parallel processes that pytest-xdist will run
|
||||
EXTRA_PARAMS="-n12 $EXTRA_PARAMS"
|
||||
|
||||
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
|
||||
# to the same worker to make @pytest.mark.order work with xdist
|
||||
|
||||
@@ -278,9 +278,6 @@ jobs:
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: true
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
|
||||
3
.github/workflows/build_and_test.yml
vendored
3
.github/workflows/build_and_test.yml
vendored
@@ -286,9 +286,6 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
PAGESERVER_GET_IMPL: vectored
|
||||
PAGESERVER_VALIDATE_VEC_GET: false
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/storage_controller @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/libs/safekeeper_api/ @neondatabase/storage
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
|
||||
/proxy/ @neondatabase/proxy
|
||||
/safekeeper/ @neondatabase/safekeepers
|
||||
/safekeeper/ @neondatabase/storage
|
||||
/vendor/ @neondatabase/compute
|
||||
|
||||
@@ -52,7 +52,7 @@ pub mod defaults {
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
|
||||
@@ -83,16 +83,16 @@ pub mod defaults {
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored";
|
||||
|
||||
pub const DEFAULT_GET_IMPL: &str = "legacy";
|
||||
pub const DEFAULT_GET_IMPL: &str = "vectored";
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Disabled;
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
|
||||
@@ -102,8 +102,7 @@ use std::fmt::Debug;
|
||||
use std::fmt::Display;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -1227,11 +1226,29 @@ impl Tenant {
|
||||
Ok(timeline_preloads)
|
||||
}
|
||||
|
||||
pub async fn apply_timeline_archival_config(
|
||||
pub(crate) async fn apply_timeline_archival_config(
|
||||
&self,
|
||||
_timeline_id: TimelineId,
|
||||
_config: TimelineArchivalState,
|
||||
timeline_id: TimelineId,
|
||||
state: TimelineArchivalState,
|
||||
) -> anyhow::Result<()> {
|
||||
let timeline = self
|
||||
.get_timeline(timeline_id, false)
|
||||
.context("Cannot apply timeline archival config to inexistent timeline")?;
|
||||
|
||||
let upload_needed = timeline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_timeline_archival_state(state)?;
|
||||
|
||||
if upload_needed {
|
||||
const MAX_WAIT: Duration = Duration::from_secs(10);
|
||||
let Ok(v) =
|
||||
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
|
||||
else {
|
||||
tracing::warn!("reached timeout for waiting on upload queue");
|
||||
bail!("reached timeout for upload queue flush");
|
||||
};
|
||||
v?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1616,21 +1633,23 @@ impl Tenant {
|
||||
/// This function is periodically called by compactor task.
|
||||
/// Also it can be explicitly requested per timeline through page server
|
||||
/// api's 'compact' command.
|
||||
///
|
||||
/// Returns whether we have pending compaction task.
|
||||
async fn compaction_iteration(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), timeline::CompactionError> {
|
||||
) -> Result<bool, timeline::CompactionError> {
|
||||
// Don't start doing work during shutdown, or when broken, we do not need those in the logs
|
||||
if !self.is_active() {
|
||||
return Ok(());
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
{
|
||||
let conf = self.tenant_conf.load();
|
||||
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
|
||||
info!("Skipping compaction in location state {:?}", conf.location);
|
||||
return Ok(());
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1657,11 +1676,13 @@ impl Tenant {
|
||||
// Before doing any I/O work, check our circuit breaker
|
||||
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
|
||||
info!("Skipping compaction due to previous failures");
|
||||
return Ok(());
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let mut has_pending_task = false;
|
||||
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
timeline
|
||||
has_pending_task |= timeline
|
||||
.compact(cancel, EnumSet::empty(), ctx)
|
||||
.instrument(info_span!("compact_timeline", %timeline_id))
|
||||
.await
|
||||
@@ -1681,7 +1702,7 @@ impl Tenant {
|
||||
.unwrap()
|
||||
.success(&CIRCUIT_BREAKERS_UNBROKEN);
|
||||
|
||||
Ok(())
|
||||
Ok(has_pending_task)
|
||||
}
|
||||
|
||||
// Call through to all timelines to freeze ephemeral layers if needed. Usually
|
||||
|
||||
@@ -187,7 +187,7 @@ use camino::Utf8Path;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
|
||||
pub(crate) use download::download_initdb_tar_zst;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -457,6 +457,17 @@ impl RemoteTimelineClient {
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Returns whether the timeline is archived.
|
||||
/// Return None if the remote index_part hasn't been downloaded yet.
|
||||
pub(crate) fn is_archived(&self) -> Option<bool> {
|
||||
self.upload_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.initialized_mut()
|
||||
.map(|q| q.clean.0.archived_at.is_some())
|
||||
.ok()
|
||||
}
|
||||
|
||||
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
|
||||
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
|
||||
current_remote_index_part
|
||||
@@ -617,7 +628,7 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
|
||||
/// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
|
||||
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
|
||||
self: &Arc<Self>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
@@ -628,6 +639,48 @@ impl RemoteTimelineClient {
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
|
||||
///
|
||||
/// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
|
||||
/// so either if the change is already sitting in the queue, but not commited yet, or the change has not
|
||||
/// been in the queue yet.
|
||||
pub(crate) fn schedule_index_upload_for_timeline_archival_state(
|
||||
self: &Arc<Self>,
|
||||
state: TimelineArchivalState,
|
||||
) -> anyhow::Result<bool> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
/// Returns Some(_) if a change is needed, and Some(true) if it's a
|
||||
/// change needed to set archived_at.
|
||||
fn need_change(
|
||||
archived_at: &Option<NaiveDateTime>,
|
||||
state: TimelineArchivalState,
|
||||
) -> Option<bool> {
|
||||
match (archived_at, state) {
|
||||
(Some(_), TimelineArchivalState::Archived)
|
||||
| (None, TimelineArchivalState::Unarchived) => {
|
||||
// Nothing to do
|
||||
tracing::info!("intended state matches present state");
|
||||
None
|
||||
}
|
||||
(None, TimelineArchivalState::Archived) => Some(true),
|
||||
(Some(_), TimelineArchivalState::Unarchived) => Some(false),
|
||||
}
|
||||
}
|
||||
let need_upload_scheduled = need_change(&upload_queue.dirty.archived_at, state);
|
||||
|
||||
if let Some(archived_at_set) = need_upload_scheduled {
|
||||
let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc());
|
||||
upload_queue.dirty.archived_at = intended_archived_at;
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
}
|
||||
|
||||
let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some();
|
||||
Ok(need_wait)
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
|
||||
@@ -32,6 +32,10 @@ pub struct IndexPart {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_at: Option<NaiveDateTime>,
|
||||
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub archived_at: Option<NaiveDateTime>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
@@ -80,10 +84,11 @@ impl IndexPart {
|
||||
/// - 5: lineage was added
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
/// - 7: metadata_bytes is no longer written, but still read
|
||||
const LATEST_VERSION: usize = 7;
|
||||
/// - 8: added `archived_at`
|
||||
const LATEST_VERSION: usize = 8;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -94,6 +99,7 @@ impl IndexPart {
|
||||
disk_consistent_lsn: metadata.disk_consistent_lsn(),
|
||||
metadata,
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Default::default(),
|
||||
last_aux_file_policy: None,
|
||||
}
|
||||
@@ -284,6 +290,7 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
@@ -326,6 +333,7 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
@@ -369,6 +377,7 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
@@ -415,6 +424,7 @@ mod tests {
|
||||
])
|
||||
.unwrap(),
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
@@ -456,6 +466,7 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
@@ -496,6 +507,7 @@ mod tests {
|
||||
disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
@@ -545,6 +557,7 @@ mod tests {
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage {
|
||||
reparenting_history_truncated: false,
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
@@ -603,6 +616,63 @@ mod tests {
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Default::default(),
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v8_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 8,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"deleted_at": "2023-07-31T09:00:00.123",
|
||||
"archived_at": "2023-04-29T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 8,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
|
||||
lineage: Default::default(),
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
@@ -307,12 +307,10 @@ impl DeltaLayer {
|
||||
.with_context(|| format!("Failed to load delta layer {}", self.path()))
|
||||
}
|
||||
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, None, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?;
|
||||
|
||||
// not production code
|
||||
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
|
||||
@@ -760,27 +758,24 @@ impl DeltaLayerInner {
|
||||
&self.layer_lsn_range
|
||||
}
|
||||
|
||||
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
||||
/// - inner has the success or transient failure
|
||||
/// - outer has the permanent failure
|
||||
pub(super) async fn load(
|
||||
path: &Utf8Path,
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path, ctx).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
|
||||
let file_id = page_cache::next_file_id();
|
||||
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
|
||||
let summary_blk = match block_reader.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
let summary_blk = block_reader
|
||||
.read_blk(0, ctx)
|
||||
.await
|
||||
.context("read first block")?;
|
||||
|
||||
// TODO: this should be an assertion instead; see ImageLayerInner::load
|
||||
let actual_summary =
|
||||
@@ -802,7 +797,7 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Ok(DeltaLayerInner {
|
||||
Ok(DeltaLayerInner {
|
||||
file,
|
||||
file_id,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
@@ -810,7 +805,7 @@ impl DeltaLayerInner {
|
||||
max_vectored_read_bytes,
|
||||
layer_key_range: actual_summary.key_range,
|
||||
layer_lsn_range: actual_summary.lsn_range,
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
|
||||
@@ -265,9 +265,8 @@ impl ImageLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
let loaded =
|
||||
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?;
|
||||
|
||||
// not production code
|
||||
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
|
||||
@@ -385,17 +384,16 @@ impl ImageLayerInner {
|
||||
summary: Option<Summary>,
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path, ctx).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
let summary_blk = match block_reader.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
let summary_blk = block_reader
|
||||
.read_blk(0, ctx)
|
||||
.await
|
||||
.context("read first block")?;
|
||||
|
||||
// length is the only way how this could fail, so it's not actually likely at all unless
|
||||
// read_blk returns wrong sized block.
|
||||
@@ -420,7 +418,7 @@ impl ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Ok(ImageLayerInner {
|
||||
Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
lsn,
|
||||
@@ -428,7 +426,7 @@ impl ImageLayerInner {
|
||||
file_id,
|
||||
max_vectored_read_bytes,
|
||||
key_range: actual_summary.key_range,
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
|
||||
@@ -1651,8 +1651,9 @@ impl Drop for DownloadedLayer {
|
||||
}
|
||||
|
||||
impl DownloadedLayer {
|
||||
/// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to
|
||||
/// initialize it permanently.
|
||||
/// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`].
|
||||
/// Failure to load the layer is sticky, i.e., future `get()` calls will return
|
||||
/// the initial load failure immediately.
|
||||
///
|
||||
/// `owner` parameter is a strong reference at the same `LayerInner` as the
|
||||
/// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called,
|
||||
@@ -1683,7 +1684,7 @@ impl DownloadedLayer {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Delta))
|
||||
.map(LayerKind::Delta)
|
||||
} else {
|
||||
let lsn = owner.desc.image_layer_lsn();
|
||||
let summary = Some(image_layer::Summary::expected(
|
||||
@@ -1700,32 +1701,29 @@ impl DownloadedLayer {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Image))
|
||||
.map(LayerKind::Image)
|
||||
};
|
||||
|
||||
match res {
|
||||
Ok(Ok(layer)) => Ok(Ok(layer)),
|
||||
Ok(Err(transient)) => Err(transient),
|
||||
Err(permanent) => {
|
||||
Ok(layer) => Ok(layer),
|
||||
Err(err) => {
|
||||
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
|
||||
// TODO(#5815): we are not logging all errors, so temporarily log them **once**
|
||||
// here as well
|
||||
let permanent = permanent.context("load layer");
|
||||
tracing::error!("layer loading failed permanently: {permanent:#}");
|
||||
Ok(Err(permanent))
|
||||
// We log this message once over the lifetime of `Self`
|
||||
// => Ok and good to log backtrace and path here.
|
||||
tracing::error!(
|
||||
"layer load failed, assuming permanent failure: {}: {err:?}",
|
||||
owner.path
|
||||
);
|
||||
Err(err)
|
||||
}
|
||||
}
|
||||
};
|
||||
self.kind
|
||||
.get_or_try_init(init)
|
||||
// return transient errors using `?`
|
||||
.await?
|
||||
.get_or_init(init)
|
||||
.await
|
||||
.as_ref()
|
||||
.map_err(|e| {
|
||||
// errors are not clonabled, cannot but stringify
|
||||
// test_broken_timeline matches this string
|
||||
anyhow::anyhow!("layer loading failed: {e:#}")
|
||||
})
|
||||
// We already logged the full backtrace above, once. Don't repeat that here.
|
||||
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
@@ -1760,7 +1758,11 @@ impl DownloadedLayer {
|
||||
) -> Result<(), GetVectoredError> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
|
||||
match self
|
||||
.get(owner, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?
|
||||
{
|
||||
Delta(d) => {
|
||||
d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
|
||||
@@ -210,24 +210,28 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
wait_duration
|
||||
} else {
|
||||
error_run_count = 0;
|
||||
period
|
||||
match tenant.compaction_iteration(&cancel, &ctx).await {
|
||||
Err(e) => {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
wait_duration
|
||||
}
|
||||
Ok(has_pending_task) => {
|
||||
error_run_count = 0;
|
||||
// schedule the next compaction immediately in case there is a pending compaction task
|
||||
if has_pending_task { Duration::from_secs(0) } else { period }
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1769,13 +1769,14 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
|
||||
/// compaction tasks.
|
||||
pub(crate) async fn compact(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
) -> Result<bool, CompactionError> {
|
||||
// most likely the cancellation token is from background task, but in tests it could be the
|
||||
// request task as well.
|
||||
|
||||
@@ -1795,8 +1796,8 @@ impl Timeline {
|
||||
// compaction task goes over it's period (20s) which is quite often in production.
|
||||
let (_guard, _permit) = tokio::select! {
|
||||
tuple = prepare => { tuple },
|
||||
_ = self.cancel.cancelled() => return Ok(()),
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
_ = self.cancel.cancelled() => return Ok(false),
|
||||
_ = cancel.cancelled() => return Ok(false),
|
||||
};
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
@@ -1804,11 +1805,14 @@ impl Timeline {
|
||||
// Last record Lsn could be zero in case the timeline was just created
|
||||
if !last_record_lsn.is_valid() {
|
||||
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
|
||||
return Ok(());
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
match self.get_compaction_algorithm_settings().kind {
|
||||
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
|
||||
CompactionAlgorithm::Tiered => {
|
||||
self.compact_tiered(cancel, ctx).await?;
|
||||
Ok(false)
|
||||
}
|
||||
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
|
||||
}
|
||||
}
|
||||
@@ -1997,6 +2001,11 @@ impl Timeline {
|
||||
self.current_state() == TimelineState::Active
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
pub(crate) fn is_archived(&self) -> Option<bool> {
|
||||
self.remote_client.is_archived()
|
||||
}
|
||||
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
self.current_state() == TimelineState::Stopping
|
||||
}
|
||||
|
||||
@@ -102,17 +102,19 @@ impl KeyHistoryRetention {
|
||||
|
||||
impl Timeline {
|
||||
/// TODO: cancellation
|
||||
///
|
||||
/// Returns whether the compaction has pending tasks.
|
||||
pub(crate) async fn compact_legacy(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
) -> Result<bool, CompactionError> {
|
||||
if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
|
||||
return self
|
||||
.compact_with_gc(cancel, ctx)
|
||||
self.compact_with_gc(cancel, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other);
|
||||
.map_err(CompactionError::Other)?;
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
// High level strategy for compaction / image creation:
|
||||
@@ -160,7 +162,7 @@ impl Timeline {
|
||||
// Define partitioning schema if needed
|
||||
|
||||
// FIXME: the match should only cover repartitioning, not the next steps
|
||||
let partition_count = match self
|
||||
let (partition_count, has_pending_tasks) = match self
|
||||
.repartition(
|
||||
self.get_last_record_lsn(),
|
||||
self.get_compaction_target_size(),
|
||||
@@ -177,30 +179,35 @@ impl Timeline {
|
||||
|
||||
// 2. Compact
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
self.compact_level0(target_file_size, ctx).await?;
|
||||
let fully_compacted = self.compact_level0(target_file_size, ctx).await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
// 3. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let mut partitioning = dense_partitioning;
|
||||
partitioning
|
||||
.parts
|
||||
.extend(sparse_partitioning.into_dense().parts);
|
||||
let image_layers = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
lsn,
|
||||
if flags.contains(CompactFlags::ForceImageLayerCreation) {
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
partitioning.parts.len()
|
||||
// 3. Create new image layers for partitions that have been modified
|
||||
// "enough". Skip image layer creation if L0 compaction cannot keep up.
|
||||
if fully_compacted {
|
||||
let image_layers = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
lsn,
|
||||
if flags.contains(CompactFlags::ForceImageLayerCreation) {
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
} else {
|
||||
info!("skipping image layer generation due to L0 compaction did not include all layers.");
|
||||
}
|
||||
(partitioning.parts.len(), !fully_compacted)
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -212,7 +219,7 @@ impl Timeline {
|
||||
if !self.cancel.is_cancelled() {
|
||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
}
|
||||
1
|
||||
(1, false)
|
||||
}
|
||||
};
|
||||
|
||||
@@ -225,7 +232,7 @@ impl Timeline {
|
||||
self.compact_shard_ancestors(rewrite_max, ctx).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(has_pending_tasks)
|
||||
}
|
||||
|
||||
/// Check for layers that are elegible to be rewritten:
|
||||
@@ -432,15 +439,16 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
|
||||
/// as Level 1 files.
|
||||
/// as Level 1 files. Returns whether the L0 layers are fully compacted.
|
||||
async fn compact_level0(
|
||||
self: &Arc<Self>,
|
||||
target_file_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
) -> Result<bool, CompactionError> {
|
||||
let CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact,
|
||||
fully_compacted,
|
||||
} = {
|
||||
let phase1_span = info_span!("compact_level0_phase1");
|
||||
let ctx = ctx.attached_child();
|
||||
@@ -463,12 +471,12 @@ impl Timeline {
|
||||
|
||||
if new_layers.is_empty() && deltas_to_compact.is_empty() {
|
||||
// nothing to do
|
||||
return Ok(());
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
|
||||
.await?;
|
||||
Ok(())
|
||||
Ok(fully_compacted)
|
||||
}
|
||||
|
||||
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
|
||||
@@ -535,6 +543,8 @@ impl Timeline {
|
||||
) as u64
|
||||
* std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
|
||||
|
||||
let mut fully_compacted = true;
|
||||
|
||||
deltas_to_compact.push(
|
||||
first_level0_delta
|
||||
.download_and_keep_resident()
|
||||
@@ -562,6 +572,7 @@ impl Timeline {
|
||||
"L0 compaction picker hit max delta layer size limit: {}",
|
||||
delta_size_limit
|
||||
);
|
||||
fully_compacted = false;
|
||||
|
||||
// Proceed with compaction, but only a subset of L0s
|
||||
break;
|
||||
@@ -923,6 +934,7 @@ impl Timeline {
|
||||
.into_iter()
|
||||
.map(|x| x.drop_eviction_guard())
|
||||
.collect::<Vec<_>>(),
|
||||
fully_compacted,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -931,6 +943,9 @@ impl Timeline {
|
||||
struct CompactLevel0Phase1Result {
|
||||
new_layers: Vec<ResidentLayer>,
|
||||
deltas_to_compact: Vec<Layer>,
|
||||
// Whether we have included all L0 layers, or selected only part of them due to the
|
||||
// L0 compaction size limit.
|
||||
fully_compacted: bool,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -5,6 +5,4 @@ pub use limit_algorithm::{
|
||||
};
|
||||
pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
|
||||
mod leaky_bucket;
|
||||
pub use leaky_bucket::{
|
||||
EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
|
||||
};
|
||||
pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter};
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::{
|
||||
hash::Hash,
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use ahash::RandomState;
|
||||
@@ -16,7 +17,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;
|
||||
|
||||
pub struct LeakyBucketRateLimiter<Key> {
|
||||
map: DashMap<Key, LeakyBucketState, RandomState>,
|
||||
config: LeakyBucketConfig,
|
||||
config: LeakyBucketConfigInner,
|
||||
access_count: AtomicUsize,
|
||||
}
|
||||
|
||||
@@ -29,7 +30,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
|
||||
pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
|
||||
Self {
|
||||
map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
|
||||
config,
|
||||
config: config.into(),
|
||||
access_count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
@@ -42,10 +43,10 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
|
||||
self.do_gc(now);
|
||||
}
|
||||
|
||||
let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState {
|
||||
time: now,
|
||||
filled: 0.0,
|
||||
});
|
||||
let mut entry = self
|
||||
.map
|
||||
.entry(key)
|
||||
.or_insert_with(|| LeakyBucketState::new(now));
|
||||
|
||||
entry.check(&self.config, now, n as f64)
|
||||
}
|
||||
@@ -59,7 +60,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
|
||||
let shard = thread_rng().gen_range(0..n);
|
||||
self.map.shards()[shard]
|
||||
.write()
|
||||
.retain(|_, value| !value.get_mut().update(&self.config, now));
|
||||
.retain(|_, value| value.get().should_retain(now));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,11 +69,6 @@ pub struct LeakyBucketConfig {
|
||||
pub max: f64,
|
||||
}
|
||||
|
||||
pub struct LeakyBucketState {
|
||||
filled: f64,
|
||||
time: Instant,
|
||||
}
|
||||
|
||||
impl LeakyBucketConfig {
|
||||
pub fn new(rps: f64, max: f64) -> Self {
|
||||
assert!(rps > 0.0, "rps must be positive");
|
||||
@@ -81,40 +77,76 @@ impl LeakyBucketConfig {
|
||||
}
|
||||
}
|
||||
|
||||
impl LeakyBucketState {
|
||||
pub fn new() -> Self {
|
||||
struct LeakyBucketConfigInner {
|
||||
/// "time cost" of a single request unit.
|
||||
/// loosely represents how long it takes to handle a request unit in active CPU time.
|
||||
time_cost: Duration,
|
||||
bucket_width: Duration,
|
||||
}
|
||||
|
||||
impl From<LeakyBucketConfig> for LeakyBucketConfigInner {
|
||||
fn from(config: LeakyBucketConfig) -> Self {
|
||||
// seconds_per_request = 1/(request_per_second)
|
||||
let spr = config.rps.recip();
|
||||
Self {
|
||||
filled: 0.0,
|
||||
time: Instant::now(),
|
||||
time_cost: Duration::from_secs_f64(spr),
|
||||
bucket_width: Duration::from_secs_f64(config.max * spr),
|
||||
}
|
||||
}
|
||||
|
||||
/// updates the timer and returns true if the bucket is empty
|
||||
fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool {
|
||||
let drain = now.duration_since(self.time);
|
||||
let drain = drain.as_secs_f64() * info.rps;
|
||||
|
||||
self.filled = (self.filled - drain).clamp(0.0, info.max);
|
||||
self.time = now;
|
||||
|
||||
self.filled == 0.0
|
||||
}
|
||||
|
||||
pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
|
||||
self.update(info, now);
|
||||
|
||||
if self.filled + n > info.max {
|
||||
return false;
|
||||
}
|
||||
self.filled += n;
|
||||
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LeakyBucketState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
struct LeakyBucketState {
|
||||
/// Bucket is represented by `start..end` where `start = end - config.bucket_width`.
|
||||
///
|
||||
/// At any given time, `end - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
|
||||
/// Adding `n` tokens to the bucket is done by moving `end` forward by `n * config.time_cost`.
|
||||
/// If `now < start`, the bucket is considered filled and cannot accept any more tokens.
|
||||
/// Draining the bucket will happen naturally as `now` moves forward.
|
||||
///
|
||||
/// Let `n` be some "time cost" for the request,
|
||||
/// If now is after end, the bucket is empty and the end is reset to now,
|
||||
/// If now is within the `bucket window + n`, we are within time budget.
|
||||
/// If now is before the `bucket window + n`, we have run out of budget.
|
||||
///
|
||||
/// This is inspired by the generic cell rate algorithm (GCRA) and works
|
||||
/// exactly the same as a leaky-bucket.
|
||||
end: Instant,
|
||||
}
|
||||
|
||||
impl LeakyBucketState {
|
||||
fn new(now: Instant) -> Self {
|
||||
Self { end: now }
|
||||
}
|
||||
|
||||
fn should_retain(&self, now: Instant) -> bool {
|
||||
// if self.end is after now, the bucket is not empty
|
||||
now < self.end
|
||||
}
|
||||
|
||||
fn check(&mut self, config: &LeakyBucketConfigInner, now: Instant, n: f64) -> bool {
|
||||
let start = self.end - config.bucket_width;
|
||||
|
||||
let n = config.time_cost.mul_f64(n);
|
||||
|
||||
// start end
|
||||
// | start+n | end+n
|
||||
// | / | /
|
||||
// ------{o-[---------o-}--]----o----
|
||||
// now1 ^ now2 ^ ^ now3
|
||||
//
|
||||
// at now1, the bucket would be completely filled if we add n tokens.
|
||||
// at now2, the bucket would be partially filled if we add n tokens.
|
||||
// at now3, the bucket would start completely empty before we add n tokens.
|
||||
|
||||
if self.end + n <= now {
|
||||
self.end = now + n;
|
||||
true
|
||||
} else if start + n <= now {
|
||||
self.end += n;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,47 +156,50 @@ mod tests {
|
||||
|
||||
use tokio::time::Instant;
|
||||
|
||||
use super::{LeakyBucketConfig, LeakyBucketState};
|
||||
use super::{LeakyBucketConfig, LeakyBucketConfigInner, LeakyBucketState};
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn check() {
|
||||
let info = LeakyBucketConfig::new(500.0, 2000.0);
|
||||
let mut bucket = LeakyBucketState::new();
|
||||
let config: LeakyBucketConfigInner = LeakyBucketConfig::new(500.0, 2000.0).into();
|
||||
assert_eq!(config.time_cost, Duration::from_millis(2));
|
||||
assert_eq!(config.bucket_width, Duration::from_secs(4));
|
||||
|
||||
let mut bucket = LeakyBucketState::new(Instant::now());
|
||||
|
||||
// should work for 2000 requests this second
|
||||
for _ in 0..2000 {
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
assert_eq!(bucket.filled, 2000.0);
|
||||
assert!(!bucket.check(&config, Instant::now(), 1.0));
|
||||
assert_eq!(bucket.end - Instant::now(), config.bucket_width);
|
||||
|
||||
// in 1ms we should drain 0.5 tokens.
|
||||
// make sure we don't lose any tokens
|
||||
tokio::time::advance(Duration::from_millis(1)).await;
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
assert!(!bucket.check(&config, Instant::now(), 1.0));
|
||||
tokio::time::advance(Duration::from_millis(1)).await;
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
|
||||
// in 10ms we should drain 5 tokens
|
||||
tokio::time::advance(Duration::from_millis(10)).await;
|
||||
for _ in 0..5 {
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
assert!(!bucket.check(&config, Instant::now(), 1.0));
|
||||
|
||||
// in 10s we should drain 5000 tokens
|
||||
// but cap is only 2000
|
||||
tokio::time::advance(Duration::from_secs(10)).await;
|
||||
for _ in 0..2000 {
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
}
|
||||
assert!(!bucket.check(&info, Instant::now(), 1.0));
|
||||
assert!(!bucket.check(&config, Instant::now(), 1.0));
|
||||
|
||||
// should sustain 500rps
|
||||
for _ in 0..2000 {
|
||||
tokio::time::advance(Duration::from_millis(10)).await;
|
||||
for _ in 0..5 {
|
||||
assert!(bucket.check(&info, Instant::now(), 1.0));
|
||||
assert!(bucket.check(&config, Instant::now(), 1.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,7 +143,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
self.tenant_id.unwrap_or(TenantId::from([0u8; 16])),
|
||||
self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])),
|
||||
);
|
||||
tracing::Span::current().record("ttid", tracing::field::display(ttid));
|
||||
tracing::Span::current()
|
||||
.record("ttid", tracing::field::display(ttid))
|
||||
.record(
|
||||
"application_name",
|
||||
tracing::field::debug(self.appname.clone()),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
} else {
|
||||
|
||||
@@ -43,7 +43,7 @@ pub async fn task_main(
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("", cid = %conn_id, ttid = field::Empty)),
|
||||
.instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use crate::metrics::{
|
||||
METRICS_REGISTRY,
|
||||
};
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use anyhow::Context;
|
||||
use futures::Future;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
@@ -607,6 +607,13 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.step_down().await)
|
||||
}
|
||||
|
||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
@@ -734,6 +741,47 @@ struct RequestMeta {
|
||||
at: Instant,
|
||||
}
|
||||
|
||||
pub fn prologue_leadership_status_check_middleware<
|
||||
B: hyper::body::HttpBody + Send + Sync + 'static,
|
||||
>() -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
let state = get_state(&req);
|
||||
let leadership_status = state.service.get_leadership_status();
|
||||
|
||||
enum AllowedRoutes<'a> {
|
||||
All,
|
||||
Some(Vec<&'a str>),
|
||||
}
|
||||
|
||||
let allowed_routes = match leadership_status {
|
||||
LeadershipStatus::Leader => AllowedRoutes::All,
|
||||
LeadershipStatus::SteppedDown => {
|
||||
// TODO: does it make sense to allow /status here?
|
||||
AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec())
|
||||
}
|
||||
LeadershipStatus::Candidate => {
|
||||
AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec())
|
||||
}
|
||||
};
|
||||
|
||||
let uri = req.uri().to_string();
|
||||
match allowed_routes {
|
||||
AllowedRoutes::All => Ok(req),
|
||||
AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req),
|
||||
_ => {
|
||||
tracing::info!(
|
||||
"Request {} not allowed due to current leadership state",
|
||||
req.uri()
|
||||
);
|
||||
|
||||
Err(ApiError::ResourceUnavailable(
|
||||
format!("Current leadership status is {leadership_status}").into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
@@ -820,6 +868,7 @@ pub fn make_router(
|
||||
build_info: BuildInfo,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router()
|
||||
.middleware(prologue_leadership_status_check_middleware())
|
||||
.middleware(prologue_metrics_middleware())
|
||||
.middleware(epilogue_metrics_middleware());
|
||||
if auth.is_some() {
|
||||
@@ -971,6 +1020,9 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_policy"),
|
||||
)
|
||||
})
|
||||
.put("/control/v1/step_down", |r| {
|
||||
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
|
||||
@@ -13,7 +13,10 @@ use metrics::NeonMetrics;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::persistence::{DatabaseError, DatabaseOperation};
|
||||
use crate::{
|
||||
persistence::{DatabaseError, DatabaseOperation},
|
||||
service::LeadershipStatus,
|
||||
};
|
||||
|
||||
pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
|
||||
Lazy::new(StorageControllerMetrics::default);
|
||||
@@ -81,6 +84,8 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
|
||||
pub(crate) storage_controller_database_query_latency:
|
||||
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
|
||||
|
||||
pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
|
||||
}
|
||||
|
||||
impl StorageControllerMetrics {
|
||||
@@ -156,6 +161,12 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
|
||||
pub(crate) operation: DatabaseOperation,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = LeadershipStatusGroupSet)]
|
||||
pub(crate) struct LeadershipStatusGroup {
|
||||
pub(crate) status: LeadershipStatus,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy)]
|
||||
pub(crate) enum ReconcileOutcome {
|
||||
#[label(rename = "ok")]
|
||||
|
||||
@@ -12,6 +12,7 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::failpoint_support;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -749,6 +750,8 @@ impl Reconciler {
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
}
|
||||
|
||||
failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ use crate::{
|
||||
},
|
||||
compute_hook::NotifyError,
|
||||
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
|
||||
metrics::LeadershipStatusGroup,
|
||||
persistence::{AbortShardSplitStatus, TenantFilter},
|
||||
reconciler::{ReconcileError, ReconcileUnits},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
|
||||
@@ -81,6 +82,7 @@ use crate::{
|
||||
ReconcilerWaiter, TenantShard,
|
||||
},
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// For operations that should be quick, like attaching a new tenant
|
||||
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
@@ -131,6 +133,24 @@ enum NodeOperations {
|
||||
Delete,
|
||||
}
|
||||
|
||||
/// The leadership status for the storage controller process.
|
||||
/// Allowed transitions are:
|
||||
/// 1. Leader -> SteppedDown
|
||||
/// 2. Candidate -> Leader
|
||||
#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub(crate) enum LeadershipStatus {
|
||||
/// This is the steady state where the storage controller can produce
|
||||
/// side effects in the cluster.
|
||||
Leader,
|
||||
/// We've been notified to step down by another candidate. No reconciliations
|
||||
/// take place in this state.
|
||||
SteppedDown,
|
||||
/// Initial state for a new storage controller instance. Will attempt to assume leadership.
|
||||
#[allow(unused)]
|
||||
Candidate,
|
||||
}
|
||||
|
||||
pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
|
||||
|
||||
// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
|
||||
@@ -140,6 +160,8 @@ const MAX_DELAYED_RECONCILES: usize = 10000;
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
struct ServiceState {
|
||||
leadership_status: LeadershipStatus,
|
||||
|
||||
tenants: BTreeMap<TenantShardId, TenantShard>,
|
||||
|
||||
nodes: Arc<HashMap<NodeId, Node>>,
|
||||
@@ -202,7 +224,21 @@ impl ServiceState {
|
||||
scheduler: Scheduler,
|
||||
delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
|
||||
) -> Self {
|
||||
let status = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_leadership_status;
|
||||
|
||||
status.set(
|
||||
LeadershipStatusGroup {
|
||||
status: LeadershipStatus::Leader,
|
||||
},
|
||||
1,
|
||||
);
|
||||
|
||||
Self {
|
||||
// TODO: Starting up as Leader is a transient state. Once we enable rolling
|
||||
// upgrades on the k8s side, we should start up as Candidate.
|
||||
leadership_status: LeadershipStatus::Leader,
|
||||
tenants,
|
||||
nodes: Arc::new(nodes),
|
||||
scheduler,
|
||||
@@ -220,6 +256,37 @@ impl ServiceState {
|
||||
) {
|
||||
(&mut self.nodes, &mut self.tenants, &mut self.scheduler)
|
||||
}
|
||||
|
||||
fn get_leadership_status(&self) -> LeadershipStatus {
|
||||
self.leadership_status
|
||||
}
|
||||
|
||||
fn step_down(&mut self) {
|
||||
self.leadership_status = LeadershipStatus::SteppedDown;
|
||||
|
||||
let status = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_leadership_status;
|
||||
|
||||
status.set(
|
||||
LeadershipStatusGroup {
|
||||
status: LeadershipStatus::SteppedDown,
|
||||
},
|
||||
1,
|
||||
);
|
||||
status.set(
|
||||
LeadershipStatusGroup {
|
||||
status: LeadershipStatus::Leader,
|
||||
},
|
||||
0,
|
||||
);
|
||||
status.set(
|
||||
LeadershipStatusGroup {
|
||||
status: LeadershipStatus::Candidate,
|
||||
},
|
||||
0,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -403,11 +470,30 @@ struct ShardUpdate {
|
||||
generation: Option<Generation>,
|
||||
}
|
||||
|
||||
enum StopReconciliationsReason {
|
||||
ShuttingDown,
|
||||
SteppingDown,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for StopReconciliationsReason {
|
||||
fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
Self::ShuttingDown => "Shutting down",
|
||||
Self::SteppingDown => "Stepping down",
|
||||
};
|
||||
write!(writer, "{}", s)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum ReconcileResultRequest {
|
||||
ReconcileResult(ReconcileResult),
|
||||
Stop,
|
||||
}
|
||||
|
||||
// TODO: move this into the storcon peer client when that gets added
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);
|
||||
|
||||
impl Service {
|
||||
pub fn get_config(&self) -> &Config {
|
||||
&self.config
|
||||
@@ -5603,17 +5689,22 @@ impl Service {
|
||||
Ok(std::cmp::max(waiter_count, reconciles_spawned))
|
||||
}
|
||||
|
||||
pub async fn shutdown(&self) {
|
||||
async fn stop_reconciliations(&self, reason: StopReconciliationsReason) {
|
||||
// Cancel all on-going reconciles and wait for them to exit the gate.
|
||||
tracing::info!("Shutting down: cancelling and waiting for in-flight reconciles");
|
||||
tracing::info!("{reason}: cancelling and waiting for in-flight reconciles");
|
||||
self.reconcilers_cancel.cancel();
|
||||
self.reconcilers_gate.close().await;
|
||||
|
||||
// Signal the background loop in [`Service::process_results`] to exit once
|
||||
// it has proccessed the results from all the reconciles we cancelled earlier.
|
||||
tracing::info!("Shutting down: processing results from previously in-flight reconciles");
|
||||
tracing::info!("{reason}: processing results from previously in-flight reconciles");
|
||||
self.result_tx.send(ReconcileResultRequest::Stop).ok();
|
||||
self.result_tx.closed().await;
|
||||
}
|
||||
|
||||
pub async fn shutdown(&self) {
|
||||
self.stop_reconciliations(StopReconciliationsReason::ShuttingDown)
|
||||
.await;
|
||||
|
||||
// Background tasks hold gate guards: this notifies them of the cancellation and
|
||||
// waits for them all to complete.
|
||||
@@ -6003,4 +6094,27 @@ impl Service {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
|
||||
self.inner.read().unwrap().get_leadership_status()
|
||||
}
|
||||
|
||||
pub(crate) async fn step_down(&self) -> GlobalObservedState {
|
||||
tracing::info!("Received step down request from peer");
|
||||
|
||||
self.inner.write().unwrap().step_down();
|
||||
// TODO: would it make sense to have a time-out for this?
|
||||
self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
|
||||
.await;
|
||||
|
||||
let mut global_observed = GlobalObservedState::default();
|
||||
let locked = self.inner.read().unwrap();
|
||||
for (tid, tenant_shard) in locked.tenants.iter() {
|
||||
global_observed
|
||||
.0
|
||||
.insert(*tid, tenant_shard.observed.clone());
|
||||
}
|
||||
|
||||
global_observed
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
};
|
||||
use serde::Serialize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{instrument, Instrument};
|
||||
@@ -284,7 +284,7 @@ impl Drop for IntentState {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Serialize)]
|
||||
#[derive(Default, Clone, Serialize, Deserialize, Debug)]
|
||||
pub(crate) struct ObservedState {
|
||||
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
|
||||
}
|
||||
@@ -298,7 +298,7 @@ pub(crate) struct ObservedState {
|
||||
/// what it is (e.g. we failed partway through configuring it)
|
||||
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
|
||||
/// and that configuration will still be present unless something external interfered.
|
||||
#[derive(Clone, Serialize)]
|
||||
#[derive(Clone, Serialize, Deserialize, Debug)]
|
||||
pub(crate) struct ObservedStateLocation {
|
||||
/// If None, it means we do not know the status of this shard's location on this node, but
|
||||
/// we know that we might have some state on this node.
|
||||
|
||||
@@ -87,7 +87,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
.push(format!("index_part.json version: {}", index_part.version()))
|
||||
}
|
||||
|
||||
if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
|
||||
let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2);
|
||||
if !newest_versions.any(|ip| ip == &index_part.version()) {
|
||||
info!(
|
||||
"index_part.json version is not latest: {}",
|
||||
index_part.version()
|
||||
|
||||
@@ -81,7 +81,7 @@ should go.
|
||||
Useful parameters and commands:
|
||||
|
||||
`--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk
|
||||
after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents.
|
||||
after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. If `NeonEnvBuilder#preserve_database_files` set to `True` for a particular test, the whole `repo` directory will be attached to Allure report (thus uploaded to S3) as `everything.tar.zst` for this test.
|
||||
|
||||
Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
|
||||
`./scripts/pytest -s --log-cli-level=INFO ...`
|
||||
|
||||
@@ -542,21 +542,6 @@ class NeonEnvBuilder:
|
||||
f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
|
||||
)
|
||||
|
||||
self.pageserver_get_vectored_impl: Optional[str] = None
|
||||
if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored":
|
||||
self.pageserver_get_vectored_impl = "vectored"
|
||||
log.debug('Overriding pageserver get_vectored_impl config to "vectored"')
|
||||
|
||||
self.pageserver_get_impl: Optional[str] = None
|
||||
if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored":
|
||||
self.pageserver_get_impl = "vectored"
|
||||
log.debug('Overriding pageserver get_impl config to "vectored"')
|
||||
|
||||
self.pageserver_validate_vectored_get: Optional[bool] = None
|
||||
if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None:
|
||||
self.pageserver_validate_vectored_get = bool(validate)
|
||||
log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"')
|
||||
|
||||
self.pageserver_aux_file_policy = pageserver_aux_file_policy
|
||||
|
||||
self.safekeeper_extra_opts = safekeeper_extra_opts
|
||||
@@ -1157,12 +1142,6 @@ class NeonEnv:
|
||||
}
|
||||
if self.pageserver_virtual_file_io_engine is not None:
|
||||
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
|
||||
if config.pageserver_get_vectored_impl is not None:
|
||||
ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl
|
||||
if config.pageserver_get_impl is not None:
|
||||
ps_cfg["get_impl"] = config.pageserver_get_impl
|
||||
if config.pageserver_validate_vectored_get is not None:
|
||||
ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get
|
||||
if config.pageserver_default_tenant_config_compaction_algorithm is not None:
|
||||
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
||||
tenant_config[
|
||||
@@ -1415,7 +1394,7 @@ def _shared_simple_env(
|
||||
pg_distrib_dir=pg_distrib_dir,
|
||||
pg_version=pg_version,
|
||||
run_id=run_id,
|
||||
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
|
||||
preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
|
||||
@@ -1490,7 +1469,7 @@ def neon_env_builder(
|
||||
pg_version=pg_version,
|
||||
broker=default_broker,
|
||||
run_id=run_id,
|
||||
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
|
||||
preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
|
||||
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
@@ -1499,6 +1478,11 @@ def neon_env_builder(
|
||||
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
|
||||
) as builder:
|
||||
yield builder
|
||||
# Propogate `preserve_database_files` to make it possible to use in other fixtures,
|
||||
# like `test_output_dir` fixture for attaching all database files to Allure report.
|
||||
request.node.user_properties.append(
|
||||
("preserve_database_files", builder.preserve_database_files)
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -2603,6 +2587,17 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
|
||||
time.sleep(backoff)
|
||||
|
||||
def step_down(self):
|
||||
log.info("Asking storage controller to step down")
|
||||
response = self.request(
|
||||
"PUT",
|
||||
f"{self.env.storage_controller_api}/control/v1/step_down",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
|
||||
if isinstance(config_strings, tuple):
|
||||
pairs = [config_strings]
|
||||
@@ -4488,7 +4483,16 @@ def test_output_dir(
|
||||
|
||||
yield test_dir
|
||||
|
||||
allure_attach_from_dir(test_dir)
|
||||
preserve_database_files = False
|
||||
for k, v in request.node.user_properties:
|
||||
# NB: the neon_env_builder fixture uses this fixture (test_output_dir).
|
||||
# So, neon_env_builder's cleanup runs before here.
|
||||
# The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property.
|
||||
if k == "preserve_database_files":
|
||||
assert isinstance(v, bool)
|
||||
preserve_database_files = v
|
||||
|
||||
allure_attach_from_dir(test_dir, preserve_database_files)
|
||||
|
||||
|
||||
class FileAndThreadLock:
|
||||
|
||||
@@ -240,9 +240,18 @@ ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
|
||||
)
|
||||
|
||||
|
||||
def allure_attach_from_dir(dir: Path):
|
||||
def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False):
|
||||
"""Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report"""
|
||||
|
||||
if preserve_database_files:
|
||||
zst_file = dir.with_suffix(".tar.zst")
|
||||
with zst_file.open("wb") as zst:
|
||||
cctx = zstandard.ZstdCompressor()
|
||||
with cctx.stream_writer(zst) as compressor:
|
||||
with tarfile.open(fileobj=compressor, mode="w") as tar:
|
||||
tar.add(dir, arcname="")
|
||||
allure.attach.file(zst_file, "everything.tar.zst", "application/zstd", "tar.zst")
|
||||
|
||||
for attachment in Path(dir).glob("**/*"):
|
||||
if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0:
|
||||
name = str(attachment.relative_to(dir))
|
||||
|
||||
@@ -17,6 +17,7 @@ from fixtures.pg_version import PgVersion
|
||||
# 3. Disk space used
|
||||
# 4. Peak memory usage
|
||||
#
|
||||
@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124")
|
||||
def test_bulk_insert(neon_with_baseline: PgCompare):
|
||||
env = neon_with_baseline
|
||||
|
||||
|
||||
@@ -17,22 +17,17 @@ from fixtures.pg_version import PgVersion
|
||||
# Test restarting page server, while safekeeper and compute node keep
|
||||
# running.
|
||||
def test_local_corruption(neon_env_builder: NeonEnvBuilder):
|
||||
if neon_env_builder.pageserver_get_impl == "vectored":
|
||||
reconstruct_function_name = "get_values_reconstruct_data"
|
||||
else:
|
||||
reconstruct_function_name = "get_value_reconstruct_data"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
f".*{reconstruct_function_name} for layer .*",
|
||||
".*get_values_reconstruct_data for layer .*",
|
||||
".*could not find data for key.*",
|
||||
".*is not active. Current state: Broken.*",
|
||||
".*will not become active. Current state: Broken.*",
|
||||
".*failed to load metadata.*",
|
||||
".*load failed.*load local timeline.*",
|
||||
".*layer loading failed permanently: load layer: .*",
|
||||
".*: layer load failed, assuming permanent failure:.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -79,7 +74,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
|
||||
# (We don't check layer file contents on startup, when loading the timeline)
|
||||
#
|
||||
# This will change when we implement checksums for layers
|
||||
with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err:
|
||||
with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err:
|
||||
pg1.start()
|
||||
log.info(
|
||||
f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}"
|
||||
|
||||
@@ -227,12 +227,6 @@ def test_forward_compatibility(
|
||||
)
|
||||
|
||||
try:
|
||||
# Previous version neon_local and pageserver are not aware
|
||||
# of the new config.
|
||||
# TODO: remove these once the previous version of neon local supports them
|
||||
neon_env_builder.pageserver_get_impl = None
|
||||
neon_env_builder.pageserver_validate_vectored_get = None
|
||||
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
|
||||
# Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.).
|
||||
|
||||
@@ -48,8 +48,6 @@ def test_sharding_smoke(
|
||||
# that the scrubber doesn't barf when it sees a sharded tenant.
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
|
||||
neon_env_builder.preserve_database_files = True
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
|
||||
)
|
||||
@@ -372,8 +370,6 @@ def test_sharding_split_smoke(
|
||||
# that the scrubber doesn't barf when it sees a sharded tenant.
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
|
||||
neon_env_builder.preserve_database_files = True
|
||||
|
||||
non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
|
||||
|
||||
env = neon_env_builder.init_configs(True)
|
||||
|
||||
@@ -1783,3 +1783,78 @@ def test_storage_controller_node_deletion(
|
||||
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
|
||||
env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above.
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
|
||||
def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test the `/control/v1/step_down` storage controller API. Upon receiving such
|
||||
a request, the storage controller cancels any on-going reconciles and replies
|
||||
with 503 to all requests apart from `/control/v1/step_down`, `/status` and `/metrics`.
|
||||
"""
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
tid = TenantId.generate()
|
||||
tsid = str(TenantShardId(tid, shard_number=0, shard_count=0))
|
||||
env.storage_controller.tenant_create(tid)
|
||||
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)"))
|
||||
|
||||
# Make a change to the tenant config to trigger a slow reconcile
|
||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
||||
virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None)
|
||||
env.storage_controller.allowed_errors.append(
|
||||
".*Accepted configuration update but reconciliation failed.*"
|
||||
)
|
||||
|
||||
observed_state = env.storage_controller.step_down()
|
||||
log.info(f"Storage controller stepped down with {observed_state=}")
|
||||
|
||||
# Validate that we waited for the slow reconcile to complete
|
||||
# and updated the observed state in the storcon before stepping down.
|
||||
node_id = str(env.pageserver.id)
|
||||
assert tsid in observed_state
|
||||
assert node_id in observed_state[tsid]["locations"]
|
||||
assert "conf" in observed_state[tsid]["locations"][node_id]
|
||||
assert "tenant_conf" in observed_state[tsid]["locations"][node_id]["conf"]
|
||||
|
||||
tenant_conf = observed_state[tsid]["locations"][node_id]["conf"]["tenant_conf"]
|
||||
assert "compaction_threshold" in tenant_conf
|
||||
assert tenant_conf["compaction_threshold"] == 5
|
||||
|
||||
# Validate that we propagated the change to the pageserver
|
||||
ps_tenant_conf = env.pageserver.http_client().tenant_config(tid)
|
||||
assert "compaction_threshold" in ps_tenant_conf.effective_config
|
||||
assert ps_tenant_conf.effective_config["compaction_threshold"] == 5
|
||||
|
||||
# Validate that the storcon is not replying to the usual requests
|
||||
# once it has stepped down.
|
||||
with pytest.raises(StorageControllerApiException, match="stepped_down"):
|
||||
env.storage_controller.tenant_list()
|
||||
|
||||
# Validate that we can step down multiple times and the observed state
|
||||
# doesn't change.
|
||||
observed_state_again = env.storage_controller.step_down()
|
||||
assert observed_state == observed_state_again
|
||||
|
||||
assert (
|
||||
env.storage_controller.get_metric_value(
|
||||
"storage_controller_leadership_status", filter={"status": "leader"}
|
||||
)
|
||||
== 0
|
||||
)
|
||||
|
||||
assert (
|
||||
env.storage_controller.get_metric_value(
|
||||
"storage_controller_leadership_status", filter={"status": "stepped_down"}
|
||||
)
|
||||
== 1
|
||||
)
|
||||
|
||||
assert (
|
||||
env.storage_controller.get_metric_value(
|
||||
"storage_controller_leadership_status", filter={"status": "candidate"}
|
||||
)
|
||||
== 0
|
||||
)
|
||||
|
||||
@@ -404,7 +404,7 @@ files:
|
||||
x::text as duration_seconds,
|
||||
neon.approximate_working_set_size_seconds(x) as size
|
||||
from
|
||||
(select generate_series * 60 as x from generate_series(1, 60));
|
||||
(select generate_series * 60 as x from generate_series(1, 60)) as t (x);
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user