mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 09:00:37 +00:00
Compare commits
8 Commits
rc/proxy/2
...
vlad/storc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
71ff8f2433 | ||
|
|
56c43c4fae | ||
|
|
4187657082 | ||
|
|
b690ba5838 | ||
|
|
dd7cafdd97 | ||
|
|
c501a10612 | ||
|
|
1fdbef9a44 | ||
|
|
3ad1221e55 |
@@ -19,10 +19,6 @@ on:
|
||||
description: 'debug or release'
|
||||
required: true
|
||||
type: string
|
||||
pg-versions:
|
||||
description: 'a json array of postgres versions to run regression tests on'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -258,7 +254,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pg_version: ${{ fromJson(inputs.pg-versions) }}
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -288,5 +284,5 @@ jobs:
|
||||
- name: Merge and upload coverage data
|
||||
if: |
|
||||
false &&
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v16'
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v14'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
9
.github/workflows/build_and_test.yml
vendored
9
.github/workflows/build_and_test.yml
vendored
@@ -203,8 +203,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64 ]
|
||||
# Do not build or run tests in debug for release branches
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
|
||||
build-type: [ debug, release ]
|
||||
include:
|
||||
- build-type: release
|
||||
arch: arm64
|
||||
@@ -214,8 +213,6 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
|
||||
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
|
||||
secrets: inherit
|
||||
|
||||
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
|
||||
@@ -309,7 +306,7 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
outputs:
|
||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
@@ -871,7 +868,7 @@ jobs:
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
|
||||
|
||||
- name: Login to ACR
|
||||
if: github.ref_name == 'main'
|
||||
|
||||
13
Cargo.lock
generated
13
Cargo.lock
generated
@@ -1744,6 +1744,18 @@ dependencies = [
|
||||
"const-random",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dns-lookup"
|
||||
version = "2.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5766087c2235fec47fafa4cfecc81e494ee679d0fd4a59887ea0919bfb0e4fc"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"socket2 0.5.5",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dsl_auto_type"
|
||||
version = "0.1.1"
|
||||
@@ -5724,6 +5736,7 @@ dependencies = [
|
||||
"control_plane",
|
||||
"diesel",
|
||||
"diesel_migrations",
|
||||
"dns-lookup",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
|
||||
@@ -144,7 +144,6 @@ impl RemotePath {
|
||||
///
|
||||
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
|
||||
/// NoDelimiter mode will only populate `keys`.
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum ListingMode {
|
||||
WithDelimiter,
|
||||
NoDelimiter,
|
||||
|
||||
@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
|
||||
use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||
use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
@@ -29,9 +31,11 @@ use tracing::*;
|
||||
use metrics::set_build_info_metric;
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
deletion_queue::DeletionQueue,
|
||||
http, page_cache, page_service, task_mgr,
|
||||
task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
|
||||
tenant::mgr,
|
||||
virtual_file,
|
||||
};
|
||||
@@ -125,7 +129,6 @@ fn main() -> anyhow::Result<()> {
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.get_impl, "starting with get page implementation");
|
||||
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
|
||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
@@ -590,13 +593,30 @@ fn start_pageserver(
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection. We created the listener earlier already.
|
||||
let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
|
||||
let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
|
||||
pageserver_listener
|
||||
.set_nonblocking(true)
|
||||
.context("set listener to nonblocking")?;
|
||||
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
|
||||
});
|
||||
let libpq_listener = {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
TaskKind::LibpqEndpointListener,
|
||||
// listener task shouldn't need to download anything. (We will
|
||||
// create separate sub-contexts for each connection, with their
|
||||
// own download behavior. This context is used only to listen and
|
||||
// accept connections.)
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"libpq listener",
|
||||
page_service::libpq_listener_main(
|
||||
tenant_manager.clone(),
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
),
|
||||
));
|
||||
LibpqEndpointListener(CancellableTask { task, cancel })
|
||||
};
|
||||
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
@@ -624,7 +644,7 @@ fn start_pageserver(
|
||||
shutdown_pageserver.take();
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
page_service,
|
||||
libpq_listener,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
|
||||
@@ -29,7 +29,6 @@ use utils::{
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -296,10 +295,6 @@ pub struct PageServerConf {
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub l0_flush: L0FlushConfig,
|
||||
|
||||
/// This flag is temporary and will be removed after gradual rollout.
|
||||
/// See <https://github.com/neondatabase/neon/issues/8184>.
|
||||
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -406,8 +401,6 @@ struct PageServerConfigBuilder {
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
l0_flush: BuilderValue<L0FlushConfig>,
|
||||
|
||||
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -497,7 +490,6 @@ impl PageServerConfigBuilder {
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -681,10 +673,6 @@ impl PageServerConfigBuilder {
|
||||
self.l0_flush = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
|
||||
self.compact_level0_phase1_value_access = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -742,7 +730,6 @@ impl PageServerConfigBuilder {
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
l0_flush,
|
||||
compact_level0_phase1_value_access,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -1015,9 +1002,6 @@ impl PageServerConf {
|
||||
"l0_flush" => {
|
||||
builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
|
||||
}
|
||||
"compact_level0_phase1_value_access" => {
|
||||
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1102,7 +1086,6 @@ impl PageServerConf {
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1344,7 +1327,6 @@ background_task_maximum_delay = '334 s'
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1419,7 +1401,6 @@ background_task_maximum_delay = '334 s'
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => {
|
||||
ApiError::ResourceUnavailable(format!("{}", e).into())
|
||||
}
|
||||
GetActiveTenantError::SwitchedTenant => {
|
||||
// in our HTTP handlers, this error doesn't happen
|
||||
// TODO: separate error types
|
||||
ApiError::ResourceUnavailable("switched tenant".into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,8 +12,6 @@ pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod l0_flush;
|
||||
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
pub use pageserver_api::keyspace;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
pub mod aux_file;
|
||||
@@ -32,13 +30,14 @@ pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use tenant::{
|
||||
mgr::{BackgroundPurges, TenantManager},
|
||||
secondary,
|
||||
};
|
||||
use tracing::{info, info_span};
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
@@ -64,6 +63,7 @@ pub struct CancellableTask {
|
||||
pub cancel: CancellationToken,
|
||||
}
|
||||
pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct LibpqEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
impl CancellableTask {
|
||||
@@ -77,7 +77,7 @@ impl CancellableTask {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn shutdown_pageserver(
|
||||
http_listener: HttpEndpointListener,
|
||||
page_service: page_service::Listener,
|
||||
libpq_listener: LibpqEndpointListener,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
tenant_manager: &TenantManager,
|
||||
@@ -87,83 +87,10 @@ pub async fn shutdown_pageserver(
|
||||
exit_code: i32,
|
||||
) {
|
||||
use std::time::Duration;
|
||||
|
||||
// If the orderly shutdown below takes too long, we still want to make
|
||||
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
||||
//
|
||||
// (Leftover walredo processes are the hypothesized trigger for the systemd freezes
|
||||
// that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.)
|
||||
//
|
||||
// We use a thread instead of a tokio task because the background runtime is likely busy
|
||||
// with the final flushing / uploads. This activity here has priority, and due to lack
|
||||
// of scheduling priority features in the tokio scheduler, using a separate thread is
|
||||
// an effective priority booster.
|
||||
let walredo_extraordinary_shutdown_thread_span = {
|
||||
let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
|
||||
span.follows_from(tracing::Span::current());
|
||||
span
|
||||
};
|
||||
let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
|
||||
let walredo_extraordinary_shutdown_thread = std::thread::spawn({
|
||||
let walredo_extraordinary_shutdown_thread_cancel =
|
||||
walredo_extraordinary_shutdown_thread_cancel.clone();
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let _entered = rt.enter();
|
||||
let _entered = walredo_extraordinary_shutdown_thread_span.enter();
|
||||
if let Ok(()) = rt.block_on(tokio::time::timeout(
|
||||
Duration::from_secs(8),
|
||||
walredo_extraordinary_shutdown_thread_cancel.cancelled(),
|
||||
)) {
|
||||
info!("cancellation requested");
|
||||
return;
|
||||
}
|
||||
let managers = tenant::WALREDO_MANAGERS
|
||||
.lock()
|
||||
.unwrap()
|
||||
// prevents new walredo managers from being inserted
|
||||
.take()
|
||||
.expect("only we take()");
|
||||
// Use FuturesUnordered to get in queue early for each manager's
|
||||
// heavier_once_cell semaphore wait list.
|
||||
// Also, for idle tenants that for some reason haven't
|
||||
// shut down yet, it's quite likely that we're not going
|
||||
// to get Poll::Pending once.
|
||||
let mut futs: FuturesUnordered<_> = managers
|
||||
.into_iter()
|
||||
.filter_map(|(_, mgr)| mgr.upgrade())
|
||||
.map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
|
||||
.collect();
|
||||
info!(count=%futs.len(), "built FuturesUnordered");
|
||||
let mut last_log_at = std::time::Instant::now();
|
||||
#[derive(Debug, Default)]
|
||||
struct Results {
|
||||
initiated: u64,
|
||||
already: u64,
|
||||
}
|
||||
let mut results = Results::default();
|
||||
while let Some(we_initiated) = rt.block_on(futs.next()) {
|
||||
if we_initiated {
|
||||
results.initiated += 1;
|
||||
} else {
|
||||
results.already += 1;
|
||||
}
|
||||
if last_log_at.elapsed() > Duration::from_millis(100) {
|
||||
info!(remaining=%futs.len(), ?results, "progress");
|
||||
last_log_at = std::time::Instant::now();
|
||||
}
|
||||
}
|
||||
info!(?results, "done");
|
||||
}
|
||||
});
|
||||
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
let remaining_connections = timed(
|
||||
page_service.stop_accepting(),
|
||||
timed(
|
||||
libpq_listener.0.shutdown(),
|
||||
"shutdown LibpqEndpointListener",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
@@ -181,7 +108,7 @@ pub async fn shutdown_pageserver(
|
||||
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
||||
// should already have been cancelled via mgr::shutdown_all_tenants
|
||||
timed(
|
||||
remaining_connections.shutdown(),
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
@@ -235,12 +162,6 @@ pub async fn shutdown_pageserver(
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
info!("cancel & join walredo_extraordinary_shutdown_thread");
|
||||
walredo_extraordinary_shutdown_thread_cancel.cancel();
|
||||
walredo_extraordinary_shutdown_thread.join().unwrap();
|
||||
info!("walredo_extraordinary_shutdown_thread done");
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,8 @@ use std::time::Duration;
|
||||
pub use pageserver_api::key::{Key, KEY_SIZE};
|
||||
|
||||
/// A 'value' stored for one Key.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(test, derive(PartialEq))]
|
||||
pub enum Value {
|
||||
/// An Image value contains a full copy of the value
|
||||
Image(Bytes),
|
||||
|
||||
@@ -33,7 +33,6 @@ use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::io::BufReader;
|
||||
@@ -313,66 +312,14 @@ impl std::fmt::Debug for Tenant {
|
||||
}
|
||||
|
||||
pub(crate) enum WalRedoManager {
|
||||
Prod(WalredoManagerId, PostgresRedoManager),
|
||||
Prod(PostgresRedoManager),
|
||||
#[cfg(test)]
|
||||
Test(harness::TestRedoManager),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("pageserver is shutting down")]
|
||||
pub(crate) struct GlobalShutDown;
|
||||
|
||||
impl WalRedoManager {
|
||||
pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
|
||||
let id = WalredoManagerId::next();
|
||||
let arc = Arc::new(Self::Prod(id, mgr));
|
||||
let mut guard = WALREDO_MANAGERS.lock().unwrap();
|
||||
match &mut *guard {
|
||||
Some(map) => {
|
||||
map.insert(id, Arc::downgrade(&arc));
|
||||
Ok(arc)
|
||||
}
|
||||
None => Err(GlobalShutDown),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoManager {
|
||||
fn drop(&mut self) {
|
||||
match self {
|
||||
Self::Prod(id, _) => {
|
||||
let mut guard = WALREDO_MANAGERS.lock().unwrap();
|
||||
if let Some(map) = &mut *guard {
|
||||
map.remove(id).expect("new() registers, drop() unregisters");
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
|
||||
/// the walredo processes outside of the regular order.
|
||||
///
|
||||
/// This is necessary to work around a systemd bug where it freezes if there are
|
||||
/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
|
||||
Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
|
||||
> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
|
||||
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
|
||||
pub(crate) struct WalredoManagerId(u64);
|
||||
impl WalredoManagerId {
|
||||
pub fn next() -> Self {
|
||||
static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
|
||||
let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
if id == 0 {
|
||||
panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
|
||||
}
|
||||
Self(id)
|
||||
impl From<PostgresRedoManager> for WalRedoManager {
|
||||
fn from(mgr: PostgresRedoManager) -> Self {
|
||||
Self::Prod(mgr)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -384,20 +331,19 @@ impl From<harness::TestRedoManager> for WalRedoManager {
|
||||
}
|
||||
|
||||
impl WalRedoManager {
|
||||
pub(crate) async fn shutdown(&self) -> bool {
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
match self {
|
||||
Self::Prod(_, mgr) => mgr.shutdown().await,
|
||||
Self::Prod(mgr) => mgr.shutdown().await,
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
|
||||
match self {
|
||||
Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
|
||||
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
@@ -417,7 +363,7 @@ impl WalRedoManager {
|
||||
pg_version: u32,
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
Self::Prod(_, mgr) => {
|
||||
Self::Prod(mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
}
|
||||
@@ -431,7 +377,7 @@ impl WalRedoManager {
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
match self {
|
||||
WalRedoManager::Prod(_, m) => Some(m.status()),
|
||||
WalRedoManager::Prod(m) => Some(m.status()),
|
||||
#[cfg(test)]
|
||||
WalRedoManager::Test(_) => None,
|
||||
}
|
||||
@@ -440,8 +386,6 @@ impl WalRedoManager {
|
||||
|
||||
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
||||
pub enum GetTimelineError {
|
||||
#[error("Timeline is shutting down")]
|
||||
ShuttingDown,
|
||||
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
|
||||
NotActive {
|
||||
tenant_id: TenantShardId,
|
||||
@@ -731,9 +675,11 @@ impl Tenant {
|
||||
init_order: Option<InitializationOrder>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Tenant>, GlobalShutDown> {
|
||||
let wal_redo_manager =
|
||||
WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
|
||||
) -> Arc<Tenant> {
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
)));
|
||||
|
||||
let TenantSharedResources {
|
||||
broker_client,
|
||||
@@ -932,7 +878,7 @@ impl Tenant {
|
||||
}
|
||||
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
|
||||
);
|
||||
Ok(tenant)
|
||||
tenant
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
@@ -7401,7 +7347,6 @@ mod tests {
|
||||
Lsn(0x60),
|
||||
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
|
||||
3,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -7526,7 +7471,7 @@ mod tests {
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
|
||||
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
@@ -7572,114 +7517,6 @@ mod tests {
|
||||
};
|
||||
assert_eq!(res, expected_res);
|
||||
|
||||
// In case of branch compaction, the branch itself does not have the full history, and we need to provide
|
||||
// the ancestor image in the test case.
|
||||
|
||||
let history = vec![
|
||||
(
|
||||
key,
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(
|
||||
key,
|
||||
&history,
|
||||
Lsn(0x60),
|
||||
&[],
|
||||
3,
|
||||
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
below_horizon: vec![(
|
||||
Lsn(0x60),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x60),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
|
||||
)]),
|
||||
)],
|
||||
above_horizon: KeyLogAtLsn(vec![(
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
)]),
|
||||
};
|
||||
assert_eq!(res, expected_res);
|
||||
|
||||
let history = vec![
|
||||
(
|
||||
key,
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x60),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(
|
||||
key,
|
||||
&history,
|
||||
Lsn(0x60),
|
||||
&[Lsn(0x30)],
|
||||
3,
|
||||
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
below_horizon: vec![
|
||||
(
|
||||
Lsn(0x30),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
|
||||
)]),
|
||||
),
|
||||
(
|
||||
Lsn(0x60),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x60),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
|
||||
)]),
|
||||
),
|
||||
],
|
||||
above_horizon: KeyLogAtLsn(vec![(
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
)]),
|
||||
};
|
||||
assert_eq!(res, expected_res);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -7878,186 +7715,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let parent_tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // delta layers
|
||||
vec![(Lsn(0x18), img_layer)], // image layers
|
||||
Lsn(0x18),
|
||||
)
|
||||
.await?;
|
||||
|
||||
parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
|
||||
|
||||
let branch_tline = tenant
|
||||
.branch_timeline_test_with_layers(
|
||||
&parent_tline,
|
||||
NEW_TIMELINE_ID,
|
||||
Some(Lsn(0x18)),
|
||||
&ctx,
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
|
||||
branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
|
||||
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = parent_tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x10),
|
||||
space: Lsn(0x10),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = branch_tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x50),
|
||||
space: Lsn(0x50),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_40 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
branch_tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
branch_tline
|
||||
.get(get_key(idx as u32), Lsn(0x40), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_40[idx]
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
|
||||
verify_result().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -296,19 +296,13 @@ where
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
let mut node_buf = [0_u8; PAGE_SZ];
|
||||
while let Some((node_blknum, opt_iter)) = stack.pop() {
|
||||
// Read the node, through the PS PageCache, into local variable `node_buf`.
|
||||
// We could keep the page cache read guard alive, but, at the time of writing,
|
||||
// we run quite small PS PageCache s => can't risk running out of
|
||||
// PageCache space because this stream isn't consumed fast enough.
|
||||
let page_read_guard = block_cursor
|
||||
// Locate the node.
|
||||
let node_buf = block_cursor
|
||||
.read_blk(self.start_blk + node_blknum, ctx)
|
||||
.await?;
|
||||
node_buf.copy_from_slice(page_read_guard.as_ref());
|
||||
drop(page_read_guard); // drop page cache read guard early
|
||||
|
||||
let node = OnDiskNode::deparse(&node_buf)?;
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
@@ -351,7 +345,6 @@ where
|
||||
Either::Left(idx..node.num_children.into())
|
||||
};
|
||||
|
||||
|
||||
// idx points to the first match now. Keep going from there
|
||||
while let Some(idx) = iter.next() {
|
||||
let key_off = idx * suffix_len;
|
||||
|
||||
@@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
use super::remote_timeline_client::remote_tenant_path;
|
||||
use super::secondary::SecondaryTenant;
|
||||
use super::timeline::detach_ancestor::PreparedTimelineDetach;
|
||||
use super::{GlobalShutDown, TenantSharedResources};
|
||||
use super::TenantSharedResources;
|
||||
|
||||
/// For a tenant that appears in TenantsMap, it may either be
|
||||
/// - `Attached`: has a full Tenant object, is eligible to service
|
||||
@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
|
||||
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
|
||||
/// ignore it.
|
||||
Zero,
|
||||
/// Pick the first shard we find for the TenantId
|
||||
First,
|
||||
/// Pick the shard that holds this key
|
||||
Page(Key),
|
||||
/// The shard ID is known: pick the given shard
|
||||
@@ -665,20 +667,17 @@ pub async fn init_tenant_mgr(
|
||||
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
|
||||
let shard_identity = location_conf.shard;
|
||||
let slot = match location_conf.mode {
|
||||
LocationMode::Attached(attached_conf) => TenantSlot::Attached(
|
||||
tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
)
|
||||
.expect("global shutdown during init_tenant_mgr cannot happen"),
|
||||
),
|
||||
LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
)),
|
||||
LocationMode::Secondary(secondary_conf) => {
|
||||
info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
@@ -726,7 +725,7 @@ fn tenant_spawn(
|
||||
init_order: Option<InitializationOrder>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Tenant>, GlobalShutDown> {
|
||||
) -> Arc<Tenant> {
|
||||
// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
|
||||
// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
|
||||
// to avoid impacting prod runtime performance.
|
||||
@@ -1193,10 +1192,7 @@ impl TenantManager {
|
||||
None,
|
||||
spawn_mode,
|
||||
ctx,
|
||||
)
|
||||
.map_err(|_: GlobalShutDown| {
|
||||
UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
|
||||
})?;
|
||||
);
|
||||
|
||||
TenantSlot::Attached(tenant)
|
||||
}
|
||||
@@ -1317,7 +1313,7 @@ impl TenantManager {
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)?;
|
||||
);
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
|
||||
@@ -2051,7 +2047,7 @@ impl TenantManager {
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)?;
|
||||
);
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
|
||||
@@ -2092,6 +2088,7 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
match selector {
|
||||
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
|
||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||
return ShardResolveResult::Found(tenant.clone())
|
||||
}
|
||||
@@ -2173,9 +2170,6 @@ pub(crate) enum GetActiveTenantError {
|
||||
/// never happen.
|
||||
#[error("Tenant is broken: {0}")]
|
||||
Broken(String),
|
||||
|
||||
#[error("reconnect to switch tenant id")]
|
||||
SwitchedTenant,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
||||
@@ -3,7 +3,6 @@ pub(crate) mod compaction;
|
||||
pub mod delete;
|
||||
pub(crate) mod detach_ancestor;
|
||||
mod eviction_task;
|
||||
pub(crate) mod handle;
|
||||
mod init;
|
||||
pub mod layer_manager;
|
||||
pub(crate) mod logical_size;
|
||||
@@ -18,7 +17,6 @@ use camino::Utf8Path;
|
||||
use chrono::{DateTime, Utc};
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use handle::ShardTimelineId;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{
|
||||
@@ -76,7 +74,6 @@ use crate::{
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::PersistentLayerDesc,
|
||||
},
|
||||
walredo,
|
||||
};
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
@@ -427,8 +424,6 @@ pub struct Timeline {
|
||||
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
|
||||
|
||||
pub(crate) l0_flush_global_state: L0FlushGlobalState,
|
||||
|
||||
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -534,6 +529,7 @@ impl GetVectoredError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
shard: ShardNumber,
|
||||
@@ -544,12 +540,6 @@ pub struct MissingKeyError {
|
||||
backtrace: Option<std::backtrace::Backtrace>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
@@ -1001,10 +991,7 @@ impl Timeline {
|
||||
.for_get_kind(GetKind::Singular)
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if cfg!(feature = "testing")
|
||||
&& res.is_err()
|
||||
&& !matches!(res, Err(PageReconstructError::Cancelled))
|
||||
{
|
||||
if cfg!(feature = "testing") && res.is_err() {
|
||||
// it can only be walredo issue
|
||||
use std::fmt::Write;
|
||||
|
||||
@@ -1923,9 +1910,6 @@ impl Timeline {
|
||||
tracing::debug!("Cancelling CancellationToken");
|
||||
self.cancel.cancel();
|
||||
|
||||
// Prevent new page service requests from starting.
|
||||
self.handles.shutdown();
|
||||
|
||||
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
||||
// (The deletion use case is why we can't just hook up remote_client to Self::cancel.)
|
||||
self.remote_client.stop();
|
||||
@@ -2451,8 +2435,6 @@ impl Timeline {
|
||||
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
|
||||
|
||||
l0_flush_global_state: resources.l0_flush_global_state,
|
||||
|
||||
handles: Default::default(),
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
@@ -3722,17 +3704,6 @@ impl Timeline {
|
||||
&self.shard_identity
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
|
||||
ShardTimelineId {
|
||||
shard_index: ShardIndex {
|
||||
shard_number: self.shard_identity.number,
|
||||
shard_count: self.shard_identity.count,
|
||||
},
|
||||
timeline_id: self.timeline_id,
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the latest layer for appending.
|
||||
///
|
||||
@@ -5470,22 +5441,20 @@ impl Timeline {
|
||||
} else {
|
||||
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
|
||||
};
|
||||
let res = self
|
||||
|
||||
let img = match self
|
||||
.walredo_mgr
|
||||
.as_ref()
|
||||
.context("timeline has no walredo manager")
|
||||
.map_err(PageReconstructError::WalRedo)?
|
||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||
.await;
|
||||
let img = match res {
|
||||
.await
|
||||
.context("reconstruct a page image")
|
||||
{
|
||||
Ok(img) => img,
|
||||
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||
Err(walredo::Error::Other(e)) => {
|
||||
return Err(PageReconstructError::WalRedo(
|
||||
e.context("reconstruct a page image"),
|
||||
))
|
||||
}
|
||||
Err(e) => return Err(PageReconstructError::WalRedo(e)),
|
||||
};
|
||||
|
||||
Ok(img)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,6 @@ use super::{
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
@@ -70,21 +69,17 @@ impl KeyHistoryRetention {
|
||||
self,
|
||||
key: Key,
|
||||
delta_writer: &mut Vec<(Key, Lsn, Value)>,
|
||||
mut image_writer: Option<&mut ImageLayerWriter>,
|
||||
image_writer: &mut ImageLayerWriter,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut first_batch = true;
|
||||
for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
|
||||
for (_, KeyLogAtLsn(logs)) in self.below_horizon {
|
||||
if first_batch {
|
||||
if logs.len() == 1 && logs[0].1.is_image() {
|
||||
let Value::Image(img) = &logs[0].1 else {
|
||||
unreachable!()
|
||||
};
|
||||
if let Some(image_writer) = image_writer.as_mut() {
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
} else {
|
||||
delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
|
||||
}
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
delta_writer.push((key, lsn, val));
|
||||
@@ -703,140 +698,7 @@ impl Timeline {
|
||||
|
||||
// This iterator walks through all key-value pairs from all the layers
|
||||
// we're compacting, in key, LSN order.
|
||||
// If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
|
||||
// then the Value::Image is ordered before Value::WalRecord.
|
||||
//
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
|
||||
// option and validation code once we've reached confidence.
|
||||
enum AllValuesIter<'a> {
|
||||
PageCachedBlobIo {
|
||||
all_keys_iter: VecIter<'a>,
|
||||
},
|
||||
StreamingKmergeBypassingPageCache {
|
||||
merge_iter: MergeIterator<'a>,
|
||||
},
|
||||
ValidatingStreamingKmergeBypassingPageCache {
|
||||
mode: CompactL0BypassPageCacheValidation,
|
||||
merge_iter: MergeIterator<'a>,
|
||||
all_keys_iter: VecIter<'a>,
|
||||
},
|
||||
}
|
||||
type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
|
||||
impl AllValuesIter<'_> {
|
||||
async fn next_all_keys_iter(
|
||||
iter: &mut VecIter<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
let Some(DeltaEntry {
|
||||
key,
|
||||
lsn,
|
||||
val: value_ref,
|
||||
..
|
||||
}) = iter.next()
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let value = value_ref.load(ctx).await?;
|
||||
Ok(Some((*key, *lsn, value)))
|
||||
}
|
||||
async fn next(
|
||||
&mut self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
match self {
|
||||
AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
|
||||
Self::next_all_keys_iter(iter, ctx).await
|
||||
}
|
||||
AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
|
||||
AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
|
||||
// advance both iterators
|
||||
let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
|
||||
let merge_iter_item = merge_iter.next().await;
|
||||
// compare results & log warnings as needed
|
||||
macro_rules! rate_limited_warn {
|
||||
($($arg:tt)*) => {{
|
||||
if cfg!(debug_assertions) || cfg!(feature = "testing") {
|
||||
warn!($($arg)*);
|
||||
panic!("CompactL0BypassPageCacheValidation failure, check logs");
|
||||
}
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!($($arg)*);
|
||||
});
|
||||
}}
|
||||
}
|
||||
match (&all_keys_iter_item, &merge_iter_item) {
|
||||
(Err(_), Err(_)) => {
|
||||
// don't bother asserting equivalence of the errors
|
||||
}
|
||||
(Err(all_keys), Ok(merge)) => {
|
||||
rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
|
||||
},
|
||||
(Ok(all_keys), Err(merge)) => {
|
||||
rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
|
||||
},
|
||||
(Ok(None), Ok(None)) => { }
|
||||
(Ok(Some(all_keys)), Ok(None)) => {
|
||||
rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
|
||||
}
|
||||
(Ok(None), Ok(Some(merge))) => {
|
||||
rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
|
||||
}
|
||||
(Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
|
||||
match mode {
|
||||
// TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
|
||||
CompactL0BypassPageCacheValidation::KeyLsn => {
|
||||
let all_keys = (all_keys_key, all_keys_lsn);
|
||||
let merge = (merge_key, merge_lsn);
|
||||
if all_keys != merge {
|
||||
rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
|
||||
}
|
||||
}
|
||||
CompactL0BypassPageCacheValidation::KeyLsnValue => {
|
||||
let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
|
||||
let merge = (merge_key, merge_lsn, merge_value);
|
||||
if all_keys != merge {
|
||||
rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// in case of mismatch, trust the legacy all_keys_iter_item
|
||||
all_keys_iter_item
|
||||
}.instrument(info_span!("next")).await
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
|
||||
CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
|
||||
all_keys_iter: all_keys.iter(),
|
||||
},
|
||||
CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
|
||||
let merge_iter = {
|
||||
let mut deltas = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact.iter() {
|
||||
let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
|
||||
deltas.push(l);
|
||||
}
|
||||
MergeIterator::create(&deltas, &[], ctx)
|
||||
};
|
||||
match validate {
|
||||
None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
|
||||
Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
|
||||
mode: validate.clone(),
|
||||
merge_iter,
|
||||
all_keys_iter: all_keys.iter(),
|
||||
},
|
||||
}
|
||||
}
|
||||
};
|
||||
let all_values_iter = all_keys.iter();
|
||||
|
||||
// This iterator walks through all keys and is needed to calculate size used by each key
|
||||
let mut all_keys_iter = all_keys
|
||||
@@ -909,11 +771,11 @@ impl Timeline {
|
||||
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
|
||||
let mut next_hole = 0; // index of next hole in holes vector
|
||||
|
||||
while let Some((key, lsn, value)) = all_values_iter
|
||||
.next(ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?
|
||||
for &DeltaEntry {
|
||||
key, lsn, ref val, ..
|
||||
} in all_values_iter
|
||||
{
|
||||
let value = val.load(ctx).await.map_err(CompactionError::Other)?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
@@ -1098,10 +960,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// Without this, rustc complains about deltas_to_compact still
|
||||
// being borrowed when we `.into_iter()` below.
|
||||
drop(all_values_iter);
|
||||
|
||||
Ok(CompactLevel0Phase1Result {
|
||||
new_layers,
|
||||
deltas_to_compact: deltas_to_compact
|
||||
@@ -1209,43 +1067,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum CompactL0Phase1ValueAccess {
|
||||
/// The old way.
|
||||
PageCachedBlobIo,
|
||||
/// The new way.
|
||||
StreamingKmerge {
|
||||
/// If set, we run both the old way and the new way, validate that
|
||||
/// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
|
||||
/// and if the validation fails,
|
||||
/// - in tests: fail them with a panic or
|
||||
/// - in prod, log a rate-limited warning and use the old way's results.
|
||||
///
|
||||
/// If not set, we only run the new way and trust its results.
|
||||
validate: Option<CompactL0BypassPageCacheValidation>,
|
||||
},
|
||||
}
|
||||
|
||||
/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum CompactL0BypassPageCacheValidation {
|
||||
/// Validate that the series of (key, lsn) pairs are the same.
|
||||
KeyLsn,
|
||||
/// Validate that the entire output of old and new way is identical.
|
||||
KeyLsnValue,
|
||||
}
|
||||
|
||||
impl Default for CompactL0Phase1ValueAccess {
|
||||
fn default() -> Self {
|
||||
CompactL0Phase1ValueAccess::StreamingKmerge {
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
|
||||
validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Entry point for new tiered compaction algorithm.
|
||||
///
|
||||
@@ -1333,7 +1154,6 @@ impl Timeline {
|
||||
horizon: Lsn,
|
||||
retain_lsn_below_horizon: &[Lsn],
|
||||
delta_threshold_cnt: usize,
|
||||
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
|
||||
) -> anyhow::Result<KeyHistoryRetention> {
|
||||
// Pre-checks for the invariants
|
||||
if cfg!(debug_assertions) {
|
||||
@@ -1363,7 +1183,6 @@ impl Timeline {
|
||||
);
|
||||
}
|
||||
}
|
||||
let has_ancestor = base_img_from_ancestor.is_some();
|
||||
// Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
|
||||
// and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
|
||||
let (mut split_history, lsn_split_points) = {
|
||||
@@ -1397,9 +1216,6 @@ impl Timeline {
|
||||
// For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
|
||||
// keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
|
||||
// dropped.
|
||||
//
|
||||
// TODO: in case we have both delta + images for a given LSN and it does not exceed the delta
|
||||
// threshold, we could have kept delta instead to save space. This is an optimization for the future.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -1417,13 +1233,9 @@ impl Timeline {
|
||||
"should have at least below + above horizon batches"
|
||||
);
|
||||
let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
|
||||
if let Some((key, lsn, img)) = base_img_from_ancestor {
|
||||
replay_history.push((key, lsn, Value::Image(img)));
|
||||
}
|
||||
for (i, split_for_lsn) in split_history.into_iter().enumerate() {
|
||||
// TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
|
||||
records_since_last_image += split_for_lsn.len();
|
||||
let generate_image = if i == 0 && !has_ancestor {
|
||||
let generate_image = if i == 0 {
|
||||
// We always generate images for the first batch (below horizon / lowest retain_lsn)
|
||||
true
|
||||
} else if i == batch_cnt - 1 {
|
||||
@@ -1546,25 +1358,20 @@ impl Timeline {
|
||||
retain_lsns_below_horizon.sort();
|
||||
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
|
||||
};
|
||||
let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
|
||||
Lsn(self.ancestor_lsn.0 + 1)
|
||||
} else {
|
||||
let res = retain_lsns_below_horizon
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff);
|
||||
if cfg!(debug_assertions) {
|
||||
assert_eq!(
|
||||
res,
|
||||
retain_lsns_below_horizon
|
||||
.iter()
|
||||
.min()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff)
|
||||
);
|
||||
}
|
||||
res
|
||||
};
|
||||
let lowest_retain_lsn = retain_lsns_below_horizon
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff);
|
||||
if cfg!(debug_assertions) {
|
||||
assert_eq!(
|
||||
lowest_retain_lsn,
|
||||
retain_lsns_below_horizon
|
||||
.iter()
|
||||
.min()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff)
|
||||
);
|
||||
}
|
||||
info!(
|
||||
"picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
|
||||
layer_selection.len(),
|
||||
@@ -1605,7 +1412,6 @@ impl Timeline {
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key: Option<Key> = None;
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn flush_deltas(
|
||||
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
|
||||
last_key: Key,
|
||||
@@ -1614,7 +1420,6 @@ impl Timeline {
|
||||
tline: &Arc<Timeline>,
|
||||
lowest_retain_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
last_batch: bool,
|
||||
) -> anyhow::Result<Option<ResidentLayer>> {
|
||||
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
|
||||
// overlapping layers.
|
||||
@@ -1635,7 +1440,7 @@ impl Timeline {
|
||||
*current_delta_split_point += 1;
|
||||
need_split = true;
|
||||
}
|
||||
if !need_split && !last_batch {
|
||||
if !need_split {
|
||||
return Ok(None);
|
||||
}
|
||||
let deltas = std::mem::take(deltas);
|
||||
@@ -1660,44 +1465,15 @@ impl Timeline {
|
||||
Ok(Some(delta_layer))
|
||||
}
|
||||
|
||||
// Only create image layers when there are no ancestor branches. TODO: create covering image layer
|
||||
// when some condition is met.
|
||||
let mut image_layer_writer = if self.ancestor_timeline.is_none() {
|
||||
Some(
|
||||
ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(Key::MIN..Key::MAX), // covers the full key range
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
/// Returns None if there is no ancestor branch. Returns an error when the key is not found.
|
||||
///
|
||||
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
|
||||
/// is needed for reconstruction. This should be fixed in the future.
|
||||
///
|
||||
/// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
|
||||
/// images.
|
||||
async fn get_ancestor_image(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
|
||||
if tline.ancestor_timeline.is_none() {
|
||||
return Ok(None);
|
||||
};
|
||||
// This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
|
||||
// as much existing code as possible.
|
||||
let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
|
||||
Ok(Some((key, tline.ancestor_lsn, img)))
|
||||
}
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(Key::MIN..Key::MAX), // covers the full key range
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut delta_values = Vec::new();
|
||||
let delta_split_points = delta_split_points.into_iter().collect_vec();
|
||||
@@ -1718,17 +1494,11 @@ impl Timeline {
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, *last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
&mut delta_values,
|
||||
image_layer_writer.as_mut(),
|
||||
ctx,
|
||||
)
|
||||
.pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx)
|
||||
.await?;
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
@@ -1739,7 +1509,6 @@ impl Timeline {
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
false,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
@@ -1758,17 +1527,11 @@ impl Timeline {
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
&mut delta_values,
|
||||
image_layer_writer.as_mut(),
|
||||
ctx,
|
||||
)
|
||||
.pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx)
|
||||
.await?;
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
@@ -1779,25 +1542,19 @@ impl Timeline {
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
true,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
assert!(delta_values.is_empty(), "unprocessed keys");
|
||||
|
||||
let image_layer = if let Some(writer) = image_layer_writer {
|
||||
Some(writer.finish(self, ctx).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
info!(
|
||||
"produced {} delta layers and {} image layers",
|
||||
delta_layers.len(),
|
||||
if image_layer.is_some() { 1 } else { 0 }
|
||||
1
|
||||
);
|
||||
let mut compact_to = Vec::new();
|
||||
compact_to.extend(delta_layers);
|
||||
compact_to.extend(image_layer);
|
||||
compact_to.push(image_layer);
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
@@ -1,967 +0,0 @@
|
||||
//! An efficient way to keep the timeline gate open without preventing
|
||||
//! timeline shutdown for longer than a single call to a timeline method.
|
||||
//!
|
||||
//! # Motivation
|
||||
//!
|
||||
//! On a single page service connection, we're typically serving a single TenantTimelineId.
|
||||
//!
|
||||
//! Without sharding, there is a single Timeline object to which we dispatch
|
||||
//! all requests. For example, a getpage request gets dispatched to the
|
||||
//! Timeline::get method of the Timeline object that represents the
|
||||
//! (tenant,timeline) of that connection.
|
||||
//!
|
||||
//! With sharding, for each request that comes in on the connection,
|
||||
//! we first have to perform shard routing based on the requested key (=~ page number).
|
||||
//! The result of shard routing is a Timeline object.
|
||||
//! We then dispatch the request to that Timeline object.
|
||||
//!
|
||||
//! Regardless of whether the tenant is sharded or not, we want to ensure that
|
||||
//! we hold the Timeline gate open while we're invoking the method on the
|
||||
//! Timeline object.
|
||||
//!
|
||||
//! However, we want to avoid the overhead of entering the gate for every
|
||||
//! method invocation.
|
||||
//!
|
||||
//! Further, for shard routing, we want to avoid calling the tenant manager to
|
||||
//! resolve the shard for every request. Instead, we want to cache the
|
||||
//! routing result so we can bypass the tenant manager for all subsequent requests
|
||||
//! that get routed to that shard.
|
||||
//!
|
||||
//! Regardless of how we accomplish the above, it should not
|
||||
//! prevent the Timeline from shutting down promptly.
|
||||
//!
|
||||
//! # Design
|
||||
//!
|
||||
//! There are three user-facing data structures:
|
||||
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
|
||||
//!
|
||||
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
|
||||
//!
|
||||
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
|
||||
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
|
||||
//!
|
||||
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||
//!
|
||||
//! A cache miss means we consult the tenant manager for shard routing,
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
|
||||
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
|
||||
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
|
||||
//!
|
||||
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||
//! and find the `Weak<HandleInner>` in the cache.
|
||||
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
|
||||
//!
|
||||
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||
//!
|
||||
//! # Memory Management / How The Reference Cycle Is Broken
|
||||
//!
|
||||
//! The attentive reader may have noticed the strong reference cycle
|
||||
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
|
||||
//!
|
||||
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
|
||||
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
|
||||
//!
|
||||
//! The cycle is broken by either
|
||||
//! - `PerTimelineState::shutdown` or
|
||||
//! - dropping the `Cache`.
|
||||
//!
|
||||
//! Concurrently existing `Handle`s will extend the existence of the cycle.
|
||||
//! However, since `Handle`s are short-lived and new `Handle`s are not
|
||||
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
|
||||
//! that extension of the cycle is bounded.
|
||||
//!
|
||||
//! # Fast Path for Shard Routing
|
||||
//!
|
||||
//! The `Cache` has a fast path for shard routing to avoid calling into
|
||||
//! the tenant manager for every request.
|
||||
//!
|
||||
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
|
||||
//!
|
||||
//! The current implementation uses the first entry in the hash map
|
||||
//! to determine the `ShardParameters` and derive the correct
|
||||
//! `ShardIndex` for the requested key.
|
||||
//!
|
||||
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
|
||||
//!
|
||||
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
|
||||
//! it's a hit.
|
||||
//!
|
||||
//! ## Cache invalidation
|
||||
//!
|
||||
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
|
||||
//! The only reasons why an entry in the cache can become stale are:
|
||||
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
|
||||
//! being detached, timeline or shard deleted, or pageserver is shutting down.
|
||||
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
|
||||
//!
|
||||
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
|
||||
//! timeline has shut down, and when that happens, we remove the entry from the cache.
|
||||
//!
|
||||
//! Regarding (2), the insight is that it is totally fine to keep dispatching requests
|
||||
//! to the parent shard during a shard split. Eventually, the shard split task will
|
||||
//! shut down the parent => case (1).
|
||||
|
||||
use std::collections::hash_map;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::Weak;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use tracing::instrument;
|
||||
use tracing::trace;
|
||||
use utils::id::TimelineId;
|
||||
use utils::shard::ShardIndex;
|
||||
use utils::shard::ShardNumber;
|
||||
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
|
||||
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
|
||||
pub(crate) trait Types: Sized + std::fmt::Debug {
|
||||
type TenantManagerError: Sized + std::fmt::Debug;
|
||||
type TenantManager: TenantManager<Self> + Sized;
|
||||
type Timeline: ArcTimeline<Self> + Sized;
|
||||
}
|
||||
|
||||
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
struct CacheId(u64);

impl CacheId {
    /// Allocates the next id from a process-wide counter that starts at 1.
    ///
    /// # Panics
    ///
    /// Panics if the counter hands out 0, which can only happen after the
    /// `u64` counter has wrapped around.
    fn next() -> Self {
        static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
        // Relaxed is enough: only the uniqueness of the returned value matters,
        // no other memory is published through this counter.
        let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        if id == 0 {
            panic!("CacheId::next() returned 0, overflow");
        }
        Self(id)
    }
}
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Cache<T: Types> {
|
||||
id: CacheId,
|
||||
map: Map<T>,
|
||||
}
|
||||
|
||||
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
|
||||
|
||||
impl<T: Types> Default for Cache<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
id: CacheId::next(),
|
||||
map: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
|
||||
pub(crate) struct ShardTimelineId {
|
||||
pub(crate) shard_index: ShardIndex,
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
|
||||
struct HandleInner<T: Types> {
|
||||
shut_down: AtomicBool,
|
||||
timeline: T::Timeline,
|
||||
// The timeline's gate held open.
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
|
||||
///
|
||||
/// See module-level comment for details.
|
||||
pub struct PerTimelineState<T: Types> {
|
||||
// None = shutting down
|
||||
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
|
||||
}
|
||||
|
||||
impl<T: Types> Default for PerTimelineState<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
handles: Mutex::new(Some(Default::default())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Abstract view of [`crate::tenant::mgr`], for testability.
|
||||
pub(crate) trait TenantManager<T: Types> {
|
||||
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
|
||||
/// Errors are returned as [`GetError::TenantManager`].
|
||||
async fn resolve(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> Result<T::Timeline, T::TenantManagerError>;
|
||||
}
|
||||
|
||||
/// Abstract view of an [`Arc<Timeline>`], for testability.
|
||||
pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate;
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId;
|
||||
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||
fn per_timeline_state(&self) -> &PerTimelineState<T>;
|
||||
}
|
||||
|
||||
/// Errors returned by [`Cache::get`].
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum GetError<T: Types> {
|
||||
TenantManager(T::TenantManagerError),
|
||||
TimelineGateClosed,
|
||||
PerTimelineStateShutDown,
|
||||
}
|
||||
|
||||
/// Internal type used in [`Cache::get`].
|
||||
enum RoutingResult<T: Types> {
|
||||
FastPath(Handle<T>),
|
||||
SlowPath(ShardTimelineId),
|
||||
NeedConsultTenantManager,
|
||||
}
|
||||
|
||||
impl<T: Types> Cache<T> {
|
||||
/// See module-level comment for details.
|
||||
///
|
||||
/// Does NOT check for the shutdown state of [`Types::Timeline`].
|
||||
/// Instead, the methods of [`Types::Timeline`] that are invoked through
|
||||
/// the [`Handle`] are responsible for checking these conditions
|
||||
/// and if so, return an error that causes the page service to
|
||||
/// close the connection.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(crate) async fn get(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
// terminates because each iteration removes an element from the map
|
||||
loop {
|
||||
let handle = self
|
||||
.get_impl(timeline_id, shard_selector, tenant_manager)
|
||||
.await?;
|
||||
if handle.0.shut_down.load(Ordering::Relaxed) {
|
||||
let removed = self
|
||||
.map
|
||||
.remove(&handle.0.timeline.shard_timeline_id())
|
||||
.expect("invariant of get_impl is that the returned handle is in the map");
|
||||
assert!(
|
||||
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
|
||||
"shard_timeline_id() incorrect?"
|
||||
);
|
||||
} else {
|
||||
return Ok(handle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
async fn get_impl(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
let miss: ShardSelector = {
|
||||
let routing_state = self.shard_routing(timeline_id, shard_selector);
|
||||
match routing_state {
|
||||
RoutingResult::FastPath(handle) => return Ok(handle),
|
||||
RoutingResult::SlowPath(key) => match self.map.get(&key) {
|
||||
Some(cached) => match cached.upgrade() {
|
||||
Some(upgraded) => return Ok(Handle(upgraded)),
|
||||
None => {
|
||||
trace!("handle cache stale");
|
||||
self.map.remove(&key).unwrap();
|
||||
ShardSelector::Known(key.shard_index)
|
||||
}
|
||||
},
|
||||
None => ShardSelector::Known(key.shard_index),
|
||||
},
|
||||
RoutingResult::NeedConsultTenantManager => shard_selector,
|
||||
}
|
||||
};
|
||||
self.get_miss(timeline_id, miss, tenant_manager).await
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn shard_routing(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> RoutingResult<T> {
|
||||
loop {
|
||||
// terminates because when every iteration we remove an element from the map
|
||||
let Some((first_key, first_handle)) = self.map.iter().next() else {
|
||||
return RoutingResult::NeedConsultTenantManager;
|
||||
};
|
||||
let Some(first_handle) = first_handle.upgrade() else {
|
||||
// TODO: dedup with get()
|
||||
trace!("handle cache stale");
|
||||
let first_key_owned = *first_key;
|
||||
self.map.remove(&first_key_owned).unwrap();
|
||||
continue;
|
||||
};
|
||||
|
||||
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
|
||||
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
|
||||
shard_number: shard_num,
|
||||
shard_count: first_handle_shard_identity.count,
|
||||
};
|
||||
|
||||
let need_idx = match shard_selector {
|
||||
ShardSelector::Page(key) => {
|
||||
make_shard_index(first_handle_shard_identity.get_shard_number(&key))
|
||||
}
|
||||
ShardSelector::Zero => make_shard_index(ShardNumber(0)),
|
||||
ShardSelector::Known(shard_idx) => shard_idx,
|
||||
};
|
||||
let need_shard_timeline_id = ShardTimelineId {
|
||||
shard_index: need_idx,
|
||||
timeline_id,
|
||||
};
|
||||
let first_handle_shard_timeline_id = ShardTimelineId {
|
||||
shard_index: first_handle_shard_identity.shard_index(),
|
||||
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
|
||||
};
|
||||
|
||||
if need_shard_timeline_id == first_handle_shard_timeline_id {
|
||||
return RoutingResult::FastPath(Handle(first_handle));
|
||||
} else {
|
||||
return RoutingResult::SlowPath(need_shard_timeline_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
#[inline(always)]
|
||||
async fn get_miss(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
match tenant_manager.resolve(timeline_id, shard_selector).await {
|
||||
Ok(timeline) => {
|
||||
let key = timeline.shard_timeline_id();
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
|
||||
ShardSelector::Page(_) => (), // gotta trust tenant_manager
|
||||
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||
}
|
||||
|
||||
let gate_guard = match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
};
|
||||
trace!("creating new HandleInner");
|
||||
let handle = Arc::new(
|
||||
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
|
||||
// so we can identify reference cycle bugs.
|
||||
HandleInner {
|
||||
shut_down: AtomicBool::new(false),
|
||||
_gate_guard: gate_guard,
|
||||
timeline: timeline.clone(),
|
||||
},
|
||||
);
|
||||
let handle = {
|
||||
let mut lock_guard = timeline
|
||||
.per_timeline_state()
|
||||
.handles
|
||||
.lock()
|
||||
.expect("mutex poisoned");
|
||||
match &mut *lock_guard {
|
||||
Some(per_timeline_state) => {
|
||||
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
|
||||
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||
match self.map.entry(key) {
|
||||
hash_map::Entry::Occupied(_o) => {
|
||||
// This cannot not happen because
|
||||
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
|
||||
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
|
||||
// while we were waiting for the tenant manager.
|
||||
unreachable!()
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(Arc::downgrade(&handle));
|
||||
handle
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return Err(GetError::PerTimelineStateShutDown);
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Handle(handle))
|
||||
}
|
||||
Err(e) => Err(GetError::TenantManager(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> PerTimelineState<T> {
|
||||
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
|
||||
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||
/// Even if [`TenantManager::resolve`] would still resolve to it.
|
||||
///
|
||||
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
|
||||
/// That's ok because they're short-lived. See module-level comment for details.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(super) fn shutdown(&self) {
|
||||
let handles = self
|
||||
.handles
|
||||
.lock()
|
||||
.expect("mutex poisoned")
|
||||
// NB: this .take() sets locked to None.
|
||||
// That's what makes future `Cache::get` misses fail.
|
||||
// Cache hits are taken care of below.
|
||||
.take();
|
||||
let Some(handles) = handles else {
|
||||
trace!("already shut down");
|
||||
return;
|
||||
};
|
||||
for handle in handles.values() {
|
||||
// Make hits fail.
|
||||
handle.shut_down.store(true, Ordering::Relaxed);
|
||||
}
|
||||
drop(handles);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0.timeline
|
||||
}
|
||||
}
|
||||
|
||||
// Test-only destructor: emits a trace event whenever a `HandleInner` is dropped.
#[cfg(test)]
impl<T: Types> Drop for HandleInner<T> {
    fn drop(&mut self) {
        trace!("HandleInner dropped");
    }
}
|
||||
|
||||
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
|
||||
impl<T: Types> Drop for Cache<T> {
|
||||
fn drop(&mut self) {
|
||||
for (_, weak) in self.map.drain() {
|
||||
if let Some(strong) = weak.upgrade() {
|
||||
// handle is still being kept alive in PerTimelineState
|
||||
let timeline = strong.timeline.per_timeline_state();
|
||||
let mut handles = timeline.handles.lock().expect("mutex poisoned");
|
||||
if let Some(handles) = &mut *handles {
|
||||
let Some(removed) = handles.remove(&self.id) else {
|
||||
// There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
|
||||
continue;
|
||||
};
|
||||
assert!(Arc::ptr_eq(&removed, &strong));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use pageserver_api::{
        key::{rel_block_to_key, Key, DBDIR_KEY},
        models::ShardParameters,
        reltag::RelTag,
        shard::ShardStripeSize,
    };
    use utils::shard::ShardCount;

    use super::*;

    /// Sleep duration used as the "never fires" arm of `select!`.
    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);

    #[derive(Debug)]
    struct TestTypes;
    impl Types for TestTypes {
        type TenantManagerError = anyhow::Error;
        type TenantManager = StubManager;
        type Timeline = Arc<StubTimeline>;
    }

    /// Tenant manager stand-in that resolves against a fixed list of timelines.
    struct StubManager {
        shards: Vec<Arc<StubTimeline>>,
    }

    /// Timeline stand-in; `myself` lets tests check which timeline a handle points at.
    struct StubTimeline {
        gate: utils::sync::gate::Gate,
        id: TimelineId,
        shard: ShardIdentity,
        per_timeline_state: PerTimelineState<TestTypes>,
        myself: Weak<StubTimeline>,
    }

    impl StubTimeline {
        fn getpage(&self) {
            // do nothing
        }
    }

    /// Builds a stub timeline whose `myself` field points back at its own `Arc`.
    fn stub_timeline(id: TimelineId, shard: ShardIdentity) -> Arc<StubTimeline> {
        Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id,
            shard,
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        })
    }

    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
        fn gate(&self) -> &utils::sync::gate::Gate {
            &self.gate
        }

        fn shard_timeline_id(&self) -> ShardTimelineId {
            ShardTimelineId {
                shard_index: self.shard.shard_index(),
                timeline_id: self.id,
            }
        }

        fn get_shard_identity(&self) -> &ShardIdentity {
            &self.shard
        }

        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
            &self.per_timeline_state
        }
    }

    impl TenantManager<TestTypes> for StubManager {
        async fn resolve(
            &self,
            timeline_id: TimelineId,
            shard_selector: ShardSelector,
        ) -> anyhow::Result<Arc<StubTimeline>> {
            for timeline in &self.shards {
                if timeline.id == timeline_id {
                    match &shard_selector {
                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Zero => continue,
                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Page(_) => continue,
                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
                            return Ok(Arc::clone(timeline));
                        }
                        ShardSelector::Known(_) => continue,
                    }
                }
            }
            anyhow::bail!("not found")
        }
    }

    #[tokio::test(start_paused = true)]
    async fn test_timeline_shutdown() {
        crate::tenant::harness::setup_logging();

        let timeline_id = TimelineId::generate();
        let shard0 = stub_timeline(timeline_id, ShardIdentity::unsharded());
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();

        //
        // fill the cache
        //
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (2, 1),
            "strong: shard0, mgr; weak: myself"
        );

        let handle: Handle<_> = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        let handle_inner_weak = Arc::downgrade(&handle.0);
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
        assert_eq!(
            (
                Weak::strong_count(&handle_inner_weak),
                Weak::weak_count(&handle_inner_weak)
            ),
            (2, 2),
            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
        );
        assert_eq!(cache.map.len(), 1);

        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
        );
        drop(handle);
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
        );

        //
        // demonstrate that Handle holds up gate closure
        // but shutdown prevents new handles from being handed out
        //

        tokio::select! {
            _ = shard0.gate.close() => {
                panic!("cache and per-timeline handler state keep gate open");
            }
            _ = tokio::time::sleep(FOREVER) => {
                // NB: first poll of close() makes it enter closing state
            }
        }

        let handle = cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have the timeline");
        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));

        // SHUTDOWN
        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown

        assert_eq!(
            1,
            Weak::strong_count(&handle_inner_weak),
            "through local var handle"
        );
        assert_eq!(
            cache.map.len(),
            1,
            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
        );
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (3, 1),
            "strong: handleinner(via handle), shard0, mgr; weak: myself"
        );

        // this handle is perfectly usable
        handle.getpage();

        cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
        assert_eq!(
            cache.map.len(),
            0,
            "first access after shutdown cleans up the Weak's from the cache"
        );

        tokio::select! {
            _ = shard0.gate.close() => {
                panic!("handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }

        drop(handle);
        assert_eq!(
            0,
            Weak::strong_count(&handle_inner_weak),
            "the HandleInner destructor already ran"
        );
        assert_eq!(
            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
            (2, 1),
            "strong: shard0, mgr; weak: myself"
        );

        // closing gate succeeds after dropping handle
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("handle is dropped, no other gate holders exist")
            }
        }

        // map gets cleaned on next lookup
        cache
            .get(timeline_id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown");
        assert_eq!(cache.map.len(), 0);

        // ensure all refs to shard0 are gone and we're not leaking anything
        let myself = Weak::clone(&shard0.myself);
        drop(shard0);
        drop(mgr);
        assert_eq!(Weak::strong_count(&myself), 0);
    }

    #[tokio::test]
    async fn test_multiple_timelines_and_deletion() {
        crate::tenant::harness::setup_logging();

        let timeline_a = TimelineId::generate();
        let timeline_b = TimelineId::generate();
        assert_ne!(timeline_a, timeline_b);
        let timeline_a = stub_timeline(timeline_a, ShardIdentity::unsharded());
        let timeline_b = stub_timeline(timeline_b, ShardIdentity::unsharded());
        let mut mgr = StubManager {
            shards: vec![timeline_a.clone(), timeline_b.clone()],
        };
        let key = DBDIR_KEY;

        let mut cache = Cache::<TestTypes>::default();

        cache
            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have it");
        cache
            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we have it");
        assert_eq!(cache.map.len(), 2);

        // delete timeline A
        timeline_a.per_timeline_state.shutdown();
        mgr.shards.retain(|t| t.id != timeline_a.id);
        assert!(
            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
                .await
                .is_err(),
            "broken StubManager implementation"
        );

        assert_eq!(
            cache.map.len(),
            2,
            "cache still has a Weak handle to Timeline A"
        );
        cache
            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
            .await
            .err()
            .expect("documented behavior: can't get new handle after shutdown");
        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");

        cache
            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
            .await
            .expect("we still have it");
    }

    /// Builds a relation block key from `shard`'s number multiplied by the stripe size.
    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
        rel_block_to_key(
            RelTag {
                spcnode: 1663,
                dbnode: 208101,
                relnode: 2620,
                forknum: 0,
            },
            shard.0 as u32 * params.stripe_size.0,
        )
    }

    #[tokio::test(start_paused = true)]
    async fn test_shard_split() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let parent = stub_timeline(timeline_id, ShardIdentity::unsharded());
        let child_params = ShardParameters {
            count: ShardCount(2),
            stripe_size: ShardStripeSize::default(),
        };
        let child0 = stub_timeline(
            timeline_id,
            ShardIdentity::from_params(ShardNumber(0), &child_params),
        );
        let child1 = stub_timeline(
            timeline_id,
            ShardIdentity::from_params(ShardNumber(1), &child_params),
        );
        let child_shards_by_shard_number = [child0.clone(), child1.clone()];

        let mut cache = Cache::<TestTypes>::default();

        // fill the cache with the parent
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![parent.clone()],
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(&handle.myself, &parent.myself),
                "mgr returns parent first"
            );
            drop(handle);
        }

        //
        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
        //

        // while we haven't shut down the parent, the cache will return the cached parent, even
        // if the tenant manager returns the child
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(&handle.myself, &parent.myself),
                "mgr returns parent"
            );
            drop(handle);
        }

        let parent_handle = cache
            .get(
                timeline_id,
                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
                &StubManager {
                    shards: vec![parent.clone()],
                },
            )
            .await
            .expect("we have it");
        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));

        // invalidate the cache
        parent.per_timeline_state.shutdown();

        // the cache will now return the child, even though the parent handle still exists
        for i in 0..2 {
            let handle = cache
                .get(
                    timeline_id,
                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
                    &StubManager {
                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
                    },
                )
                .await
                .expect("we have it");
            assert!(
                Weak::ptr_eq(
                    &handle.myself,
                    &child_shards_by_shard_number[i as usize].myself
                ),
                "mgr returns child"
            );
            drop(handle);
        }

        // all the while the parent handle kept the parent gate open
        tokio::select! {
            _ = parent_handle.gate.close() => {
                panic!("parent handle is keeping gate open");
            }
            _ = tokio::time::sleep(FOREVER) => { }
        }
        drop(parent_handle);
        tokio::select! {
            _ = parent.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("parent handle is dropped, no other gate holders exist")
            }
        }
    }

    #[tokio::test(start_paused = true)]
    async fn test_connection_handler_exit() {
        crate::tenant::harness::setup_logging();
        let timeline_id = TimelineId::generate();
        let shard0 = stub_timeline(timeline_id, ShardIdentity::unsharded());
        let mgr = StubManager {
            shards: vec![shard0.clone()],
        };
        let key = DBDIR_KEY;

        // Simulate 10 connections that's opened, used, and closed
        let mut used_handles = vec![];
        for _ in 0..10 {
            let mut cache = Cache::<TestTypes>::default();
            let handle = {
                let handle = cache
                    .get(timeline_id, ShardSelector::Page(key), &mgr)
                    .await
                    .expect("we have the timeline");
                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
                handle
            };
            handle.getpage();
            used_handles.push(Arc::downgrade(&handle.0));
        }

        // No handles exist, thus gates are closed and don't require shutdown
        assert!(used_handles
            .iter()
            .all(|weak| Weak::strong_count(weak) == 0));

        // ... thus the gate should close immediately, even without shutdown
        tokio::select! {
            _ = shard0.gate.close() => { }
            _ = tokio::time::sleep(FOREVER) => {
                panic!("handle is dropped, no other gate holders exist")
            }
        }
    }
}
|
||||
@@ -241,9 +241,6 @@ impl PostgresRedoManager {
|
||||
|
||||
/// Shut down the WAL redo manager.
|
||||
///
|
||||
/// Returns `true` if this call was the one that initiated shutdown.
|
||||
/// `true` may be observed by no caller if the first caller stops polling.
|
||||
///
|
||||
/// After this future completes
|
||||
/// - no redo process is running
|
||||
/// - no new redo process will be spawned
|
||||
@@ -253,32 +250,22 @@ impl PostgresRedoManager {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn shutdown(&self) -> bool {
|
||||
pub async fn shutdown(&self) {
|
||||
// prevent new processes from being spawned
|
||||
let maybe_permit = match self.redo_process.get_or_init_detached().await {
|
||||
let permit = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => {
|
||||
if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
|
||||
None
|
||||
} else {
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
Some(permit)
|
||||
}
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
permit
|
||||
}
|
||||
Err(permit) => Some(permit),
|
||||
};
|
||||
let it_was_us = if let Some(permit) = maybe_permit {
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
Err(permit) => permit,
|
||||
};
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
// wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
|
||||
// we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
|
||||
// for the underlying process.
|
||||
self.launched_processes.close().await;
|
||||
it_was_us
|
||||
}
|
||||
|
||||
/// This type doesn't have its own background task to check for idleness: we
|
||||
|
||||
@@ -67,7 +67,6 @@ FALLBACK_DURATION = {
|
||||
"test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
|
||||
"test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
|
||||
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
|
||||
"test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
|
||||
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
|
||||
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
|
||||
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
|
||||
|
||||
@@ -53,6 +53,7 @@ diesel = { version = "2.1.4", features = [
|
||||
] }
|
||||
diesel_migrations = { version = "2.1.0" }
|
||||
r2d2 = { version = "0.8.10" }
|
||||
dns-lookup = { version = "2.0.4" }
|
||||
|
||||
utils = { path = "../libs/utils/" }
|
||||
metrics = { path = "../libs/metrics/" }
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
DROP TABLE leader;
|
||||
@@ -0,0 +1,6 @@
|
||||
CREATE TABLE leader (
|
||||
hostname VARCHAR NOT NULL,
|
||||
port INTEGER NOT NULL,
|
||||
started_at TIMESTAMPTZ NOT NULL,
|
||||
PRIMARY KEY(hostname, port, started_at)
|
||||
);
|
||||
@@ -10,6 +10,7 @@ mod id_lock_map;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
mod pageserver_client;
|
||||
mod peer_client;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
mod scheduler;
|
||||
|
||||
@@ -81,6 +81,9 @@ struct Cli {
|
||||
#[arg(long, default_value = "5s")]
|
||||
db_connect_timeout: humantime::Duration,
|
||||
|
||||
#[arg(long, default_value = "false")]
|
||||
start_as_candidate: bool,
|
||||
|
||||
/// `neon_local` sets this to the path of the neon_local repo dir.
|
||||
/// Only relevant for testing.
|
||||
// TODO: make `cfg(feature = "testing")`
|
||||
@@ -273,6 +276,8 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
.unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
|
||||
split_threshold: args.split_threshold,
|
||||
neon_local_repo_dir: args.neon_local_repo_dir,
|
||||
start_as_candidate: args.start_as_candidate,
|
||||
http_service_port: args.listen.port() as i32,
|
||||
};
|
||||
|
||||
// After loading secrets & config, but before starting anything else, apply database migrations
|
||||
|
||||
104
storage_controller/src/peer_client.rs
Normal file
104
storage_controller/src/peer_client.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
use crate::tenant_shard::ObservedState;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use reqwest::{StatusCode, Url};
|
||||
use utils::{backoff, http::error::HttpErrorBody};
|
||||
|
||||
/// HTTP client used to talk to another storage controller instance (a "peer").
///
/// The peer is addressed by `hostname` and `port`; both are interpolated into the
/// request URL when a request is issued.
// NOTE(review): `Debug` is derived, so debug-printing this struct would include the
// `jwt` value verbatim — confirm that it is never logged.
#[derive(Debug, Clone)]
pub(crate) struct PeerClient {
    /// Host part of the peer's address.
    hostname: String,
    /// Port part of the peer's address.
    port: i32,
    /// Optional token; when present it is sent as an `Authorization: Bearer <jwt>` header.
    jwt: Option<String>,
    /// Underlying `reqwest` client used to issue requests.
    client: reqwest::Client,
}
|
||||
|
||||
/// Errors produced while talking to a storage controller peer.
#[derive(thiserror::Error, Debug)]
pub(crate) enum StorageControllerPeerError {
    /// A response body could not be deserialized. Carries the response status, the
    /// request URL and the underlying `reqwest` error.
    // NOTE(review): the message mentions an "error response", but this variant is also
    // used when the body of a successful response fails to deserialize.
    #[error("failed to deserialize error response with status code {0} at {1}: {2}")]
    DeserializationError(StatusCode, Url, reqwest::Error),
    /// The peer replied with a client or server error status whose body was parsed
    /// as an `HttpErrorBody`; carries the status and the message from that body.
    #[error("storage controller peer API error ({0}): {1}")]
    ApiError(StatusCode, String),
    /// Sending the request failed.
    #[error("failed to send HTTP request: {0}")]
    SendError(reqwest::Error),
    /// The operation was cancelled before it produced a result.
    #[error("Cancelled")]
    Cancelled,
}
|
||||
|
||||
/// Result alias for peer client operations, with [`StorageControllerPeerError`] as the error type.
pub(crate) type Result<T> = std::result::Result<T, StorageControllerPeerError>;
|
||||
|
||||
/// Extension trait that turns an HTTP-level error response into a
/// [`StorageControllerPeerError`].
pub(crate) trait ResponseErrorMessageExt: Sized {
    /// Consume `self` and resolve to either `Ok(Self)` or an error derived from the
    /// response. The returned future is required to be `Send`.
    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
}
|
||||
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
let url = self.url().to_owned();
|
||||
Err(match self.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg),
|
||||
Err(err) => StorageControllerPeerError::DeserializationError(status, url, err),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Observed state for every known tenant shard, keyed by [`TenantShardId`].
///
/// Serializable so that it can be carried in the body of a step-down response.
#[derive(Serialize, Deserialize, Debug, Default)]
pub(crate) struct GlobalObservedState(pub(crate) HashMap<TenantShardId, ObservedState>);
|
||||
|
||||
impl PeerClient {
|
||||
pub(crate) fn new(hostname: String, port: i32, jwt: Option<String>) -> Self {
|
||||
Self {
|
||||
hostname,
|
||||
port,
|
||||
jwt,
|
||||
client: reqwest::Client::new(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn request_step_down(&self) -> Result<GlobalObservedState> {
|
||||
let uri = format!("{}:{}/control/v1/step_down", self.hostname, self.port);
|
||||
let req = self.client.put(uri);
|
||||
let req = if let Some(jwt) = &self.jwt {
|
||||
req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}"))
|
||||
} else {
|
||||
req
|
||||
};
|
||||
|
||||
let res = req
|
||||
.send()
|
||||
.await
|
||||
.map_err(StorageControllerPeerError::SendError)?;
|
||||
let response = res.error_from_body().await?;
|
||||
|
||||
let status = response.status();
|
||||
let url = response.url().to_owned();
|
||||
|
||||
response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err))
|
||||
}
|
||||
|
||||
pub(crate) async fn step_down(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<GlobalObservedState> {
|
||||
backoff::retry(
|
||||
|| self.request_step_down(),
|
||||
|_e| false,
|
||||
4,
|
||||
8,
|
||||
"Send step down request",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| StorageControllerPeerError::Cancelled)
|
||||
.and_then(|x| x)
|
||||
}
|
||||
}
|
||||
@@ -95,6 +95,8 @@ pub(crate) enum DatabaseOperation {
|
||||
ListMetadataHealth,
|
||||
ListMetadataHealthUnhealthy,
|
||||
ListMetadataHealthOutdated,
|
||||
GetLeader,
|
||||
UpdateLeader,
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
@@ -785,6 +787,71 @@ impl Persistence {
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Get the current entry from the `leader` table if one exists.
|
||||
/// It is an error for the table to contain more than one entry.
|
||||
pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<LeaderPersistence>> {
|
||||
let mut leader: Vec<LeaderPersistence> = self
|
||||
.with_measured_conn(
|
||||
DatabaseOperation::GetLeader,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::leader::table.load::<LeaderPersistence>(conn)?)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
if leader.len() > 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"More than one entry present in the leader table: {leader:?}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(leader.pop())
|
||||
}
|
||||
|
||||
/// Update the new leader with compare-exchange semantics. If `prev` does not
|
||||
/// match the current leader entry, then the update is treated as a failure.
|
||||
/// When `prev` is not specified, the update is forced.
|
||||
pub(crate) async fn update_leader(
|
||||
&self,
|
||||
prev: Option<LeaderPersistence>,
|
||||
new: LeaderPersistence,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::leader::dsl::*;
|
||||
|
||||
let updated = self
|
||||
.with_measured_conn(
|
||||
DatabaseOperation::UpdateLeader,
|
||||
move |conn| -> DatabaseResult<usize> {
|
||||
let updated = match &prev {
|
||||
Some(prev) => diesel::update(leader)
|
||||
.filter(hostname.eq(prev.hostname.clone()))
|
||||
.filter(port.eq(prev.port))
|
||||
.filter(started_at.eq(prev.started_at))
|
||||
.set((
|
||||
hostname.eq(new.hostname.clone()),
|
||||
port.eq(new.port),
|
||||
started_at.eq(new.started_at),
|
||||
))
|
||||
.execute(conn)?,
|
||||
None => diesel::insert_into(leader)
|
||||
.values(new.clone())
|
||||
.execute(conn)?,
|
||||
};
|
||||
|
||||
Ok(updated)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
if updated == 0 {
|
||||
return Err(DatabaseError::Logical(
|
||||
"Leader table update failed".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
||||
@@ -910,3 +977,13 @@ impl From<MetadataHealthPersistence> for MetadataHealthRecord {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A row of the `leader` table, mapped to `crate::schema::leader` for Diesel
/// queries and inserts.
#[derive(
    Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone,
)]
#[diesel(table_name = crate::schema::leader)]
pub(crate) struct LeaderPersistence {
    /// Value of the `hostname` column.
    pub(crate) hostname: String,
    /// Value of the `port` column.
    pub(crate) port: i32,
    /// Value of the `started_at` column (UTC timestamp).
    pub(crate) started_at: chrono::DateTime<chrono::Utc>,
}
|
||||
|
||||
@@ -656,8 +656,11 @@ impl Reconciler {
|
||||
// reconcile this location. This includes locations with different configurations, as well
|
||||
// as locations with unknown (None) observed state.
|
||||
|
||||
// Incrementing generation is the safe general case, but is inefficient for changes that only
|
||||
// modify some details (e.g. the tenant's config).
|
||||
// The general case is to increment the generation. However, there are cases
|
||||
// where this is not necessary:
|
||||
// - if we are only updating the TenantConf part of the location
|
||||
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
|
||||
// and the location was already in the correct generation
|
||||
let increment_generation = match observed {
|
||||
None => true,
|
||||
Some(ObservedStateLocation { conf: None }) => true,
|
||||
@@ -666,11 +669,18 @@ impl Reconciler {
|
||||
}) => {
|
||||
let generations_match = observed.generation == wanted_conf.generation;
|
||||
|
||||
// We may skip incrementing the generation if the location is already in the expected mode and
|
||||
// generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
|
||||
// but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
|
||||
// after a restart/crash, so fall back to the universally safe path of incrementing generation.
|
||||
!generations_match || (observed.mode != wanted_conf.mode)
|
||||
use LocationConfigMode::*;
|
||||
let mode_transition_requires_gen_inc =
|
||||
match (observed.mode, wanted_conf.mode) {
|
||||
// Usually the short-lived attachment modes (multi and stale) are only used
|
||||
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
|
||||
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
|
||||
(AttachedSingle, AttachedStale) => false,
|
||||
(AttachedMulti, AttachedSingle) => false,
|
||||
(lhs, rhs) => lhs != rhs,
|
||||
};
|
||||
|
||||
!generations_match || mode_transition_requires_gen_inc
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
// @generated automatically by Diesel CLI.
|
||||
|
||||
diesel::table! {
|
||||
leader (hostname, port, started_at) {
|
||||
hostname -> Varchar,
|
||||
port -> Int4,
|
||||
started_at -> Timestamptz,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
metadata_health (tenant_id, shard_number, shard_count) {
|
||||
tenant_id -> Varchar,
|
||||
@@ -36,4 +44,4 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
|
||||
diesel::allow_tables_to_appear_in_same_query!(leader, metadata_health, nodes, tenant_shards,);
|
||||
|
||||
@@ -16,7 +16,10 @@ use crate::{
|
||||
compute_hook::NotifyError,
|
||||
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
|
||||
metrics::LeadershipStatusGroup,
|
||||
persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
|
||||
peer_client::{GlobalObservedState, PeerClient},
|
||||
persistence::{
|
||||
AbortShardSplitStatus, LeaderPersistence, MetadataHealthPersistence, TenantFilter,
|
||||
},
|
||||
reconciler::{ReconcileError, ReconcileUnits},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
|
||||
tenant_shard::{
|
||||
@@ -82,7 +85,6 @@ use crate::{
|
||||
ReconcilerWaiter, TenantShard,
|
||||
},
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// For operations that should be quick, like attaching a new tenant
|
||||
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
@@ -223,6 +225,7 @@ impl ServiceState {
|
||||
tenants: BTreeMap<TenantShardId, TenantShard>,
|
||||
scheduler: Scheduler,
|
||||
delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
|
||||
initial_leadership_status: LeadershipStatus,
|
||||
) -> Self {
|
||||
let status = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
@@ -230,15 +233,13 @@ impl ServiceState {
|
||||
|
||||
status.set(
|
||||
LeadershipStatusGroup {
|
||||
status: LeadershipStatus::Leader,
|
||||
status: initial_leadership_status,
|
||||
},
|
||||
1,
|
||||
);
|
||||
|
||||
Self {
|
||||
// TODO: Starting up as Leader is a transient state. Once we enable rolling
|
||||
// upgrades on the k8s side, we should start up as Candidate.
|
||||
leadership_status: LeadershipStatus::Leader,
|
||||
leadership_status: initial_leadership_status,
|
||||
tenants,
|
||||
nodes: Arc::new(nodes),
|
||||
scheduler,
|
||||
@@ -287,6 +288,33 @@ impl ServiceState {
|
||||
0,
|
||||
);
|
||||
}
|
||||
|
||||
fn become_leader(&mut self) {
|
||||
self.leadership_status = LeadershipStatus::Leader;
|
||||
|
||||
let status = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_leadership_status;
|
||||
|
||||
status.set(
|
||||
LeadershipStatusGroup {
|
||||
status: LeadershipStatus::Leader,
|
||||
},
|
||||
1,
|
||||
);
|
||||
status.set(
|
||||
LeadershipStatusGroup {
|
||||
status: LeadershipStatus::SteppedDown,
|
||||
},
|
||||
0,
|
||||
);
|
||||
status.set(
|
||||
LeadershipStatusGroup {
|
||||
status: LeadershipStatus::Candidate,
|
||||
},
|
||||
0,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -323,6 +351,10 @@ pub struct Config {
|
||||
|
||||
// TODO: make this cfg(feature = "testing")
|
||||
pub neon_local_repo_dir: Option<PathBuf>,
|
||||
|
||||
pub start_as_candidate: bool,
|
||||
|
||||
pub http_service_port: i32,
|
||||
}
|
||||
|
||||
impl From<DatabaseError> for ApiError {
|
||||
@@ -490,9 +522,10 @@ pub(crate) enum ReconcileResultRequest {
|
||||
Stop,
|
||||
}
|
||||
|
||||
// TODO: move this into the storcon peer client when that gets added
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);
|
||||
/// Result of a successful step-down request to the previously registered leader.
struct LeaderStepDownState {
    /// Observed state returned by that leader in its step-down response.
    observed: GlobalObservedState,
    /// The `leader` table entry describing the instance that was asked to step down.
    leader: LeaderPersistence,
}
|
||||
|
||||
impl Service {
|
||||
pub fn get_config(&self) -> &Config {
|
||||
@@ -504,15 +537,11 @@ impl Service {
|
||||
#[instrument(skip_all)]
|
||||
async fn startup_reconcile(
|
||||
self: &Arc<Service>,
|
||||
leader_step_down_state: Option<LeaderStepDownState>,
|
||||
bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
|
||||
Result<(), (TenantShardId, NotifyError)>,
|
||||
>,
|
||||
) {
|
||||
// For all tenant shards, a vector of observed states on nodes (where None means
|
||||
// indeterminate, same as in [`ObservedStateLocation`])
|
||||
let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
|
||||
HashMap::new();
|
||||
|
||||
// Startup reconciliation does I/O to other services: whether they
|
||||
// are responsive or not, we should aim to finish within our deadline, because:
|
||||
// - If we don't, a k8s readiness hook watching /ready will kill us.
|
||||
@@ -526,26 +555,29 @@ impl Service {
|
||||
.checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
|
||||
.expect("Reconcile timeout is a modest constant");
|
||||
|
||||
let (observed, current_leader) = if let Some(state) = leader_step_down_state {
|
||||
tracing::info!(
|
||||
"Using observed received from leader at {}:{}",
|
||||
state.leader.hostname,
|
||||
state.leader.port
|
||||
);
|
||||
(state.observed, Some(state.leader))
|
||||
} else {
|
||||
(
|
||||
self.build_global_observed_state(node_scan_deadline).await,
|
||||
None,
|
||||
)
|
||||
};
|
||||
|
||||
// Accumulate a list of any tenant locations that ought to be detached
|
||||
let mut cleanup = Vec::new();
|
||||
|
||||
let node_listings = self.scan_node_locations(node_scan_deadline).await;
|
||||
// Send initial heartbeat requests to nodes that replied to the location listing above.
|
||||
let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await;
|
||||
|
||||
for (node_id, list_response) in node_listings {
|
||||
let tenant_shards = list_response.tenant_shards;
|
||||
tracing::info!(
|
||||
"Received {} shard statuses from pageserver {}, setting it to Active",
|
||||
tenant_shards.len(),
|
||||
node_id
|
||||
);
|
||||
|
||||
for (tenant_shard_id, conf_opt) in tenant_shards {
|
||||
let shard_observations = observed.entry(tenant_shard_id).or_default();
|
||||
shard_observations.push((node_id, conf_opt));
|
||||
}
|
||||
}
|
||||
// Send initial heartbeat requests to all nodes loaded from the database
|
||||
let all_nodes = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
locked.nodes.clone()
|
||||
};
|
||||
let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
|
||||
|
||||
// List of tenants for which we will attempt to notify compute of their location at startup
|
||||
let mut compute_notifications = Vec::new();
|
||||
@@ -568,17 +600,16 @@ impl Service {
|
||||
}
|
||||
*nodes = Arc::new(new_nodes);
|
||||
|
||||
for (tenant_shard_id, shard_observations) in observed {
|
||||
for (node_id, observed_loc) in shard_observations {
|
||||
let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
|
||||
cleanup.push((tenant_shard_id, node_id));
|
||||
continue;
|
||||
};
|
||||
tenant_shard
|
||||
.observed
|
||||
.locations
|
||||
.insert(node_id, ObservedStateLocation { conf: observed_loc });
|
||||
}
|
||||
for (tenant_shard_id, observed_state) in observed.0 {
|
||||
let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
|
||||
for node_id in observed_state.locations.keys() {
|
||||
cleanup.push((tenant_shard_id, *node_id));
|
||||
}
|
||||
|
||||
continue;
|
||||
};
|
||||
|
||||
tenant_shard.observed = observed_state;
|
||||
}
|
||||
|
||||
// Populate each tenant's intent state
|
||||
@@ -612,6 +643,22 @@ impl Service {
|
||||
tenants.len()
|
||||
};
|
||||
|
||||
// Before making any observable changes to the cluster, persist self
|
||||
// as leader in database and memory.
|
||||
|
||||
let proposed_leader = self.get_proposed_leader_info();
|
||||
|
||||
if let Err(err) = self
|
||||
.persistence
|
||||
.update_leader(current_leader, proposed_leader)
|
||||
.await
|
||||
{
|
||||
tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
self.inner.write().unwrap().become_leader();
|
||||
|
||||
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
|
||||
// generation_pageserver in the database.
|
||||
|
||||
@@ -777,6 +824,31 @@ impl Service {
|
||||
node_results
|
||||
}
|
||||
|
||||
async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState {
|
||||
let node_listings = self.scan_node_locations(deadline).await;
|
||||
let mut observed = GlobalObservedState::default();
|
||||
|
||||
for (node_id, location_confs) in node_listings {
|
||||
tracing::info!(
|
||||
"Received {} shard statuses from pageserver {}",
|
||||
location_confs.tenant_shards.len(),
|
||||
node_id
|
||||
);
|
||||
|
||||
for (tid, location_conf) in location_confs.tenant_shards {
|
||||
let entry = observed.0.entry(tid).or_default();
|
||||
entry.locations.insert(
|
||||
node_id,
|
||||
ObservedStateLocation {
|
||||
conf: location_conf,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
observed
|
||||
}
|
||||
|
||||
/// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers.
|
||||
///
|
||||
/// This is safe to run in the background, because if we don't have this TenantShardId in our map of
|
||||
@@ -1255,12 +1327,20 @@ impl Service {
|
||||
config.max_warming_up_interval,
|
||||
cancel.clone(),
|
||||
);
|
||||
|
||||
let initial_leadership_status = if config.start_as_candidate {
|
||||
LeadershipStatus::Candidate
|
||||
} else {
|
||||
LeadershipStatus::Leader
|
||||
};
|
||||
|
||||
let this = Arc::new(Self {
|
||||
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
|
||||
nodes,
|
||||
tenants,
|
||||
scheduler,
|
||||
delayed_reconcile_rx,
|
||||
initial_leadership_status,
|
||||
))),
|
||||
config: config.clone(),
|
||||
persistence,
|
||||
@@ -1329,7 +1409,16 @@ impl Service {
|
||||
return;
|
||||
};
|
||||
|
||||
this.startup_reconcile(bg_compute_notify_result_tx).await;
|
||||
let leadership_status = this.inner.read().unwrap().get_leadership_status();
|
||||
let peer_observed_state = match leadership_status {
|
||||
LeadershipStatus::Candidate => this.request_step_down().await,
|
||||
LeadershipStatus::Leader => None,
|
||||
LeadershipStatus::SteppedDown => unreachable!(),
|
||||
};
|
||||
|
||||
this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
|
||||
.await;
|
||||
|
||||
drop(startup_completion);
|
||||
}
|
||||
});
|
||||
@@ -6179,4 +6268,88 @@ impl Service {
|
||||
|
||||
global_observed
|
||||
}
|
||||
|
||||
/// Collect the details for the current proccess wishing to become the storage controller
|
||||
/// leader.
|
||||
///
|
||||
/// On failures to discover and resolve the hostname the process is killed and we rely on k8s to retry.
|
||||
fn get_proposed_leader_info(&self) -> LeaderPersistence {
|
||||
let hostname = match dns_lookup::get_hostname() {
|
||||
Ok(name) => name,
|
||||
Err(err) => {
|
||||
tracing::error!("Failed to discover hostname: {err}. Aborting start-up ...");
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let mut addrs = match dns_lookup::lookup_host(&hostname) {
|
||||
Ok(addrs) => addrs,
|
||||
Err(err) => {
|
||||
tracing::error!("Failed to resolve hostname: {err}. Aborting start-up ...");
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let addr = addrs
|
||||
.pop()
|
||||
.expect("k8s configured hostname always resolves");
|
||||
|
||||
let proposed = LeaderPersistence {
|
||||
hostname: addr.to_string(),
|
||||
port: self.get_config().http_service_port,
|
||||
started_at: chrono::Utc::now(),
|
||||
};
|
||||
|
||||
tracing::info!("Proposed leader details are: {proposed:?}");
|
||||
|
||||
proposed
|
||||
}
|
||||
|
||||
/// Request step down from the currently registered leader in the database
|
||||
///
|
||||
/// If such an entry is persisted, the success path returns the observed
|
||||
/// state and details of the leader. Otherwise, None is returned indicating
|
||||
/// there is no leader currently.
|
||||
///
|
||||
/// On failures to query the database or step down error responses the process is killed
|
||||
/// and we rely on k8s to retry.
|
||||
async fn request_step_down(&self) -> Option<LeaderStepDownState> {
|
||||
let leader = match self.persistence.get_leader().await {
|
||||
Ok(leader) => leader,
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
"Failed to query database for current leader: {err}. Aborting start-up ..."
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
match leader {
|
||||
Some(leader) => {
|
||||
// TODO: jwt token
|
||||
let client = PeerClient::new(
|
||||
leader.hostname.to_owned(),
|
||||
leader.port,
|
||||
self.config.jwt_token.clone(),
|
||||
);
|
||||
let state = client.step_down(&self.cancel).await;
|
||||
match state {
|
||||
Ok(state) => Some(LeaderStepDownState {
|
||||
observed: state,
|
||||
leader: leader.clone(),
|
||||
}),
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
"Leader ({}:{}) did not respond to step-down request: {}",
|
||||
leader.hostname,
|
||||
leader.port,
|
||||
err
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,8 +19,8 @@ use utils::id::TenantId;
|
||||
|
||||
use crate::{
|
||||
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
|
||||
init_remote_generic, list_objects_with_retries_generic,
|
||||
metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
|
||||
init_remote, init_remote_generic, list_objects_with_retries,
|
||||
metadata_stream::{stream_tenant_timelines, stream_tenants},
|
||||
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
|
||||
};
|
||||
|
||||
@@ -153,7 +153,7 @@ async fn find_garbage_inner(
|
||||
node_kind: NodeKind,
|
||||
) -> anyhow::Result<GarbageList> {
|
||||
// Construct clients for S3 and for Console API
|
||||
let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
|
||||
let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
|
||||
let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
|
||||
|
||||
// Build a set of console-known tenants, for quickly eliminating known-active tenants without having
|
||||
@@ -179,7 +179,7 @@ async fn find_garbage_inner(
|
||||
|
||||
// Enumerate Tenants in S3, and check if each one exists in Console
|
||||
tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
|
||||
let tenants = stream_tenants_generic(&remote_client, &target);
|
||||
let tenants = stream_tenants(&s3_client, &target);
|
||||
let tenants_checked = tenants.map_ok(|t| {
|
||||
let api_client = cloud_admin_api_client.clone();
|
||||
let console_cache = console_cache.clone();
|
||||
@@ -237,26 +237,25 @@ async fn find_garbage_inner(
|
||||
// Special case: If it's missing in console, check for known bugs that would enable us to conclusively
|
||||
// identify it as purge-able anyway
|
||||
if console_result.is_none() {
|
||||
let timelines =
|
||||
stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
|
||||
.await?
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id)
|
||||
.await?
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
if timelines.is_empty() {
|
||||
// No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
|
||||
let tenant_objects = list_objects_with_retries_generic(
|
||||
&remote_client,
|
||||
ListingMode::WithDelimiter,
|
||||
let tenant_objects = list_objects_with_retries(
|
||||
&s3_client,
|
||||
&target.tenant_root(&tenant_shard_id),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let object = tenant_objects.keys.first().unwrap();
|
||||
if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
|
||||
let object = tenant_objects.contents.as_ref().unwrap().first().unwrap();
|
||||
if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") {
|
||||
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
|
||||
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
|
||||
continue;
|
||||
} else {
|
||||
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
|
||||
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap());
|
||||
}
|
||||
} else {
|
||||
// A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
|
||||
@@ -265,18 +264,24 @@ async fn find_garbage_inner(
|
||||
|
||||
for timeline_r in timelines {
|
||||
let timeline = timeline_r?;
|
||||
let timeline_objects = list_objects_with_retries_generic(
|
||||
&remote_client,
|
||||
ListingMode::WithDelimiter,
|
||||
let timeline_objects = list_objects_with_retries(
|
||||
&s3_client,
|
||||
&target.timeline_root(&timeline),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
if !timeline_objects.prefixes.is_empty() {
|
||||
if timeline_objects
|
||||
.common_prefixes
|
||||
.as_ref()
|
||||
.map(|v| v.len())
|
||||
.unwrap_or(0)
|
||||
> 0
|
||||
{
|
||||
// Sub-paths? Unexpected
|
||||
any_non_initdb = true;
|
||||
} else {
|
||||
let object = timeline_objects.keys.first().unwrap();
|
||||
if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
|
||||
let object = timeline_objects.contents.as_ref().unwrap().first().unwrap();
|
||||
if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") {
|
||||
tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
|
||||
} else {
|
||||
any_non_initdb = true;
|
||||
@@ -331,8 +336,7 @@ async fn find_garbage_inner(
|
||||
|
||||
// Construct a stream of all timelines within active tenants
|
||||
let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
|
||||
let timelines =
|
||||
active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
|
||||
let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
|
||||
let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
|
||||
let timelines = timelines.try_flatten();
|
||||
|
||||
|
||||
@@ -427,7 +427,6 @@ async fn list_objects_with_retries(
|
||||
Err(anyhow!("unreachable unless MAX_RETRIES==0"))
|
||||
}
|
||||
|
||||
/// Listing possibly large amounts of keys in a streaming fashion.
|
||||
fn stream_objects_with_retries<'a>(
|
||||
storage_client: &'a GenericRemoteStorage,
|
||||
listing_mode: ListingMode,
|
||||
@@ -466,45 +465,6 @@ fn stream_objects_with_retries<'a>(
|
||||
}
|
||||
}
|
||||
|
||||
/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
|
||||
/// use [`stream_objects_with_retries`] instead.
|
||||
async fn list_objects_with_retries_generic(
|
||||
remote_client: &GenericRemoteStorage,
|
||||
listing_mode: ListingMode,
|
||||
s3_target: &S3Target,
|
||||
) -> anyhow::Result<Listing> {
|
||||
let cancel = CancellationToken::new();
|
||||
let prefix_str = &s3_target
|
||||
.prefix_in_bucket
|
||||
.strip_prefix("/")
|
||||
.unwrap_or(&s3_target.prefix_in_bucket);
|
||||
let prefix = RemotePath::from_string(prefix_str)?;
|
||||
for trial in 0..MAX_RETRIES {
|
||||
match remote_client
|
||||
.list(Some(&prefix), listing_mode, None, &cancel)
|
||||
.await
|
||||
{
|
||||
Ok(response) => return Ok(response),
|
||||
Err(e) => {
|
||||
if trial == MAX_RETRIES - 1 {
|
||||
return Err(e)
|
||||
.with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
|
||||
}
|
||||
error!(
|
||||
"list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
|
||||
s3_target.bucket_name,
|
||||
s3_target.prefix_in_bucket,
|
||||
s3_target.delimiter,
|
||||
DisplayErrorContext(e),
|
||||
);
|
||||
let backoff_time = 1 << trial.max(5);
|
||||
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
panic!("MAX_RETRIES is not allowed to be 0");
|
||||
}
|
||||
|
||||
async fn download_object_with_retries(
|
||||
s3_client: &Client,
|
||||
bucket_name: &str,
|
||||
|
||||
@@ -189,63 +189,6 @@ pub async fn stream_tenant_timelines<'a>(
|
||||
})
|
||||
}
|
||||
|
||||
/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
|
||||
/// using a listing. The listing is done before the stream is built, so that this
|
||||
/// function can be used to generate concurrency on a stream using buffer_unordered.
|
||||
pub async fn stream_tenant_timelines_generic<'a>(
|
||||
remote_client: &'a GenericRemoteStorage,
|
||||
target: &'a RootTarget,
|
||||
tenant: TenantShardId,
|
||||
) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
|
||||
let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
|
||||
let timelines_target = target.timelines_root(&tenant);
|
||||
|
||||
let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
|
||||
remote_client,
|
||||
ListingMode::WithDelimiter,
|
||||
&timelines_target
|
||||
));
|
||||
loop {
|
||||
tracing::debug!("Listing in {tenant}");
|
||||
let fetch_response = match objects_stream.next().await {
|
||||
None => break,
|
||||
Some(Err(e)) => {
|
||||
timeline_ids.push(Err(e));
|
||||
break;
|
||||
}
|
||||
Some(Ok(r)) => r,
|
||||
};
|
||||
|
||||
let new_entry_ids = fetch_response
|
||||
.prefixes
|
||||
.iter()
|
||||
.filter_map(|prefix| -> Option<&str> {
|
||||
prefix
|
||||
.get_path()
|
||||
.as_str()
|
||||
.strip_prefix(&timelines_target.prefix_in_bucket)?
|
||||
.strip_suffix('/')
|
||||
})
|
||||
.map(|entry_id_str| {
|
||||
entry_id_str
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
|
||||
});
|
||||
|
||||
for i in new_entry_ids {
|
||||
timeline_ids.push(i);
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!("Yielding for {}", tenant);
|
||||
Ok(stream! {
|
||||
for i in timeline_ids {
|
||||
let id = i?;
|
||||
yield Ok(TenantShardTimelineId::new(tenant, id));
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn stream_listing<'a>(
|
||||
s3_client: &'a Client,
|
||||
target: &'a S3Target,
|
||||
|
||||
@@ -6,8 +6,21 @@ from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str):
|
||||
assert mode == "normal" or mode == "with_snapshots"
|
||||
@pytest.mark.timeout(10000)
|
||||
def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
|
||||
"""
|
||||
Test that GC is able to collect all old layers even if they are forming
|
||||
"stairs" and there are not three delta layers since last image layer.
|
||||
|
||||
Information about image layers needed to collect old layers should
|
||||
be propagated by GC to the compaction task, which should take it into account
|
||||
when deciding which new image layers need to be created.
|
||||
|
||||
NB: this test demonstrates the problem. The source tree contained the
|
||||
`gc_feedback` mechanism for about 9 months, but, there were problems
|
||||
with it and it wasn't enabled at runtime.
|
||||
This PR removed the code: https://github.com/neondatabase/neon/pull/6863
|
||||
"""
|
||||
env = neon_env_builder.init_start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
@@ -61,9 +74,6 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
|
||||
|
||||
physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
|
||||
log.info(f"Physical storage size {physical_size}")
|
||||
if mode == "with_snapshots":
|
||||
if step == n_steps / 2:
|
||||
env.neon_cli.create_branch("child")
|
||||
|
||||
max_num_of_deltas_above_image = 0
|
||||
max_total_num_of_deltas = 0
|
||||
@@ -139,37 +149,3 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
|
||||
log.info(f"Writing layer map to {layer_map_path}")
|
||||
with layer_map_path.open("w") as f:
|
||||
f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
|
||||
|
||||
|
||||
@pytest.mark.timeout(10000)
def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
    """
    Check that GC is able to collect all old layers even when they form
    "stairs" and there are not three delta layers since the last image layer.

    GC should propagate the information about which image layers are needed
    to collect old layers to the compaction task, which should take it into
    account when deciding which new image layers need to be created.

    NB: this test demonstrates the problem. The source tree contained the
    `gc_feedback` mechanism for about 9 months, but there were problems
    with it and it wasn't enabled at runtime.
    This PR removed the code: https://github.com/neondatabase/neon/pull/6863

    And the bottom-most GC-compaction epic resolves the problem.
    https://github.com/neondatabase/neon/issues/8002
    """
    gc_feedback_impl(neon_env_builder, zenbenchmark, mode="normal")
|
||||
|
||||
|
||||
@pytest.mark.timeout(10000)
def test_gc_feedback_with_snapshots(
    neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
):
    """
    Variant of `test_gc_feedback` that creates a branch without written data
    (= a snapshot) in the middle of the benchmark. The bottom-most compaction
    should then collect as much garbage as possible below the GC horizon.

    Ideally, there should be images (in an image layer) covering the full range
    at the branch point, and images covering the full key range (in a delta
    layer) at the GC horizon.
    """
    gc_feedback_impl(neon_env_builder, zenbenchmark, mode="with_snapshots")
|
||||
|
||||
@@ -277,12 +277,8 @@ files:
|
||||
help: 'Bytes between received and replayed LSN'
|
||||
key_labels:
|
||||
values: [replication_delay_bytes]
|
||||
# We use a GREATEST call here because this calculation can be negative.
|
||||
# The calculation is not atomic, meaning after we've gotten the receive
|
||||
# LSN, the replay LSN may have advanced past the receive LSN we
|
||||
# are using for the calculation.
|
||||
query: |
|
||||
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
|
||||
SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes;
|
||||
|
||||
- metric_name: replication_delay_seconds
|
||||
type: gauge
|
||||
|
||||
Reference in New Issue
Block a user