mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 09:30:37 +00:00
Compare commits
46 Commits
tokio-unst
...
jcsp/tenan
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0baf91fcac | ||
|
|
cd27f42839 | ||
|
|
edf303ff44 | ||
|
|
ec189338eb | ||
|
|
e22f9b9fbb | ||
|
|
83fc486636 | ||
|
|
3afec731dc | ||
|
|
559611f9a6 | ||
|
|
d56dad3824 | ||
|
|
547acb6653 | ||
|
|
5264ffc2a9 | ||
|
|
ee841708ff | ||
|
|
f356df1860 | ||
|
|
fbe1da2981 | ||
|
|
22c94d99f5 | ||
|
|
6de414b5a5 | ||
|
|
8c0ce9723a | ||
|
|
5a4c371e94 | ||
|
|
65096ac992 | ||
|
|
049cb1fb4b | ||
|
|
640350a2c0 | ||
|
|
d422105d88 | ||
|
|
ead1931167 | ||
|
|
e0ebdfc7ce | ||
|
|
c508d3b5fa | ||
|
|
acda65d7d4 | ||
|
|
378daa358b | ||
|
|
85f4514e7d | ||
|
|
f70019797c | ||
|
|
325258413a | ||
|
|
4ddbc0e46d | ||
|
|
a673e4e7a9 | ||
|
|
c155cc0c3f | ||
|
|
32126d705b | ||
|
|
5683ae9eab | ||
|
|
4778b6a12e | ||
|
|
8b8be7bed4 | ||
|
|
a461c459d8 | ||
|
|
4ae2d1390d | ||
|
|
c5949e1fd6 | ||
|
|
127837abb0 | ||
|
|
b2c96047d0 | ||
|
|
44202eeb3b | ||
|
|
4bef977c56 | ||
|
|
a0b862a8bd | ||
|
|
767ef29390 |
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -5,4 +5,6 @@ self-hosted-runner:
|
||||
- small
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
|
||||
@@ -203,6 +203,10 @@ runs:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
|
||||
run: |
|
||||
if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
|
||||
|
||||
./scripts/pysync
|
||||
|
||||
12
.github/workflows/build_and_test.yml
vendored
12
.github/workflows/build_and_test.yml
vendored
@@ -340,11 +340,11 @@ jobs:
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER=neon-github-sandbox
|
||||
export REMOTE_STORAGE_AZURE_REGION=eastus2
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
|
||||
@@ -433,7 +433,7 @@ jobs:
|
||||
rerun_flaky: true
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
@@ -468,7 +468,7 @@ jobs:
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
@@ -847,7 +847,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.18.1
|
||||
VM_BUILDER_VERSION: v0.18.2
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
@@ -221,8 +221,21 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
generation: 0,
|
||||
});
|
||||
|
||||
if attach_req.pageserver_id.is_some() {
|
||||
if let Some(attaching_pageserver) = attach_req.pageserver_id.as_ref() {
|
||||
tenant_state.generation += 1;
|
||||
tracing::info!(
|
||||
"attach_hook: issuing generation {} to pageserver {}",
|
||||
attaching_pageserver,
|
||||
tenant_state.generation
|
||||
);
|
||||
} else if let Some(ps_id) = tenant_state.pageserver {
|
||||
tracing::info!(
|
||||
"attach_hook: dropping pageserver {} in generation {}",
|
||||
ps_id,
|
||||
tenant_state.generation
|
||||
);
|
||||
} else {
|
||||
tracing::info!("attach_hook: no-op: tenant already has no pageserver");
|
||||
}
|
||||
tenant_state.pageserver = attach_req.pageserver_id;
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
|
||||
/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
|
||||
// FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
|
||||
// Then we could replace the custo Ord and PartialOrd implementations below with
|
||||
// deriving them.
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
|
||||
// Then we could replace the custom Ord and PartialOrd implementations below with
|
||||
// deriving them. This will require changes in walredoproc.c.
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: Oid,
|
||||
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {
|
||||
|
||||
impl Ord for RelTag {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let mut cmp = self.spcnode.cmp(&other.spcnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.dbnode.cmp(&other.dbnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.relnode.cmp(&other.relnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.forknum.cmp(&other.forknum);
|
||||
|
||||
cmp
|
||||
// Custom ordering where we put forknum to the end of the list
|
||||
let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
|
||||
(self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -73,6 +73,8 @@ pub mod completion;
|
||||
/// Reporting utilities
|
||||
pub mod error;
|
||||
|
||||
pub mod sync;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
1
libs/utils/src/sync.rs
Normal file
1
libs/utils/src/sync.rs
Normal file
@@ -0,0 +1 @@
|
||||
pub mod heavier_once_cell;
|
||||
306
libs/utils/src/sync/heavier_once_cell.rs
Normal file
306
libs/utils/src/sync/heavier_once_cell.rs
Normal file
@@ -0,0 +1,306 @@
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
||||
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
|
||||
/// for the duration of initialization.
|
||||
///
|
||||
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
|
||||
///
|
||||
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
|
||||
pub struct OnceCell<T> {
|
||||
inner: Mutex<Inner<T>>,
|
||||
}
|
||||
|
||||
impl<T> Default for OnceCell<T> {
|
||||
/// Create new uninitialized [`OnceCell`].
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Semaphore is the current state:
|
||||
/// - open semaphore means the value is `None`, not yet initialized
|
||||
/// - closed semaphore means the value has been initialized
|
||||
#[derive(Debug)]
|
||||
struct Inner<T> {
|
||||
init_semaphore: Arc<Semaphore>,
|
||||
value: Option<T>,
|
||||
}
|
||||
|
||||
impl<T> Default for Inner<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
init_semaphore: Arc::new(Semaphore::new(1)),
|
||||
value: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> OnceCell<T> {
|
||||
/// Creates an already initialized `OnceCell` with the given value.
|
||||
pub fn new(value: T) -> Self {
|
||||
let sem = Semaphore::new(1);
|
||||
sem.close();
|
||||
Self {
|
||||
inner: Mutex::new(Inner {
|
||||
init_semaphore: Arc::new(sem),
|
||||
value: Some(value),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, or uniquely initializes the value before
|
||||
/// returning the guard.
|
||||
///
|
||||
/// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
|
||||
///
|
||||
/// Initialization is panic-safe and cancellation-safe.
|
||||
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
|
||||
where
|
||||
F: FnOnce() -> Fut,
|
||||
Fut: std::future::Future<Output = Result<T, E>>,
|
||||
{
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
let permit = sem.acquire_owned().await;
|
||||
if permit.is_err() {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
} else {
|
||||
// now we try
|
||||
let value = factory().await?;
|
||||
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_none(),
|
||||
"we won permit, must not be initialized"
|
||||
);
|
||||
guard.value = Some(value);
|
||||
guard.init_semaphore.close();
|
||||
Ok(Guard(guard))
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, if any.
|
||||
pub fn get(&self) -> Option<Guard<'_, T>> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
Some(Guard(guard))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
|
||||
/// initialized value.
|
||||
#[derive(Debug)]
|
||||
pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
|
||||
|
||||
impl<T> std::ops::Deref for Guard<'_, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0
|
||||
.value
|
||||
.as_ref()
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> std::ops::DerefMut for Guard<'_, T> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.0
|
||||
.value
|
||||
.as_mut()
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Guard<'a, T> {
|
||||
/// Take the current value, and a new permit for it's deinitialization.
|
||||
///
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
|
||||
let mut swapped = Inner::default();
|
||||
let permit = swapped
|
||||
.init_semaphore
|
||||
.clone()
|
||||
.try_acquire_owned()
|
||||
.expect("we just created this");
|
||||
std::mem::swap(&mut *self.0, &mut swapped);
|
||||
swapped
|
||||
.value
|
||||
.map(|v| (v, permit))
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn many_initializers() {
|
||||
#[derive(Default, Debug)]
|
||||
struct Counters {
|
||||
factory_got_to_run: AtomicUsize,
|
||||
future_polled: AtomicUsize,
|
||||
winners: AtomicUsize,
|
||||
}
|
||||
|
||||
let initializers = 100;
|
||||
|
||||
let cell = Arc::new(OnceCell::default());
|
||||
let counters = Arc::new(Counters::default());
|
||||
let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
|
||||
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
for i in 0..initializers {
|
||||
js.spawn({
|
||||
let cell = cell.clone();
|
||||
let counters = counters.clone();
|
||||
let barrier = barrier.clone();
|
||||
|
||||
async move {
|
||||
barrier.wait().await;
|
||||
let won = {
|
||||
let g = cell
|
||||
.get_or_init(|| {
|
||||
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
|
||||
async {
|
||||
counters.future_polled.fetch_add(1, Ordering::Relaxed);
|
||||
Ok::<_, Infallible>(i)
|
||||
}
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
*g == i
|
||||
};
|
||||
|
||||
if won {
|
||||
counters.winners.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
barrier.wait().await;
|
||||
|
||||
while let Some(next) = js.join_next().await {
|
||||
next.expect("no panics expected");
|
||||
}
|
||||
|
||||
let mut counters = Arc::try_unwrap(counters).unwrap();
|
||||
|
||||
assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
|
||||
assert_eq!(*counters.future_polled.get_mut(), 1);
|
||||
assert_eq!(*counters.winners.get_mut(), 1);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn reinit_waits_for_deinit() {
|
||||
// with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
|
||||
let sleep_for = Duration::from_secs(1);
|
||||
let initial = 42;
|
||||
let reinit = 1;
|
||||
let cell = Arc::new(OnceCell::new(initial));
|
||||
|
||||
let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
|
||||
|
||||
let jh = tokio::spawn({
|
||||
let cell = cell.clone();
|
||||
let deinitialization_started = deinitialization_started.clone();
|
||||
async move {
|
||||
let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
|
||||
assert_eq!(answer, initial);
|
||||
|
||||
deinitialization_started.wait().await;
|
||||
tokio::time::sleep(sleep_for).await;
|
||||
}
|
||||
});
|
||||
|
||||
deinitialization_started.wait().await;
|
||||
|
||||
let started_at = tokio::time::Instant::now();
|
||||
cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let elapsed = started_at.elapsed();
|
||||
assert!(
|
||||
elapsed >= sleep_for,
|
||||
"initialization should had taken at least the time time slept with permit"
|
||||
);
|
||||
|
||||
jh.await.unwrap();
|
||||
|
||||
assert_eq!(*cell.get().unwrap(), reinit);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn initialization_attemptable_until_ok() {
|
||||
let cell = OnceCell::default();
|
||||
|
||||
for _ in 0..10 {
|
||||
cell.get_or_init(|| async { Err("whatever error") })
|
||||
.await
|
||||
.unwrap_err();
|
||||
}
|
||||
|
||||
let g = cell
|
||||
.get_or_init(|| async { Ok::<_, Infallible>("finally success") })
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(*g, "finally success");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn initialization_is_cancellation_safe() {
|
||||
let cell = OnceCell::default();
|
||||
|
||||
let barrier = tokio::sync::Barrier::new(2);
|
||||
|
||||
let initializer = cell.get_or_init(|| async {
|
||||
barrier.wait().await;
|
||||
futures::future::pending::<()>().await;
|
||||
|
||||
Ok::<_, Infallible>("never reached")
|
||||
});
|
||||
|
||||
tokio::select! {
|
||||
_ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
|
||||
_ = barrier.wait() => {}
|
||||
};
|
||||
|
||||
// now initializer is dropped
|
||||
|
||||
assert!(cell.get().is_none());
|
||||
|
||||
let g = cell
|
||||
.get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(*g, "now initialized");
|
||||
}
|
||||
}
|
||||
@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::tenant::TenantSharedResources;
|
||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -408,7 +408,7 @@ fn start_pageserver(
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let deletion_queue_client = deletion_queue.new_client();
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
conf,
|
||||
TenantSharedResources {
|
||||
broker_client: broker_client.clone(),
|
||||
@@ -418,6 +418,7 @@ fn start_pageserver(
|
||||
order,
|
||||
shutdown_pageserver.clone(),
|
||||
))?;
|
||||
let tenant_manager = Arc::new(tenant_manager);
|
||||
|
||||
BACKGROUND_RUNTIME.spawn({
|
||||
let init_done_rx = init_done_rx;
|
||||
@@ -523,6 +524,18 @@ fn start_pageserver(
|
||||
}
|
||||
});
|
||||
|
||||
let secondary_controller = if let Some(remote_storage) = &remote_storage {
|
||||
secondary::spawn_tasks(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
remote_storage.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
shutdown_pageserver.clone(),
|
||||
)
|
||||
} else {
|
||||
secondary::null_controller()
|
||||
};
|
||||
|
||||
// shared state between the disk-usage backed eviction background task and the http endpoint
|
||||
// that allows triggering disk-usage based eviction manually. note that the http endpoint
|
||||
// is still accessible even if background task is not configured as long as remote storage has
|
||||
@@ -534,6 +547,7 @@ fn start_pageserver(
|
||||
conf,
|
||||
remote_storage.clone(),
|
||||
disk_usage_eviction_state.clone(),
|
||||
tenant_manager.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
)?;
|
||||
}
|
||||
@@ -546,11 +560,13 @@ fn start_pageserver(
|
||||
let router_state = Arc::new(
|
||||
http::routes::State::new(
|
||||
conf,
|
||||
tenant_manager,
|
||||
http_auth.clone(),
|
||||
remote_storage.clone(),
|
||||
broker_client.clone(),
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue.new_client(),
|
||||
secondary_controller,
|
||||
)
|
||||
.context("Failed to initialize router state")?,
|
||||
);
|
||||
|
||||
@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
|
||||
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
|
||||
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
|
||||
// We can put in some prioritization for consumption metrics.
|
||||
// Same for the loop that fetches computed metrics.
|
||||
|
||||
@@ -206,7 +206,6 @@ pub(super) async fn collect_all_metrics(
|
||||
None
|
||||
} else {
|
||||
crate::tenant::mgr::get_tenant(id, true)
|
||||
.await
|
||||
.ok()
|
||||
.map(|tenant| (id, tenant))
|
||||
}
|
||||
|
||||
2
pageserver/src/deletion_queue/check.log
Normal file
2
pageserver/src/deletion_queue/check.log
Normal file
@@ -0,0 +1,2 @@
|
||||
Checking pageserver v0.1.0 (/home/neon/neon/pageserver)
|
||||
Finished dev [optimized + debuginfo] target(s) in 7.62s
|
||||
@@ -48,19 +48,26 @@ use std::{
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||
use utils::completion;
|
||||
use utils::serde_percent::Percent;
|
||||
use utils::{
|
||||
completion,
|
||||
id::{TenantId, TenantTimelineId},
|
||||
};
|
||||
use utils::{id::TimelineId, serde_percent::Percent};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
|
||||
tenant::{
|
||||
mgr::TenantManager,
|
||||
secondary::SecondaryTenant,
|
||||
storage_layer::{AsLayerDesc, EvictionError, Layer},
|
||||
Timeline,
|
||||
},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -83,6 +90,7 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
conf: &'static PageServerConf,
|
||||
storage: GenericRemoteStorage,
|
||||
state: Arc<State>,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
background_jobs_barrier: completion::Barrier,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
@@ -108,8 +116,7 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
_ = background_jobs_barrier.wait() => { }
|
||||
};
|
||||
|
||||
disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
|
||||
.await;
|
||||
disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
@@ -121,8 +128,8 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
async fn disk_usage_eviction_task(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
storage: GenericRemoteStorage,
|
||||
tenants_dir: &Utf8Path,
|
||||
_storage: &GenericRemoteStorage,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
scopeguard::defer! {
|
||||
@@ -145,14 +152,9 @@ async fn disk_usage_eviction_task(
|
||||
let start = Instant::now();
|
||||
|
||||
async {
|
||||
let res = disk_usage_eviction_task_iteration(
|
||||
state,
|
||||
task_config,
|
||||
&storage,
|
||||
tenants_dir,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
let res =
|
||||
disk_usage_eviction_task_iteration(state, task_config, &tenant_manager, &cancel)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
@@ -183,13 +185,14 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
|
||||
async fn disk_usage_eviction_task_iteration(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenants_dir: &Utf8Path,
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
let tenants_dir = tenant_manager.get_conf().tenants_path();
|
||||
let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
|
||||
.context("get filesystem-level disk usage before evictions")?;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
|
||||
let res =
|
||||
disk_usage_eviction_task_iteration_impl(state, usage_pre, tenant_manager, cancel).await;
|
||||
match res {
|
||||
Ok(outcome) => {
|
||||
debug!(?outcome, "disk_usage_eviction_iteration finished");
|
||||
@@ -199,7 +202,7 @@ async fn disk_usage_eviction_task_iteration(
|
||||
}
|
||||
IterationOutcome::Finished(outcome) => {
|
||||
// Verify with statvfs whether we made any real progress
|
||||
let after = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
let after = filesystem_level_usage::get(&tenants_dir, task_config)
|
||||
// It's quite unlikely to hit the error here. Keep the code simple and bail out.
|
||||
.context("get filesystem-level disk usage after evictions")?;
|
||||
|
||||
@@ -273,8 +276,8 @@ struct LayerCount {
|
||||
|
||||
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
state: &State,
|
||||
storage: &GenericRemoteStorage,
|
||||
usage_pre: U,
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<IterationOutcome<U>> {
|
||||
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
|
||||
@@ -294,7 +297,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
"running disk usage based eviction due to pressure"
|
||||
);
|
||||
|
||||
let candidates = match collect_eviction_candidates(cancel).await? {
|
||||
let candidates = match collect_eviction_candidates(tenant_manager, cancel).await? {
|
||||
EvictionCandidates::Cancelled => {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
@@ -330,9 +333,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// If we get far enough in the list that we start to evict layers that are below
|
||||
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
|
||||
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
|
||||
let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
|
||||
|
||||
// Evictions for attached tenants, batched by timeline
|
||||
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
|
||||
|
||||
// Evictions for secondary locations, batched by tenant
|
||||
let mut secondary_by_tenant: HashMap<TenantId, Vec<(TimelineId, Layer)>> = HashMap::new();
|
||||
|
||||
let mut warned = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
let mut max_batch_size = 0;
|
||||
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
|
||||
if !usage_planned.has_pressure() {
|
||||
debug!(
|
||||
@@ -349,10 +359,26 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
|
||||
|
||||
batched
|
||||
.entry(TimelineKey(candidate.timeline))
|
||||
.or_default()
|
||||
.push(candidate.layer);
|
||||
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
|
||||
// tasks to evict all seen layers until we have evicted enough
|
||||
|
||||
match candidate.source {
|
||||
EvictionCandidateSource::Attached(timeline) => {
|
||||
let batch = batched.entry(TimelineKey(timeline)).or_default();
|
||||
|
||||
// semaphore will later be used to limit eviction concurrency, and we can express at
|
||||
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
|
||||
// but fail gracefully by not making batches larger.
|
||||
if batch.len() < u32::MAX as usize {
|
||||
batch.push(candidate.layer);
|
||||
max_batch_size = max_batch_size.max(batch.len());
|
||||
}
|
||||
}
|
||||
EvictionCandidateSource::Secondary(ttid) => {
|
||||
let batch = secondary_by_tenant.entry(ttid.tenant_id).or_default();
|
||||
batch.push((ttid.timeline_id, candidate.layer));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let usage_planned = match warned {
|
||||
@@ -367,71 +393,116 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
};
|
||||
debug!(?usage_planned, "usage planned");
|
||||
|
||||
// phase2: evict victims batched by timeline
|
||||
// phase2 (secondary tenants): evict victims batched by tenant
|
||||
for (tenant_id, timeline_layers) in secondary_by_tenant {
|
||||
// Q: Why do we go via TenantManager again rather than just deleting files, or keeping
|
||||
// an Arc ref to the secondary state?
|
||||
// A: It's because a given tenant's local storage **belongs** to whoever is currently
|
||||
// live in the TenantManager. We must avoid a race where we might plan an eviction
|
||||
// for secondary, and then execute it when the tenant is actually in an attached state.
|
||||
tenant_manager
|
||||
.evict_tenant_layers(&tenant_id, timeline_layers)
|
||||
.instrument(tracing::info_span!("evict_batch", %tenant_id))
|
||||
.await;
|
||||
}
|
||||
|
||||
// phase2 (attached tenants): evict victims batched by timeline
|
||||
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
|
||||
// ratelimit to 1k files or any higher max batch size
|
||||
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
|
||||
|
||||
// After the loop, `usage_assumed` is the post-eviction usage,
|
||||
// according to internal accounting.
|
||||
let mut usage_assumed = usage_pre;
|
||||
let mut evictions_failed = LayerCount::default();
|
||||
for (timeline, batch) in batched {
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
let batch_size = batch.len();
|
||||
let batch_size =
|
||||
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
|
||||
|
||||
// I dislike naming of `available_permits` but it means current total amount of permits
|
||||
// because permits can be added
|
||||
assert!(batch_size as usize <= limit.available_permits());
|
||||
|
||||
debug!(%timeline_id, "evicting batch for timeline");
|
||||
|
||||
async {
|
||||
let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
|
||||
let evict = {
|
||||
let limit = limit.clone();
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
let mut evicted_bytes = 0;
|
||||
let mut evictions_failed = LayerCount::default();
|
||||
|
||||
match results {
|
||||
Err(e) => {
|
||||
warn!("failed to evict batch: {:#}", e);
|
||||
}
|
||||
Ok(results) => {
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
match result {
|
||||
Some(Ok(())) => {
|
||||
usage_assumed.add_available_bytes(file_size);
|
||||
}
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
|
||||
}
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
Some(Err(
|
||||
e @ EvictionError::LayerNotFound(_)
|
||||
| e @ EvictionError::StatFailed(_),
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(%layer, "failed to evict layer: {e}");
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
Some(Err(EvictionError::MetadataInconsistency(detail))) => {
|
||||
warn!(%layer, "failed to evict layer: {detail}");
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
None => {
|
||||
assert!(cancel.is_cancelled());
|
||||
return;
|
||||
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
|
||||
// semaphore closing means cancelled
|
||||
return (evicted_bytes, evictions_failed);
|
||||
};
|
||||
|
||||
let results = timeline.evict_layers(&batch, &cancel).await;
|
||||
|
||||
match results {
|
||||
Ok(results) => {
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
match result {
|
||||
Some(Ok(())) => {
|
||||
evicted_bytes += file_size;
|
||||
}
|
||||
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
None => {
|
||||
assert!(cancel.is_cancelled());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("failed to evict batch: {:#}", e);
|
||||
}
|
||||
}
|
||||
(evicted_bytes, evictions_failed)
|
||||
}
|
||||
}
|
||||
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
|
||||
.await;
|
||||
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
js.spawn(evict);
|
||||
|
||||
// spwaning multiple thousands of these is essentially blocking, so give already spawned a
|
||||
// chance of making progress
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
|
||||
let join_all = async move {
|
||||
// After the evictions, `usage_assumed` is the post-eviction usage,
|
||||
// according to internal accounting.
|
||||
let mut usage_assumed = usage_pre;
|
||||
let mut evictions_failed = LayerCount::default();
|
||||
|
||||
while let Some(res) = js.join_next().await {
|
||||
match res {
|
||||
Ok((evicted_bytes, failed)) => {
|
||||
usage_assumed.add_available_bytes(evicted_bytes);
|
||||
evictions_failed.file_sizes += failed.file_sizes;
|
||||
evictions_failed.count += failed.count;
|
||||
}
|
||||
Err(je) if je.is_cancelled() => unreachable!("not used"),
|
||||
Err(je) if je.is_panic() => { /* already logged */ }
|
||||
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
|
||||
}
|
||||
}
|
||||
(usage_assumed, evictions_failed)
|
||||
};
|
||||
|
||||
let (usage_assumed, evictions_failed) = tokio::select! {
|
||||
tuple = join_all => { tuple },
|
||||
_ = cancel.cancelled() => {
|
||||
// close the semaphore to stop any pending acquires
|
||||
limit.close();
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(IterationOutcome::Finished(IterationOutcomeFinished {
|
||||
before: usage_pre,
|
||||
@@ -443,10 +514,19 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
}))
|
||||
}
|
||||
|
||||
// An eviction candidate might originate from either an attached tenant
|
||||
// with a [`Tenant`] and [`Timeline`] object, or from a secondary tenant
|
||||
// location. These differ in how we will execute the eviction.
|
||||
#[derive(Clone)]
|
||||
enum EvictionCandidateSource {
|
||||
Attached(Arc<Timeline>),
|
||||
Secondary(TenantTimelineId),
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct EvictionCandidate {
|
||||
timeline: Arc<Timeline>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
source: EvictionCandidateSource,
|
||||
layer: Layer,
|
||||
last_activity_ts: SystemTime,
|
||||
}
|
||||
|
||||
@@ -495,27 +575,18 @@ enum EvictionCandidates {
|
||||
/// after exhauting the `Above` partition.
|
||||
/// So, we did not respect each tenant's min_resident_size.
|
||||
async fn collect_eviction_candidates(
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<EvictionCandidates> {
|
||||
// get a snapshot of the list of tenants
|
||||
let tenants = tenant::mgr::list_tenants()
|
||||
.await
|
||||
.context("get list of tenants")?;
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
|
||||
for (tenant_id, _state) in &tenants {
|
||||
let tenants = tenant_manager.get_attached_tenants();
|
||||
|
||||
for tenant in tenants {
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
debug!("failed to get tenant: {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// collect layers from all timelines in this tenant
|
||||
//
|
||||
@@ -578,7 +649,7 @@ async fn collect_eviction_candidates(
|
||||
for (timeline, layer_info) in tenant_candidates.into_iter() {
|
||||
let file_size = layer_info.file_size();
|
||||
let candidate = EvictionCandidate {
|
||||
timeline,
|
||||
source: EvictionCandidateSource::Attached(timeline),
|
||||
last_activity_ts: layer_info.last_activity_ts,
|
||||
layer: layer_info.layer,
|
||||
};
|
||||
@@ -592,6 +663,43 @@ async fn collect_eviction_candidates(
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: this is a long loop over all secondary locations. At the least, respect
|
||||
// cancellation here, but really we need to break up the loop. We could extract the
|
||||
// Arc<SecondaryTenant>s and iterate over them with some tokio yields in there. Ideally
|
||||
// though we should just reduce the total amount of work: our eviction goals do not require
|
||||
// listing absolutely every layer in every tenant: we could sample this.
|
||||
tenant_manager.foreach_secondary_tenants(
|
||||
|tenant_id: &TenantId, state: &Arc<SecondaryTenant>| {
|
||||
let mut tenant_candidates = Vec::new();
|
||||
for (timeline_id, layer_info) in state.get_layers_for_eviction() {
|
||||
debug!(tenant_id=%tenant_id, timeline_id=%timeline_id, "timeline resident layers (secondary) count: {}", layer_info.resident_layers.len());
|
||||
tenant_candidates.extend(
|
||||
layer_info.resident_layers
|
||||
.into_iter()
|
||||
.map(|layer_infos| (timeline_id, layer_infos)),
|
||||
);
|
||||
}
|
||||
|
||||
tenant_candidates
|
||||
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
|
||||
candidates.extend(tenant_candidates.into_iter().map(|(timeline_id, candidate)| {
|
||||
(
|
||||
// Secondary locations' layers are always considered above the min resident size,
|
||||
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
||||
// the layers have sufficiently old access times.
|
||||
MinResidentSizePartition::Above,
|
||||
EvictionCandidate {
|
||||
source: EvictionCandidateSource::Secondary(TenantTimelineId { tenant_id: *tenant_id, timeline_id}),
|
||||
last_activity_ts: candidate.last_activity_ts,
|
||||
layer: candidate.layer,
|
||||
}
|
||||
)
|
||||
}));
|
||||
|
||||
},
|
||||
);
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
candidates
|
||||
|
||||
@@ -392,13 +392,19 @@ paths:
|
||||
type: string
|
||||
format: date-time
|
||||
description: A timestamp to get the LSN
|
||||
- name: version
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
description: The version of the endpoint to use
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
@@ -1384,6 +1390,19 @@ components:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
LsnByTimestampResponse:
|
||||
type: object
|
||||
required:
|
||||
- lsn
|
||||
- kind
|
||||
properties:
|
||||
lsn:
|
||||
type: string
|
||||
format: hex
|
||||
kind:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -4,11 +4,12 @@
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use futures::TryFutureExt;
|
||||
use humantime::format_rfc3339;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
@@ -17,6 +18,7 @@ use pageserver_api::models::{
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -35,8 +37,10 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
use crate::tenant::mgr::{
|
||||
GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
|
||||
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||
};
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
@@ -62,22 +66,27 @@ use super::models::ConfigureFailpointsRequest;
|
||||
|
||||
pub struct State {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
}
|
||||
|
||||
impl State {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
|
||||
.iter()
|
||||
@@ -85,12 +94,14 @@ impl State {
|
||||
.collect::<Vec<_>>();
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_manager,
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_storage,
|
||||
broker_client,
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue_client,
|
||||
secondary_controller,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -146,28 +157,60 @@ impl From<PageReconstructError> for ApiError {
|
||||
impl From<TenantMapInsertError> for ApiError {
|
||||
fn from(tmie: TenantMapInsertError) -> ApiError {
|
||||
match tmie {
|
||||
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
|
||||
ApiError::ResourceUnavailable(format!("{tmie}").into())
|
||||
}
|
||||
TenantMapInsertError::TenantAlreadyExists(id, state) => {
|
||||
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
|
||||
}
|
||||
TenantMapInsertError::TenantExistsSecondary(id) => {
|
||||
ApiError::Conflict(format!("tenant {id} already exists as secondary"))
|
||||
}
|
||||
TenantMapInsertError::SlotError(e) => e.into(),
|
||||
TenantMapInsertError::SlotUpsertError(e) => e.into(),
|
||||
TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantSlotError> for ApiError {
|
||||
fn from(e: TenantSlotError) -> ApiError {
|
||||
use TenantSlotError::*;
|
||||
match e {
|
||||
NotFound(tenant_id) => {
|
||||
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
|
||||
}
|
||||
e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
|
||||
e @ Conflict(_) => ApiError::Conflict(format!("{e}")),
|
||||
InProgress => {
|
||||
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
|
||||
}
|
||||
MapState(e) => e.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantSlotUpsertError> for ApiError {
|
||||
fn from(e: TenantSlotUpsertError) -> ApiError {
|
||||
use TenantSlotUpsertError::*;
|
||||
match e {
|
||||
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
|
||||
MapState(e) => e.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantMapError> for ApiError {
|
||||
fn from(e: TenantMapError) -> ApiError {
|
||||
use TenantMapError::*;
|
||||
match e {
|
||||
StillInitializing | ShuttingDown => {
|
||||
ApiError::ResourceUnavailable(format!("{e}").into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantStateError> for ApiError {
|
||||
fn from(tse: TenantStateError) -> ApiError {
|
||||
match tse {
|
||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
TenantStateError::IsStopping(_) => {
|
||||
ApiError::ResourceUnavailable("Tenant is stopping".into())
|
||||
}
|
||||
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
|
||||
TenantStateError::SlotError(e) => e.into(),
|
||||
TenantStateError::SlotUpsertError(e) => e.into(),
|
||||
TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -242,6 +285,9 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
Get(g) => ApiError::from(g),
|
||||
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
|
||||
SlotError(e) => e.into(),
|
||||
SlotUpsertError(e) => e.into(),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
}
|
||||
@@ -368,7 +414,7 @@ async fn timeline_create_handler(
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
@@ -415,7 +461,7 @@ async fn timeline_list_handler(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let response_data = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
@@ -454,7 +500,7 @@ async fn timeline_detail_handler(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let timeline_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -484,6 +530,8 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let version: Option<u8> = parse_query_param(&request, "version")?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let timestamp_raw = must_get_query_param(&request, "timestamp")?;
|
||||
let timestamp = humantime::parse_rfc3339(×tamp_raw)
|
||||
@@ -495,13 +543,32 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
|
||||
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
json_response(StatusCode::OK, result)
|
||||
if version.unwrap_or(0) > 1 {
|
||||
#[serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct Result {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
}
|
||||
let (lsn, kind) = match result {
|
||||
LsnForTimestamp::Present(lsn) => (lsn, "present"),
|
||||
LsnForTimestamp::Future(lsn) => (lsn, "future"),
|
||||
LsnForTimestamp::Past(lsn) => (lsn, "past"),
|
||||
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
|
||||
};
|
||||
json_response(StatusCode::OK, Result { lsn, kind })
|
||||
} else {
|
||||
// FIXME: this is a temporary crutch not to break backwards compatibility
|
||||
// See https://github.com/neondatabase/neon/pull/5608
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_timestamp_of_lsn_handler(
|
||||
@@ -691,7 +758,7 @@ async fn tenant_status(
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, false)?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
@@ -754,7 +821,7 @@ async fn tenant_size_handler(
|
||||
let headers = request.headers();
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
@@ -767,6 +834,10 @@ async fn tenant_size_handler(
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut sizes = None;
|
||||
let accepts_html = headers
|
||||
.get(header::ACCEPT)
|
||||
.map(|v| v == "text/html")
|
||||
.unwrap_or_default();
|
||||
if !inputs_only.unwrap_or(false) {
|
||||
let storage_model = inputs
|
||||
.calculate_model()
|
||||
@@ -774,11 +845,11 @@ async fn tenant_size_handler(
|
||||
let size = storage_model.calculate();
|
||||
|
||||
// If request header expects html, return html
|
||||
if headers["Accept"] == "text/html" {
|
||||
if accepts_html {
|
||||
return synthetic_size_html_response(inputs, storage_model, size);
|
||||
}
|
||||
sizes = Some(size);
|
||||
} else if headers["Accept"] == "text/html" {
|
||||
} else if accepts_html {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"inputs_only parameter is incompatible with html output request"
|
||||
)));
|
||||
@@ -929,7 +1000,7 @@ fn synthetic_size_html_response(
|
||||
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
|
||||
let response = Response::builder()
|
||||
.status(status)
|
||||
.header(hyper::header::CONTENT_TYPE, "text/html")
|
||||
.header(header::CONTENT_TYPE, "text/html")
|
||||
.body(Body::from(data.as_bytes().to_vec()))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
@@ -1009,7 +1080,7 @@ async fn get_tenant_config_handler(
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, false)?;
|
||||
|
||||
let response = HashMap::from([
|
||||
(
|
||||
@@ -1053,6 +1124,9 @@ async fn put_tenant_location_config_handler(
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
||||
|
||||
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
|
||||
|
||||
let tenant_id = request_data.tenant_id;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
@@ -1068,7 +1142,7 @@ async fn put_tenant_location_config_handler(
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
TenantStateError::NotFound(_) => {
|
||||
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
|
||||
// This API is idempotent: a NotFound on a detach is fine.
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
@@ -1080,20 +1154,14 @@ async fn put_tenant_location_config_handler(
|
||||
let location_conf =
|
||||
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
mgr::upsert_location(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
location_conf,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
state.deletion_queue_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
||||
// principle we might have hit something like concurrent API calls to the same tenant,
|
||||
// which is not a 400 but a 409.
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
state
|
||||
.tenant_manager
|
||||
.upsert_location(tenant_id, location_conf, flush, &ctx)
|
||||
.await
|
||||
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
||||
// principle we might have hit something like concurrent API calls to the same tenant,
|
||||
// which is not a 400 but a 409.
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1106,7 +1174,6 @@ async fn handle_tenant_break(
|
||||
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
||||
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test".to_owned()).await;
|
||||
@@ -1179,7 +1246,7 @@ async fn timeline_compact_handler(
|
||||
timeline
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
|
||||
@@ -1204,7 +1271,7 @@ async fn timeline_checkpoint_handler(
|
||||
timeline
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1310,7 +1377,7 @@ async fn getpage_at_lsn_handler(
|
||||
Result::<_, ApiError>::Ok(
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE, "application/octet-stream")
|
||||
.header(header::CONTENT_TYPE, "application/octet-stream")
|
||||
.body(hyper::Body::from(page))
|
||||
.unwrap(),
|
||||
)
|
||||
@@ -1411,7 +1478,7 @@ async fn active_timeline_of_active_tenant(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, ApiError> {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))
|
||||
@@ -1474,17 +1541,18 @@ async fn disk_usage_eviction_run(
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let Some(storage) = state.remote_storage.clone() else {
|
||||
if state.remote_storage.as_ref().is_none() {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)));
|
||||
};
|
||||
}
|
||||
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
let eviction_state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let child_cancel = cancel.clone();
|
||||
let _g = cancel.drop_guard();
|
||||
let tenant_manager = state.tenant_manager.clone();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
@@ -1495,9 +1563,9 @@ async fn disk_usage_eviction_run(
|
||||
false,
|
||||
async move {
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&state,
|
||||
&storage,
|
||||
&eviction_state,
|
||||
usage,
|
||||
&tenant_manager,
|
||||
&child_cancel,
|
||||
)
|
||||
.await;
|
||||
@@ -1515,6 +1583,36 @@ async fn disk_usage_eviction_run(
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
async fn secondary_download_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
state
|
||||
.secondary_controller
|
||||
.download_tenant(tenant_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn secondary_upload_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
state
|
||||
.secondary_controller
|
||||
.upload_tenant(tenant_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -1751,6 +1849,16 @@ pub fn make_router(
|
||||
.put("/v1/deletion_queue/flush", |r| {
|
||||
api_handler(r, deletion_queue_flush)
|
||||
})
|
||||
.post("/v1/secondary/:tenant_id/upload", |r| {
|
||||
testing_api_handler("force heatmap upload", r, secondary_upload_handler)
|
||||
})
|
||||
.post("/v1/secondary/:tenant_id/download", |r| {
|
||||
testing_api_handler(
|
||||
"force secondary layer download",
|
||||
r,
|
||||
secondary_download_handler,
|
||||
)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
|
||||
@@ -149,6 +149,10 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
|
||||
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
|
||||
// from the name.
|
||||
|
||||
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
@@ -1388,28 +1388,23 @@ impl TimelineMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_new_file_metrics(&self, sz: u64) {
|
||||
pub(crate) fn record_new_file_metrics(&self, sz: u64) {
|
||||
self.resident_physical_size_add(sz);
|
||||
self.num_persistent_files_created.inc_by(1);
|
||||
self.persistent_bytes_written.inc_by(sz);
|
||||
}
|
||||
|
||||
pub fn resident_physical_size_sub(&self, sz: u64) {
|
||||
pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
|
||||
self.resident_physical_size_gauge.sub(sz);
|
||||
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
|
||||
}
|
||||
|
||||
pub fn resident_physical_size_add(&self, sz: u64) {
|
||||
pub(crate) fn resident_physical_size_add(&self, sz: u64) {
|
||||
self.resident_physical_size_gauge.add(sz);
|
||||
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
|
||||
}
|
||||
|
||||
pub fn resident_physical_size_set(&self, sz: u64) {
|
||||
self.resident_physical_size_gauge.set(sz);
|
||||
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
|
||||
}
|
||||
|
||||
pub fn resident_physical_size_get(&self) -> u64 {
|
||||
pub(crate) fn resident_physical_size_get(&self) -> u64 {
|
||||
self.resident_physical_size_gauge.get()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1314,7 +1314,7 @@ async fn get_active_tenant_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
|
||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
||||
let tenant = match mgr::get_tenant(tenant_id, false).await {
|
||||
let tenant = match mgr::get_tenant(tenant_id, false) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
|
||||
Err(GetTenantError::NotActive(_)) => {
|
||||
|
||||
@@ -675,8 +675,9 @@ impl Timeline {
|
||||
|
||||
result.add_key(CONTROLFILE_KEY);
|
||||
result.add_key(CHECKPOINT_KEY);
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
|
||||
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
}
|
||||
Ok(result.to_keyspace())
|
||||
}
|
||||
|
||||
@@ -1201,7 +1202,8 @@ impl<'a> DatadirModification<'a> {
|
||||
let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(buf) => AuxFilesDirectory::des(&buf)?,
|
||||
Err(e) => {
|
||||
warn!("Failed to get info about AUX files: {}", e);
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
}
|
||||
|
||||
@@ -257,6 +257,12 @@ pub enum TaskKind {
|
||||
/// See [`crate::disk_usage_eviction_task`].
|
||||
DiskUsageEviction,
|
||||
|
||||
/// See [`crate::tenant::secondary`].
|
||||
SecondaryDownloads,
|
||||
|
||||
/// See [`crate::tenant::secondary`].
|
||||
SecondaryUploads,
|
||||
|
||||
// Initial logical size calculation
|
||||
InitialLogicalSizeCalculation,
|
||||
|
||||
|
||||
@@ -63,6 +63,7 @@ use self::timeline::TimelineResources;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::deletion_queue::DeletionQueueError;
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::TENANT_ACTIVATION;
|
||||
@@ -130,6 +131,7 @@ pub mod storage_layer;
|
||||
pub mod config;
|
||||
pub mod delete;
|
||||
pub mod mgr;
|
||||
pub mod secondary;
|
||||
pub mod tasks;
|
||||
pub mod upload_queue;
|
||||
|
||||
@@ -138,9 +140,7 @@ pub(crate) mod timeline;
|
||||
pub mod size;
|
||||
|
||||
pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
pub use timeline::{
|
||||
LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
|
||||
};
|
||||
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
|
||||
|
||||
// re-export for use in remote_timeline_client.rs
|
||||
pub use crate::tenant::metadata::save_metadata;
|
||||
@@ -194,6 +194,45 @@ struct TimelinePreload {
|
||||
index_part: Result<MaybeDeletedIndexPart, DownloadError>,
|
||||
}
|
||||
|
||||
/// To include the HeatmapWriter in tenant shutdown, we provide a hook
|
||||
/// for it to publish a barrier when upload is going on. We will take
|
||||
/// this and wait on it during shutdown, ensuring that there is no
|
||||
/// upload going on once shutdown() returns.
|
||||
pub struct HeatmapHook {
|
||||
// Mutually exclude shutdown and any in-flight uploads
|
||||
//
|
||||
// If this is None, we are shutting down
|
||||
in_progress: Option<Arc<tokio::sync::Mutex<()>>>,
|
||||
}
|
||||
|
||||
impl Default for HeatmapHook {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
in_progress: Some(Arc::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl HeatmapHook {
|
||||
pub(crate) fn enter(&self) -> Option<tokio::sync::OwnedMutexGuard<()>> {
|
||||
self.in_progress.as_ref().map(|l| {
|
||||
l.clone()
|
||||
.try_lock_owned()
|
||||
// expect: shutdown cannot have started yet or in_progress would have been None,
|
||||
// so we expect that only one HeatmapWriter may take this lock at once.
|
||||
// Depends on the invariant that HeatmapWriter is the only thing that calls
|
||||
// enter(), and that it will never try and do uploads concurrently for the same
|
||||
// tenant.
|
||||
.expect("Tried to double-lock HeatmapHook")
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a lock that the caller should wait on before proceeding with shutdown
|
||||
fn shutdown(&mut self) -> Arc<tokio::sync::Mutex<()>> {
|
||||
self.in_progress.take().expect("Called shutdown twice")
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Tenant consists of multiple timelines. Keep them in a hash table.
|
||||
///
|
||||
@@ -245,6 +284,16 @@ pub struct Tenant {
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
|
||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||
|
||||
pub(crate) heatmap_hook: Mutex<HeatmapHook>,
|
||||
|
||||
pub(crate) cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Tenant {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{} ({})", self.tenant_id, self.current_state())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum WalRedoManager {
|
||||
@@ -525,7 +574,7 @@ impl Tenant {
|
||||
tenant_id: TenantId,
|
||||
resources: TenantSharedResources,
|
||||
attached_conf: AttachedTenantConf,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
expect_marker: AttachMarkerMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
@@ -892,7 +941,7 @@ impl Tenant {
|
||||
attached_conf: AttachedTenantConf,
|
||||
resources: TenantSharedResources,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Arc<Tenant> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
@@ -1057,8 +1106,8 @@ impl Tenant {
|
||||
TimelineId::try_from(timeline_uninit_mark_file.file_stem())
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
|
||||
)
|
||||
"Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
|
||||
)
|
||||
})?;
|
||||
let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id);
|
||||
if let Err(e) =
|
||||
@@ -1824,6 +1873,11 @@ impl Tenant {
|
||||
pitr: Duration,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<GcResult> {
|
||||
// Don't start doing work during shutdown
|
||||
if let TenantState::Stopping { .. } = self.current_state() {
|
||||
return Ok(GcResult::default());
|
||||
}
|
||||
|
||||
// there is a global allowed_error for this
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
@@ -1852,6 +1906,12 @@ impl Tenant {
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Don't start doing work during shutdown
|
||||
if let TenantState::Stopping { .. } = self.current_state() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// We should only be called once the tenant has activated.
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot run compaction iteration on inactive tenant"
|
||||
@@ -2014,6 +2074,8 @@ impl Tenant {
|
||||
// It's mesed up.
|
||||
// we just ignore the failure to stop
|
||||
|
||||
tracing::debug!("shutting down...");
|
||||
self.cancel.cancel();
|
||||
match self.set_stopping(shutdown_progress, false, false).await {
|
||||
Ok(()) => {}
|
||||
Err(SetStoppingError::Broken) => {
|
||||
@@ -2021,6 +2083,7 @@ impl Tenant {
|
||||
}
|
||||
Err(SetStoppingError::AlreadyStopping(other)) => {
|
||||
// give caller the option to wait for this this shutdown
|
||||
info!("Tenant::shutdown: AlreadyStopping");
|
||||
return Err(other);
|
||||
}
|
||||
};
|
||||
@@ -2034,6 +2097,7 @@ impl Tenant {
|
||||
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
|
||||
})
|
||||
};
|
||||
tracing::debug!("shutdown waiting for timelines...");
|
||||
while let Some(res) = js.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
@@ -2043,12 +2107,23 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let heatmap_hook_lock = {
|
||||
let mut hook = self.heatmap_hook.lock().unwrap();
|
||||
hook.shutdown()
|
||||
};
|
||||
tracing::debug!("shutdown waiting heatmap uploads...");
|
||||
// Take & drop lock to ensure any heatmap upload is complete.
|
||||
drop(heatmap_hook_lock.lock().await);
|
||||
|
||||
tracing::debug!("shutdown waiting for tasks...");
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service
|
||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
||||
//
|
||||
// this will additionally shutdown and await all timeline tasks.
|
||||
task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;
|
||||
|
||||
tracing::debug!("shutdown complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2298,6 +2373,9 @@ where
|
||||
}
|
||||
|
||||
impl Tenant {
|
||||
pub fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
|
||||
self.tenant_conf.read().unwrap().tenant_conf
|
||||
}
|
||||
@@ -2544,6 +2622,8 @@ impl Tenant {
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
heatmap_hook: Mutex::default(),
|
||||
cancel: CancellationToken::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3353,6 +3433,30 @@ impl Tenant {
|
||||
pub fn cached_synthetic_size(&self) -> u64 {
|
||||
self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Flush any in-progress layers, schedule uploads, and wait for uploads to complete.
|
||||
///
|
||||
/// This function can take a long time: callers should wrap it in a timeout if calling
|
||||
/// from an external API handler.
|
||||
pub async fn flush_remote(&self) -> anyhow::Result<()> {
|
||||
let timelines = self.timelines.lock().unwrap().clone();
|
||||
|
||||
for (timeline_id, timeline) in timelines {
|
||||
tracing::info!(%timeline_id, "Flushing...");
|
||||
timeline.freeze_and_flush().await?;
|
||||
tracing::info!(%timeline_id, "Waiting for uploads...");
|
||||
if let Some(client) = &timeline.remote_client {
|
||||
client.wait_completion().await?;
|
||||
}
|
||||
}
|
||||
|
||||
match self.deletion_queue_client.flush_execute().await {
|
||||
Ok(_) => {}
|
||||
Err(DeletionQueueError::ShuttingDown) => {}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_timeline_and_uninit_mark(
|
||||
@@ -4328,6 +4432,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_layer_dumping() -> anyhow::Result<()> {
|
||||
use storage_layer::AsLayerDesc;
|
||||
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
@@ -4335,16 +4440,18 @@ mod tests {
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
|
||||
|
||||
let layer_map = tline.layers.read().await;
|
||||
let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
|
||||
let level0_deltas = layer_map
|
||||
.layer_map()
|
||||
.get_level0_deltas()?
|
||||
.into_iter()
|
||||
.map(|desc| layer_map.get_from_desc(&desc))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert!(!level0_deltas.is_empty());
|
||||
|
||||
for delta in level0_deltas {
|
||||
let delta = layer_map.get_from_desc(&delta);
|
||||
// Ensure we are dumping a delta layer here
|
||||
let delta = delta.downcast_delta_layer().unwrap();
|
||||
|
||||
delta.dump(false, &ctx).await.unwrap();
|
||||
assert!(delta.layer_desc().is_delta);
|
||||
delta.dump(true, &ctx).await.unwrap();
|
||||
}
|
||||
|
||||
@@ -4379,7 +4486,7 @@ mod tests {
|
||||
metadata_bytes[8] ^= 1;
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let err = harness.try_load(&ctx).await.err().expect("should fail");
|
||||
let err = harness.try_load(&ctx).await.expect_err("should fail");
|
||||
// get all the stack with all .context, not only the last one
|
||||
let message = format!("{err:#}");
|
||||
let expected = "failed to load metadata";
|
||||
|
||||
@@ -21,7 +21,7 @@ use crate::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantsMap},
|
||||
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
@@ -35,12 +35,21 @@ pub(crate) enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
Get(#[from] GetTenantError),
|
||||
|
||||
#[error("Tenant not attached")]
|
||||
NotAttached,
|
||||
|
||||
#[error("Invalid state {0}. Expected Active or Broken")]
|
||||
InvalidState(TenantState),
|
||||
|
||||
#[error("Tenant deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error("Tenant map slot error {0}")]
|
||||
SlotError(#[from] TenantSlotError),
|
||||
|
||||
#[error("Tenant map slot upsert error {0}")]
|
||||
SlotUpsertError(#[from] TenantSlotUpsertError),
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
@@ -301,12 +310,12 @@ impl DeleteTenantFlow {
|
||||
pub(crate) async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
@@ -411,7 +420,7 @@ impl DeleteTenantFlow {
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
@@ -448,7 +457,7 @@ impl DeleteTenantFlow {
|
||||
pub(crate) async fn resume_from_attach(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
@@ -474,15 +483,8 @@ impl DeleteTenantFlow {
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
|
||||
let m = tenants.read().await;
|
||||
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.ok_or(GetTenantError::NotFound(tenant_id))?;
|
||||
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
||||
// so at least for now allow deletions only for active tenants. TODO recheck
|
||||
// Broken and Stopping is needed for retries.
|
||||
@@ -516,14 +518,14 @@ impl DeleteTenantFlow {
|
||||
)));
|
||||
}
|
||||
|
||||
Ok((Arc::clone(tenant), guard))
|
||||
Ok(guard)
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
@@ -556,7 +558,7 @@ impl DeleteTenantFlow {
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
||||
@@ -604,7 +606,7 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
let mut locked = tenants.write().await;
|
||||
let mut locked = tenants.write().unwrap();
|
||||
if locked.remove(&tenant.tenant_id).is_none() {
|
||||
warn!("Tenant got removed from tenants map during deletion");
|
||||
};
|
||||
|
||||
@@ -639,147 +639,10 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
println!("historic_layers:");
|
||||
for layer in self.iter_historic_layers() {
|
||||
layer.dump(verbose, ctx)?;
|
||||
for desc in self.iter_historic_layers() {
|
||||
desc.dump();
|
||||
}
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::LayerMap;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
mod l0_delta_layers_updated {
|
||||
|
||||
use crate::tenant::{
|
||||
storage_layer::{AsLayerDesc, PersistentLayerDesc},
|
||||
timeline::layer_manager::LayerFileManager,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
struct LayerObject(PersistentLayerDesc);
|
||||
|
||||
impl AsLayerDesc for LayerObject {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerObject {
|
||||
fn new(desc: PersistentLayerDesc) -> Self {
|
||||
LayerObject(desc)
|
||||
}
|
||||
}
|
||||
|
||||
type TestLayerFileManager = LayerFileManager<LayerObject>;
|
||||
|
||||
#[test]
|
||||
fn for_full_range_delta() {
|
||||
// l0_delta_layers are used by compaction, and should observe all buffered updates
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
|
||||
true
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn for_non_full_range_delta() {
|
||||
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
|
||||
// because not full range
|
||||
false
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn for_image() {
|
||||
l0_delta_layers_updated_scenario(
|
||||
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
|
||||
// code only checks if it is a full range layer, doesn't care about images, which must
|
||||
// mean we should in practice never have full range images
|
||||
false
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn replacing_missing_l0_is_notfound() {
|
||||
// original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
|
||||
// however only happen for precondition failures.
|
||||
|
||||
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
|
||||
let layer = LayerFileName::from_str(layer).unwrap();
|
||||
let layer = PersistentLayerDesc::from(layer);
|
||||
|
||||
// same skeletan construction; see scenario below
|
||||
let not_found = Arc::new(LayerObject::new(layer.clone()));
|
||||
let new_version = Arc::new(LayerObject::new(layer));
|
||||
|
||||
// after the immutable storage state refactor, the replace operation
|
||||
// will not use layer map any more. We keep it here for consistency in test cases
|
||||
// and can remove it in the future.
|
||||
let _map = LayerMap::default();
|
||||
|
||||
let mut mapping = TestLayerFileManager::new();
|
||||
|
||||
mapping
|
||||
.replace_and_verify(not_found, new_version)
|
||||
.unwrap_err();
|
||||
}
|
||||
|
||||
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
|
||||
let name = LayerFileName::from_str(layer_name).unwrap();
|
||||
let skeleton = PersistentLayerDesc::from(name);
|
||||
|
||||
let remote = Arc::new(LayerObject::new(skeleton.clone()));
|
||||
let downloaded = Arc::new(LayerObject::new(skeleton));
|
||||
|
||||
let mut map = LayerMap::default();
|
||||
let mut mapping = LayerFileManager::new();
|
||||
|
||||
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
|
||||
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
|
||||
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
|
||||
|
||||
let expected_in_counts = (1, usize::from(expected_l0));
|
||||
|
||||
map.batch_update()
|
||||
.insert_historic(remote.layer_desc().clone());
|
||||
mapping.insert(remote.clone());
|
||||
assert_eq!(
|
||||
count_layer_in(&map, remote.layer_desc()),
|
||||
expected_in_counts
|
||||
);
|
||||
|
||||
mapping
|
||||
.replace_and_verify(remote, downloaded.clone())
|
||||
.expect("name derived attributes are the same");
|
||||
assert_eq!(
|
||||
count_layer_in(&map, downloaded.layer_desc()),
|
||||
expected_in_counts
|
||||
);
|
||||
|
||||
map.batch_update().remove_historic(downloaded.layer_desc());
|
||||
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
|
||||
}
|
||||
|
||||
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
|
||||
let historic = map
|
||||
.iter_historic_layers()
|
||||
.filter(|x| x.key() == layer.key())
|
||||
.count();
|
||||
let l0s = map
|
||||
.get_level0_deltas()
|
||||
.expect("why does this return a result");
|
||||
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
|
||||
|
||||
(historic, l0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -57,8 +57,7 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
|
||||
fsync_in_thread_pool(paths)
|
||||
}
|
||||
|
||||
/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
|
||||
/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
|
||||
/// Parallel fsync asynchronously.
|
||||
pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
|
||||
const MAX_CONCURRENT_FSYNC: usize = 64;
|
||||
let mut next = paths.iter().peekable();
|
||||
|
||||
@@ -167,8 +167,6 @@
|
||||
//! - download their remote [`IndexPart`]s
|
||||
//! - create `Timeline` struct and a `RemoteTimelineClient`
|
||||
//! - initialize the client's upload queue with its `IndexPart`
|
||||
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
|
||||
//! for layers that are referenced by `IndexPart` but not present locally
|
||||
//! - schedule uploads for layers that are only present locally.
|
||||
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
||||
//! the local filesystem, write the remote metadata to the local filesystem
|
||||
@@ -204,15 +202,14 @@
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
|
||||
|
||||
mod download;
|
||||
pub(crate) mod download;
|
||||
pub mod index;
|
||||
mod upload;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff::{
|
||||
@@ -237,7 +234,7 @@ use crate::metrics::{
|
||||
};
|
||||
use crate::task_mgr::shutdown_token;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::AsLayerDesc;
|
||||
use crate::tenant::upload_queue::Delete;
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::{
|
||||
@@ -255,10 +252,13 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use self::index::IndexPart;
|
||||
|
||||
use super::storage_layer::LayerFileName;
|
||||
use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
|
||||
use super::upload_queue::SetDeletedFlagProgress;
|
||||
use super::Generation;
|
||||
|
||||
pub(crate) use download::{is_temp_download_file, list_remote_timelines};
|
||||
pub(crate) use index::LayerFileMetadata;
|
||||
|
||||
// Occasional network issues and such can cause remote operations to fail, and
|
||||
// that's expected. If a download fails, we log it at info-level, and retry.
|
||||
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
||||
@@ -627,101 +627,181 @@ impl RemoteTimelineClient {
|
||||
///
|
||||
/// Launch an upload operation in the background.
|
||||
///
|
||||
pub fn schedule_layer_file_upload(
|
||||
pub(crate) fn schedule_layer_file_upload(
|
||||
self: &Arc<Self>,
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
layer: ResidentLayer,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.insert(layer_file_name.clone(), layer_metadata.clone());
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
|
||||
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
|
||||
info!("scheduled layer file upload {layer_file_name}");
|
||||
|
||||
// Launch the task immediately, if possible
|
||||
self.schedule_layer_file_upload0(upload_queue, layer);
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn schedule_layer_file_upload0(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
layer: ResidentLayer,
|
||||
) {
|
||||
let metadata = layer.metadata();
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.insert(layer.layer_desc().filename(), metadata.clone());
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
|
||||
info!("scheduled layer file upload {layer}");
|
||||
let op = UploadOp::UploadLayer(layer, metadata);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
|
||||
/// Launch a delete operation in the background.
|
||||
///
|
||||
/// The operation does not modify local state but assumes the local files have already been
|
||||
/// deleted, and is used to mirror those changes to remote.
|
||||
/// The operation does not modify local filesystem state.
|
||||
///
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// deletion won't actually be performed, until all previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// successfully.
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: Vec<LayerFileName>,
|
||||
names: &[LayerFileName],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
let with_generations =
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
|
||||
|
||||
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
|
||||
|
||||
// Launch the tasks immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
|
||||
/// layer files, leaving them dangling.
|
||||
///
|
||||
/// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
|
||||
/// is invoked on them.
|
||||
pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// just forget the return value; after uploading the next index_part.json, we can consider
|
||||
// the layer files as "dangling". this is fine, at worst case we create work for the
|
||||
// scrubber.
|
||||
|
||||
let names = gc_layers.iter().map(|x| x.layer_desc().filename());
|
||||
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
|
||||
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update the remote index file, removing the to-be-deleted files from the index,
|
||||
/// allowing scheduling of actual deletions later.
|
||||
fn schedule_unlinking_of_layers_from_index_part0<I>(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
names: I,
|
||||
) -> Vec<(LayerFileName, Generation)>
|
||||
where
|
||||
I: IntoIterator<Item = LayerFileName>,
|
||||
{
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
// so we don't need update it. Just serialize it.
|
||||
let metadata = upload_queue.latest_metadata.clone();
|
||||
|
||||
// Update the remote index file, removing the to-be-deleted files from the index,
|
||||
// before deleting the actual files.
|
||||
//
|
||||
// Once we start removing files from upload_queue.latest_files, there's
|
||||
// no going back! Otherwise, some of the files would already be removed
|
||||
// from latest_files, but not yet scheduled for deletion. Use a closure
|
||||
// to syntactically forbid ? or bail! calls here.
|
||||
let no_bail_here = || {
|
||||
// Decorate our list of names with each name's generation, dropping
|
||||
// makes that are unexpectedly missing from our metadata.
|
||||
let with_generations: Vec<_> = names
|
||||
.into_iter()
|
||||
.filter_map(|name| {
|
||||
// Remove from latest_files, learning the file's remote generation in the process
|
||||
let meta = upload_queue.latest_files.remove(&name);
|
||||
// Decorate our list of names with each name's generation, dropping
|
||||
// names that are unexpectedly missing from our metadata.
|
||||
let with_generations: Vec<_> = names
|
||||
.into_iter()
|
||||
.filter_map(|name| {
|
||||
let meta = upload_queue.latest_files.remove(&name);
|
||||
|
||||
if let Some(meta) = meta {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
Some((name, meta.generation))
|
||||
} else {
|
||||
// This can only happen if we forgot to to schedule the file upload
|
||||
// before scheduling the delete. Log it because it is a rare/strange
|
||||
// situation, and in case something is misbehaving, we'd like to know which
|
||||
// layers experienced this.
|
||||
info!(
|
||||
"Deleting layer {name} not found in latest_files list, never uploaded?"
|
||||
);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
if let Some(meta) = meta {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
Some((name, meta.generation))
|
||||
} else {
|
||||
// This can only happen if we forgot to to schedule the file upload
|
||||
// before scheduling the delete. Log it because it is a rare/strange
|
||||
// situation, and in case something is misbehaving, we'd like to know which
|
||||
// layers experienced this.
|
||||
info!("Deleting layer {name} not found in latest_files list, never uploaded?");
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue, metadata);
|
||||
}
|
||||
// after unlinking files from the upload_queue.latest_files we must always schedule an
|
||||
// index_part update, because that needs to be uploaded before we can actually delete the
|
||||
// files.
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue, metadata);
|
||||
}
|
||||
|
||||
for (name, gen) in &with_generations {
|
||||
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
|
||||
}
|
||||
with_generations
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
let op = UploadOp::Delete(Delete {
|
||||
layers: with_generations,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
/// Schedules deletion for layer files which have previously been unlinked from the
|
||||
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
|
||||
pub(crate) fn schedule_deletion_of_unlinked(
|
||||
self: &Arc<Self>,
|
||||
layers: Vec<(LayerFileName, Generation)>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
self.schedule_deletion_of_unlinked0(upload_queue, layers);
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn schedule_deletion_of_unlinked0(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
with_generations: Vec<(LayerFileName, Generation)>,
|
||||
) {
|
||||
for (name, gen) in &with_generations {
|
||||
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
let op = UploadOp::Delete(Delete {
|
||||
layers: with_generations,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
|
||||
/// Schedules a compaction update to the remote `index_part.json`.
|
||||
///
|
||||
/// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers.
|
||||
pub(crate) fn schedule_compaction_update(
|
||||
self: &Arc<Self>,
|
||||
compacted_from: &[Layer],
|
||||
compacted_to: &[ResidentLayer],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
for layer in compacted_to {
|
||||
self.schedule_layer_file_upload0(upload_queue, layer.clone());
|
||||
}
|
||||
|
||||
let names = compacted_from.iter().map(|x| x.layer_desc().filename());
|
||||
|
||||
let with_generations =
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
|
||||
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
|
||||
// Launch the tasks immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
};
|
||||
no_bail_here();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1093,16 +1173,12 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
|
||||
let path = self
|
||||
.conf
|
||||
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||
.join(layer_file_name.file_name());
|
||||
|
||||
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
|
||||
let path = layer.local_path();
|
||||
upload::upload_timeline_layer(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&path,
|
||||
path,
|
||||
layer_metadata,
|
||||
self.generation,
|
||||
)
|
||||
@@ -1420,11 +1496,21 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_layer_metadata(
|
||||
pub(crate) fn get_layers_metadata(
|
||||
&self,
|
||||
name: &LayerFileName,
|
||||
) -> anyhow::Result<Option<LayerFileMetadata>> {
|
||||
self.upload_queue.lock().unwrap().get_layer_metadata(name)
|
||||
layers: Vec<LayerFileName>,
|
||||
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
|
||||
let q = self.upload_queue.lock().unwrap();
|
||||
let q = match &*q {
|
||||
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
|
||||
anyhow::bail!("queue is in state {}", q.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(inner) => inner,
|
||||
};
|
||||
|
||||
let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
|
||||
|
||||
Ok(decorated.collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1466,6 +1552,13 @@ pub fn remote_index_path(
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub const HEATMAP_BASENAME: &str = "heatmap";
|
||||
|
||||
pub fn remote_heatmap_path(tenant_id: &TenantId) -> RemotePath {
|
||||
RemotePath::from_string(&format!("tenants/{tenant_id}/{HEATMAP_BASENAME}-v01"))
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
/// Given the key of an index, parse out the generation part of the name
|
||||
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
|
||||
let file_name = match path.get_path().file_name() {
|
||||
@@ -1513,6 +1606,7 @@ mod tests {
|
||||
context::RequestContext,
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::Layer,
|
||||
Generation, Tenant, Timeline,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
@@ -1681,32 +1775,29 @@ mod tests {
|
||||
let generation = harness.generation;
|
||||
|
||||
// Create a couple of dummy files, schedule upload for them
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
|
||||
let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
|
||||
let content_1 = dummy_contents("foo");
|
||||
let content_2 = dummy_contents("bar");
|
||||
let content_3 = dummy_contents("baz");
|
||||
|
||||
for (filename, content) in [
|
||||
(&layer_file_name_1, &content_1),
|
||||
(&layer_file_name_2, &content_2),
|
||||
(&layer_file_name_3, &content_3),
|
||||
] {
|
||||
std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
|
||||
}
|
||||
let layers = [
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
|
||||
]
|
||||
.into_iter()
|
||||
.map(|(name, contents): (LayerFileName, Vec<u8>)| {
|
||||
std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
|
||||
|
||||
Layer::for_resident(
|
||||
harness.conf,
|
||||
&timeline,
|
||||
name,
|
||||
LayerFileMetadata::new(contents.len() as u64, generation),
|
||||
)
|
||||
}).collect::<Vec<_>>();
|
||||
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64, generation),
|
||||
)
|
||||
.schedule_layer_file_upload(layers[0].clone())
|
||||
.unwrap();
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_2,
|
||||
&LayerFileMetadata::new(content_2.len() as u64, generation),
|
||||
)
|
||||
.schedule_layer_file_upload(layers[1].clone())
|
||||
.unwrap();
|
||||
|
||||
// Check that they are started immediately, not queued
|
||||
@@ -1760,38 +1851,42 @@ mod tests {
|
||||
.collect(),
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
&layers[0].layer_desc().filename().file_name(),
|
||||
&layers[1].layer_desc().filename().file_name(),
|
||||
],
|
||||
);
|
||||
assert_eq!(index_part.metadata, metadata);
|
||||
|
||||
// Schedule upload and then a deletion. Check that the deletion is queued
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_3,
|
||||
&LayerFileMetadata::new(content_3.len() as u64, generation),
|
||||
)
|
||||
.schedule_layer_file_upload(layers[2].clone())
|
||||
.unwrap();
|
||||
|
||||
// this is no longer consistent with how deletion works with Layer::drop, but in this test
|
||||
// keep using schedule_layer_file_deletion because we don't have a way to wait for the
|
||||
// spawn_blocking started by the drop.
|
||||
client
|
||||
.schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
|
||||
.schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
|
||||
.unwrap();
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
|
||||
// Deletion schedules upload of the index file, and the file deletion itself
|
||||
assert!(upload_queue.queued_operations.len() == 2);
|
||||
assert!(upload_queue.inprogress_tasks.len() == 1);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 1);
|
||||
assert!(upload_queue.num_inprogress_deletions == 0);
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
|
||||
assert_eq!(upload_queue.queued_operations.len(), 2);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions, 0);
|
||||
assert_eq!(
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
0
|
||||
);
|
||||
}
|
||||
assert_remote_files(
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
&layers[0].layer_desc().filename().file_name(),
|
||||
&layers[1].layer_desc().filename().file_name(),
|
||||
"index_part.json",
|
||||
],
|
||||
&remote_timeline_dir,
|
||||
@@ -1805,8 +1900,8 @@ mod tests {
|
||||
assert_remote_files(
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
&layer_file_name_3.file_name(),
|
||||
&layers[1].layer_desc().filename().file_name(),
|
||||
&layers[2].layer_desc().filename().file_name(),
|
||||
"index_part.json",
|
||||
],
|
||||
&remote_timeline_dir,
|
||||
@@ -1835,6 +1930,13 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let layer_file_1 = Layer::for_resident(
|
||||
harness.conf,
|
||||
&timeline,
|
||||
layer_file_name_1.clone(),
|
||||
LayerFileMetadata::new(content_1.len() as u64, harness.generation),
|
||||
);
|
||||
|
||||
#[derive(Debug, PartialEq, Clone, Copy)]
|
||||
struct BytesStartedFinished {
|
||||
started: Option<usize>,
|
||||
@@ -1870,10 +1972,7 @@ mod tests {
|
||||
let actual_a = get_bytes_started_stopped();
|
||||
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64, harness.generation),
|
||||
)
|
||||
.schedule_layer_file_upload(layer_file_1.clone())
|
||||
.unwrap();
|
||||
|
||||
let actual_b = get_bytes_started_stopped();
|
||||
|
||||
@@ -60,6 +60,8 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
bail!("failpoint before-upload-layer")
|
||||
});
|
||||
|
||||
pausable_failpoint!("before-upload-layer-pausable");
|
||||
|
||||
let storage_path = remote_path(conf, source_path, generation)?;
|
||||
let source_file_res = fs::File::open(&source_path).await;
|
||||
let source_file = match source_file_res {
|
||||
@@ -70,6 +72,8 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
// upload. However, a nonexistent file can also be indicative of
|
||||
// something worse, like when a file is scheduled for upload before
|
||||
// it has been written to disk yet.
|
||||
//
|
||||
// This is tested against `test_compaction_delete_before_upload`
|
||||
info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
268
pageserver/src/tenant/secondary.rs
Normal file
268
pageserver/src/tenant/secondary.rs
Normal file
@@ -0,0 +1,268 @@
|
||||
pub mod downloader;
|
||||
pub mod heatmap;
|
||||
pub mod heatmap_writer;
|
||||
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
};
|
||||
|
||||
use self::{
|
||||
downloader::{downloader_task, SecondaryDetail},
|
||||
heatmap_writer::heatmap_writer_task,
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::TenantManager,
|
||||
storage_layer::{AsLayerDesc, Layer},
|
||||
timeline::DiskUsageEvictionInfo,
|
||||
};
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
completion::Barrier,
|
||||
fs_ext,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
enum DownloadCommand {
|
||||
Download(TenantId),
|
||||
}
|
||||
enum UploadCommand {
|
||||
Upload(TenantId),
|
||||
}
|
||||
|
||||
struct CommandRequest<T> {
|
||||
payload: T,
|
||||
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
|
||||
}
|
||||
|
||||
struct CommandResponse {
|
||||
result: anyhow::Result<()>,
|
||||
}
|
||||
|
||||
// Whereas [`Tenant`] represents an attached tenant, this type represents the work
|
||||
// we do for secondary tenant locations: where we are not serving clients or
|
||||
// ingesting WAL, but we are maintaining a warm cache of layer files.
|
||||
//
|
||||
// This type is all about the _download_ path for secondary mode. The upload path
|
||||
// runs while a regular attached `Tenant` exists.
|
||||
//
|
||||
// This structure coordinates TenantManager and SecondaryDownloader,
|
||||
// so that the downloader can indicate which tenants it is currently
|
||||
// operating on, and the manager can indicate when a particular
|
||||
// secondary tenant should cancel any work in flight.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SecondaryTenant {
|
||||
/// Cancellation token indicates to SecondaryDownloader that it should stop doing
|
||||
/// any work for this tenant at the next opportunity.
|
||||
pub(crate) cancel: CancellationToken,
|
||||
|
||||
/// Lock must be held by SecondaryDownloader at any time that it might be operating
|
||||
/// on the local filesystem directory for this tenant ID.
|
||||
// Ordering: the TenantManager must set the cancellation token _before_
|
||||
// taking the lock. The SecondaryDownloader must always check the cancellation
|
||||
// token immediately _after_ taking the lock (and at appropriate intervals
|
||||
// while holding it).
|
||||
pub(crate) busy: Arc<tokio::sync::Mutex<()>>,
|
||||
|
||||
detail: std::sync::Mutex<SecondaryDetail>,
|
||||
// TODO: propagate the `warm` from LocationConf into here, and respect it when doing downloads
|
||||
}
|
||||
|
||||
impl SecondaryTenant {
|
||||
pub(crate) fn new() -> Arc<Self> {
|
||||
// TODO; consider whether we really need to Arc this
|
||||
Arc::new(Self {
|
||||
busy: Arc::new(tokio::sync::Mutex::new(())),
|
||||
// todo: shall we make this a descendent of the
|
||||
// main cancellation token, or is it sufficient that
|
||||
// on shutdown we walk the tenants and fire their
|
||||
// individual cancellations?
|
||||
cancel: CancellationToken::new(),
|
||||
|
||||
detail: std::sync::Mutex::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
self.cancel.cancel();
|
||||
|
||||
// Wait for any secondary downloader work to complete: once we
|
||||
// acquire this lock, we are guaranteed that the secondary downloader
|
||||
// won't touch the local filesystem again for this instance: it is safe
|
||||
// to e.g. construct a `Tenant` for the same TenantId
|
||||
drop(self.busy.lock().await);
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_for_eviction(&self) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
|
||||
self.detail.lock().unwrap().get_layers_for_eviction()
|
||||
}
|
||||
|
||||
pub(crate) async fn evict_layers(
|
||||
&self,
|
||||
_guard: tokio::sync::OwnedMutexGuard<()>,
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
layers: Vec<(TimelineId, Layer)>,
|
||||
) {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
// Eviction is a no-op if shutdown() was already called.
|
||||
tracing::info!(
|
||||
"Dropping {} layer evictions, secondary tenant shutting down",
|
||||
layers.len()
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
let now = SystemTime::now();
|
||||
|
||||
for (timeline_id, layer) in layers {
|
||||
let layer_name = layer.layer_desc().filename();
|
||||
let path = conf
|
||||
.timeline_path(tenant_id, &timeline_id)
|
||||
.join(&layer_name.file_name());
|
||||
|
||||
// We tolerate ENOENT, because between planning eviction and executing
|
||||
// it, the secondary downloader could have seen an updated heatmap that
|
||||
// resulted in a layer being deleted.
|
||||
tokio::fs::remove_file(path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.expect("TODO: terminate process on local I/O errors");
|
||||
|
||||
// TODO: batch up updates instead of acquiring lock in inner loop
|
||||
let mut detail = self.detail.lock().unwrap();
|
||||
// If there is no timeline detail for what we just deleted, that indicates that
|
||||
// the secondary downloader did some work (perhaps removing all)
|
||||
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
|
||||
timeline_detail.on_disk_layers.remove(&layer_name);
|
||||
timeline_detail.evicted_at.insert(layer_name, now);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
|
||||
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
|
||||
/// where we want to immediately upload/download for a particular tenant. In normal operation
|
||||
/// uploads & downloads are autonomous and not driven by this interface.
|
||||
pub struct SecondaryController {
|
||||
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
|
||||
|
||||
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
|
||||
}
|
||||
|
||||
impl SecondaryController {
|
||||
async fn dispatch<T>(
|
||||
&self,
|
||||
queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
|
||||
payload: T,
|
||||
) -> anyhow::Result<()> {
|
||||
let (response_tx, response_rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
queue
|
||||
.send(CommandRequest {
|
||||
payload,
|
||||
response_tx,
|
||||
})
|
||||
.await
|
||||
.map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
|
||||
|
||||
let response = response_rx
|
||||
.await
|
||||
.map_err(|_| anyhow::anyhow!("Request dropped"))?;
|
||||
|
||||
response.result
|
||||
}
|
||||
|
||||
pub async fn download_tenant(&self, tenant_id: TenantId) -> anyhow::Result<()> {
|
||||
self.dispatch(&self.download_req_tx, DownloadCommand::Download(tenant_id))
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn upload_tenant(&self, tenant_id: TenantId) -> anyhow::Result<()> {
|
||||
self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_id))
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn_tasks(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
) -> SecondaryController {
|
||||
let mgr_clone = tenant_manager.clone();
|
||||
let storage_clone = remote_storage.clone();
|
||||
let cancel_clone = cancel.clone();
|
||||
let bg_jobs_clone = background_jobs_can_start.clone();
|
||||
|
||||
let (download_req_tx, download_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
|
||||
let (upload_req_tx, upload_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::SecondaryDownloads,
|
||||
None,
|
||||
None,
|
||||
"secondary tenant downloads",
|
||||
false,
|
||||
async move {
|
||||
downloader_task(
|
||||
conf,
|
||||
mgr_clone,
|
||||
storage_clone,
|
||||
download_req_rx,
|
||||
bg_jobs_clone,
|
||||
cancel_clone,
|
||||
)
|
||||
.await
|
||||
},
|
||||
);
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::SecondaryDownloads,
|
||||
None,
|
||||
None,
|
||||
"heatmap uploads",
|
||||
false,
|
||||
async move {
|
||||
heatmap_writer_task(
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
upload_req_rx,
|
||||
background_jobs_can_start,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
},
|
||||
);
|
||||
|
||||
SecondaryController {
|
||||
download_req_tx,
|
||||
upload_req_tx,
|
||||
}
|
||||
}
|
||||
|
||||
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
|
||||
pub fn null_controller() -> SecondaryController {
|
||||
let (download_req_tx, _download_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
|
||||
let (upload_req_tx, _upload_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
|
||||
SecondaryController {
|
||||
upload_req_tx,
|
||||
download_req_tx,
|
||||
}
|
||||
}
|
||||
579
pageserver/src/tenant/secondary/downloader.rs
Normal file
579
pageserver/src/tenant/secondary/downloader.rs
Normal file
@@ -0,0 +1,579 @@
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant, SystemTime},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::{
|
||||
remote_timeline_client::index::LayerFileMetadata,
|
||||
secondary::CommandResponse,
|
||||
storage_layer::{Layer, LayerFileName},
|
||||
timeline::{DiskUsageEvictionInfo, LocalLayerInfoForDiskUsageEviction},
|
||||
},
|
||||
METADATA_FILE_NAME,
|
||||
};
|
||||
|
||||
use super::SecondaryTenant;
|
||||
use crate::tenant::{
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
|
||||
};
|
||||
use anyhow::Context;
|
||||
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{
|
||||
completion::Barrier,
|
||||
fs_ext,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
use super::{
|
||||
heatmap::{HeatMapTenant, HeatMapTimeline},
|
||||
CommandRequest, DownloadCommand,
|
||||
};
|
||||
|
||||
/// Interval between checking if any Secondary tenants have download work to do:
|
||||
/// note that this is _not_ the frequency with which we actually freshen the tenants,
|
||||
/// just the frequency with which we wake up to decide whether anyone needs freshening.
|
||||
///
|
||||
/// Making this somewhat infrequent reduces the load on mutexes inside TenantManager
|
||||
/// and SecondaryTenant for reads when checking for work to do.
|
||||
const DOWNLOAD_CHECK_INTERVAL: Duration = Duration::from_millis(10000);
|
||||
|
||||
/// For each tenant, how long must have passed since the last freshen_tenant call before
|
||||
/// calling it again. This is approximately the time by which local data is allowed
|
||||
/// to fall behind remote data.
|
||||
///
|
||||
/// TODO: this should be an upper bound, and tenants that are uploading regularly
|
||||
/// should adaptively freshen more often (e.g. a tenant writing 1 layer per second
|
||||
/// should not wait a minute between freshens)
|
||||
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(super) struct OnDiskState {
|
||||
layer: Layer,
|
||||
access_time: SystemTime,
|
||||
}
|
||||
|
||||
impl OnDiskState {
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
name: LayerFileName,
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
) -> Self {
|
||||
Self {
|
||||
layer: Layer::for_secondary(conf, tenant_id, timeline_id, name, metadata),
|
||||
access_time,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub(super) struct SecondaryDetailTimeline {
|
||||
pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
|
||||
|
||||
/// We remember when layers were evicted, to prevent re-downloading them.
|
||||
/// TODO: persist this, so that we don't try and re-download everything on restart.
|
||||
pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
|
||||
}
|
||||
|
||||
/// This state is written by the secondary downloader, it is opaque
|
||||
/// to TenantManager
|
||||
#[derive(Default, Debug)]
|
||||
pub(super) struct SecondaryDetail {
|
||||
freshened_at: Option<Instant>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
|
||||
/// Helper for logging SystemTime
|
||||
fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
|
||||
let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
|
||||
datetime.format("%d/%m/%Y %T")
|
||||
}
|
||||
|
||||
impl SecondaryDetail {
|
||||
pub(super) fn get_layers_for_eviction(&self) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
|
||||
let mut result = Vec::new();
|
||||
for (timeline_id, timeline_detail) in &self.timelines {
|
||||
let layers: Vec<_> = timeline_detail
|
||||
.on_disk_layers
|
||||
.values()
|
||||
.map(|ods| LocalLayerInfoForDiskUsageEviction {
|
||||
layer: ods.layer.clone(),
|
||||
last_activity_ts: ods.access_time,
|
||||
})
|
||||
.collect();
|
||||
|
||||
let max_layer_size = layers.iter().map(|l| l.layer.metadata().file_size()).max();
|
||||
|
||||
result.push((
|
||||
*timeline_id,
|
||||
DiskUsageEvictionInfo {
|
||||
resident_layers: layers,
|
||||
max_layer_size,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
/// Keep trying to do downloads until the cancellation token is fired. Remote storage
|
||||
/// errors are handled internally: any error returned by this function is an unexpected
|
||||
/// internal error of some kind.
|
||||
pub(super) async fn downloader_task(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let downloader = SecondaryDownloader {
|
||||
conf,
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
cancel: cancel.clone(),
|
||||
};
|
||||
|
||||
tracing::info!("Waiting for background_jobs_can start...");
|
||||
background_jobs_can_start.wait().await;
|
||||
tracing::info!("background_jobs_can is ready, proceeding.");
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
downloader.iteration().await?;
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("Heatmap writer terminating");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(DOWNLOAD_CHECK_INTERVAL) => {},
|
||||
cmd = command_queue.recv() => {
|
||||
let cmd = match cmd {
|
||||
Some(c) =>c,
|
||||
None => {
|
||||
// SecondaryController was destroyed, and this has raced with
|
||||
// our CancellationToken
|
||||
tracing::info!("Heatmap writer terminating");
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
let CommandRequest{
|
||||
response_tx,
|
||||
payload
|
||||
} = cmd;
|
||||
let result = downloader.handle_command(payload).await;
|
||||
if response_tx.send(CommandResponse{result}).is_err() {
|
||||
// Caller went away, e.g. because an HTTP request timed out
|
||||
tracing::info!("Dropping response to administrative command")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
struct SecondaryDownloader {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
struct TenantJob {
|
||||
tenant_id: TenantId,
|
||||
secondary_state: Arc<SecondaryTenant>,
|
||||
|
||||
// This mutex guard conveys the right to write to the tenant's local directory: it must
|
||||
// be taken before doing downloads, and TenantManager must ensure it has been released
|
||||
// before it considers shutdown complete for the secondary state -- [`SecondaryDownloader`]
|
||||
// will thereby never be racing with [`Tenant`] for access to local files.
|
||||
_guard: tokio::sync::OwnedMutexGuard<()>,
|
||||
}
|
||||
|
||||
impl SecondaryDownloader {
|
||||
async fn iteration(&self) -> anyhow::Result<()> {
|
||||
// Step 1: identify some tenants that we may work on
|
||||
let mut candidates: Vec<TenantJob> = Vec::new();
|
||||
self.tenant_manager
|
||||
.foreach_secondary_tenants(|tenant_id, secondary_state| {
|
||||
let guard = match secondary_state.busy.clone().try_lock_owned() {
|
||||
Ok(guard) => guard,
|
||||
// If we can't lock, someone is in the process of shutting it down, or we are
|
||||
// already working on it. We may ignore it when scanning for new work to do.
|
||||
Err(_) => return,
|
||||
};
|
||||
|
||||
candidates.push(TenantJob {
|
||||
tenant_id: *tenant_id,
|
||||
secondary_state: secondary_state.clone(),
|
||||
_guard: guard,
|
||||
});
|
||||
});
|
||||
|
||||
// Step 2: prioritized selection of next batch of tenants to freshen
|
||||
let now = Instant::now();
|
||||
let candidates = candidates.into_iter().filter(|c| {
|
||||
let detail = c.secondary_state.detail.lock().unwrap();
|
||||
match detail.freshened_at {
|
||||
None => true, // Not yet freshened, therefore elegible to run
|
||||
Some(t) => {
|
||||
let since = now.duration_since(t);
|
||||
since > DOWNLOAD_FRESHEN_INTERVAL
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// TODO: don't just cut down the list, prioritize it to freshen the stalest tenants first
|
||||
// TODO: bounded parallelism
|
||||
|
||||
// Step 3: spawn freshen_tenant tasks
|
||||
for job in candidates {
|
||||
if job.secondary_state.cancel.is_cancelled() {
|
||||
continue;
|
||||
}
|
||||
|
||||
async {
|
||||
if let Err(e) = self.freshen_tenant(&job).await {
|
||||
tracing::info!("Failed to freshen secondary content: {e:#}")
|
||||
};
|
||||
|
||||
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
|
||||
// take priority to run again.
|
||||
let mut detail = job.secondary_state.detail.lock().unwrap();
|
||||
detail.freshened_at = Some(Instant::now());
|
||||
}
|
||||
.instrument(tracing::info_span!(
|
||||
"freshen_tenant",
|
||||
tenant_id = %job.tenant_id
|
||||
))
|
||||
.await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_command(&self, command: DownloadCommand) -> anyhow::Result<()> {
|
||||
match command {
|
||||
DownloadCommand::Download(req_tenant_id) => {
|
||||
let mut candidates: Vec<TenantJob> = Vec::new();
|
||||
self.tenant_manager
|
||||
.foreach_secondary_tenants(|tenant_id, secondary_state| {
|
||||
tracing::info!("foreach_secondary: {tenant_id} ({req_tenant_id})");
|
||||
if tenant_id == &req_tenant_id {
|
||||
let guard = match secondary_state.busy.clone().try_lock_owned() {
|
||||
Ok(guard) => guard,
|
||||
// If we can't lock, someone is in the process of shutting it down, or we are
|
||||
// already working on it. We may ignore it when scanning for new work to do.
|
||||
Err(_) => return,
|
||||
};
|
||||
|
||||
candidates.push(TenantJob {
|
||||
tenant_id: *tenant_id,
|
||||
secondary_state: secondary_state.clone(),
|
||||
_guard: guard,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
let tenant_job = if candidates.len() != 1 {
|
||||
anyhow::bail!("Tenant not found in secondary mode");
|
||||
} else {
|
||||
candidates.pop().unwrap()
|
||||
};
|
||||
|
||||
self.freshen_tenant(&tenant_job).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_heatmap(&self, tenant_id: &TenantId) -> anyhow::Result<HeatMapTenant> {
|
||||
// TODO: make download conditional on ETag having changed since last download
|
||||
|
||||
let heatmap_path = remote_heatmap_path(tenant_id);
|
||||
// TODO: wrap this download in a select! that checks self.cancel
|
||||
let mut download = self.remote_storage.download(&heatmap_path).await?;
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let _size = tokio::io::copy(&mut download.download_stream, &mut heatmap_bytes)
|
||||
.await
|
||||
.with_context(|| format!("download heatmap {heatmap_path:?}"))?;
|
||||
|
||||
Ok(serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?)
|
||||
}
|
||||
|
||||
async fn init_timeline_state(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
heatmap: &HeatMapTimeline,
|
||||
) -> anyhow::Result<SecondaryDetailTimeline> {
|
||||
let timeline_path = self.conf.timeline_path(tenant_id, timeline_id);
|
||||
let mut detail = SecondaryDetailTimeline::default();
|
||||
|
||||
let mut dir = match tokio::fs::read_dir(&timeline_path).await {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
tracing::info!("Creating timeline directory {timeline_path}");
|
||||
tokio::fs::create_dir(&timeline_path).await?;
|
||||
|
||||
// No entries to report: drop out.
|
||||
return Ok(detail);
|
||||
} else {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let heatmap_metadata: HashMap<_, _> = heatmap.layers.iter().map(|l| (&l.name, l)).collect();
|
||||
|
||||
while let Some(dentry) = dir.next_entry().await? {
|
||||
let dentry_file_name = dentry.file_name();
|
||||
let file_name = dentry_file_name.to_string_lossy();
|
||||
let local_meta = dentry.metadata().await?;
|
||||
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
if file_name == METADATA_FILE_NAME {
|
||||
continue;
|
||||
}
|
||||
|
||||
match LayerFileName::from_str(&file_name) {
|
||||
Ok(name) => {
|
||||
let remote_meta = heatmap_metadata.get(&name);
|
||||
match remote_meta {
|
||||
Some(remote_meta) => {
|
||||
// TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
|
||||
if local_meta.len() != remote_meta.metadata.file_size {
|
||||
// This should not happen, because we do crashsafe write-then-rename when downloading
|
||||
// layers, and layers in remote storage are immutable. Remove the local file because
|
||||
// we cannot trust it.
|
||||
tracing::warn!("Removing local layer {name} with unexpected local size {} != {}",
|
||||
local_meta.len(), remote_meta.metadata.file_size);
|
||||
} else {
|
||||
// We expect the access time to be initialized immediately afterwards, when
|
||||
// the latest heatmap is applied to the state.
|
||||
detail.on_disk_layers.insert(
|
||||
name.clone(),
|
||||
OnDiskState::new(
|
||||
self.conf,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
name,
|
||||
LayerFileMetadata::from(&remote_meta.metadata),
|
||||
remote_meta.access_time,
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// FIXME: consider some optimization when transitioning from attached to secondary: maybe
|
||||
// wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise
|
||||
// we will end up deleting any layers which were created+uploaded more recently than the heatmap.
|
||||
tracing::info!(
|
||||
"Removing secondary local layer {} because it's absent in heatmap",
|
||||
name
|
||||
);
|
||||
tokio::fs::remove_file(dentry.path()).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
// Ignore it.
|
||||
tracing::warn!("Unexpected file in timeline directory: {file_name}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(detail)
|
||||
}
|
||||
|
||||
async fn freshen_timeline(
|
||||
&self,
|
||||
job: &TenantJob,
|
||||
timeline: HeatMapTimeline,
|
||||
) -> anyhow::Result<()> {
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(&job.tenant_id, &timeline.timeline_id);
|
||||
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
// Clone a view of what layers already exist on disk
|
||||
let timeline_state = job
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
|
||||
let timeline_state = match timeline_state {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
self.init_timeline_state(&job.tenant_id, &timeline.timeline_id, &timeline)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
let layers_in_heatmap = timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
let local_path = timeline_path.join(layer.to_string());
|
||||
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)?;
|
||||
}
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Existing on-disk layers: just update their access time.
|
||||
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
|
||||
if on_disk.layer.metadata() != LayerFileMetadata::from(&layer.metadata)
|
||||
|| on_disk.access_time != layer.access_time
|
||||
{
|
||||
// We already have this layer on disk. Update its access time.
|
||||
tracing::trace!(
|
||||
"Access time updated for layer {}: {} -> {}",
|
||||
layer.name,
|
||||
strftime(&on_disk.access_time),
|
||||
strftime(&layer.access_time)
|
||||
);
|
||||
touched.push(layer);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
|
||||
// recently than it was evicted.
|
||||
if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
|
||||
if &layer.access_time > evicted_at {
|
||||
tracing::info!(
|
||||
"Re-downloading evicted layer {}, accessed at {}, evicted at {}",
|
||||
layer.name,
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
} else {
|
||||
tracing::trace!(
|
||||
"Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
|
||||
layer.name,
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
match download_layer_file(
|
||||
self.conf,
|
||||
&self.remote_storage,
|
||||
job.tenant_id,
|
||||
timeline.timeline_id,
|
||||
&layer.name,
|
||||
&LayerFileMetadata::from(&layer.metadata),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(downloaded_bytes) => {
|
||||
if downloaded_bytes != layer.metadata.file_size {
|
||||
let local_path = timeline_path.join(layer.name.to_string());
|
||||
|
||||
tracing::error!(
|
||||
"Downloaded layer {} with unexpected size {} != {}",
|
||||
layer.name,
|
||||
downloaded_bytes,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)?;
|
||||
}
|
||||
|
||||
touched.push(layer)
|
||||
}
|
||||
Err(e) => {
|
||||
// No retries here: secondary downloads don't have to succeed: if they fail we just proceed and expect
|
||||
// that on some future call to freshen the download will work.
|
||||
// TODO: refine this behavior.
|
||||
tracing::info!("Failed to download layer {}: {}", layer.name, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write updates to state to record layers we just downloaded or touched.
|
||||
{
|
||||
let mut detail = job.secondary_state.detail.lock().unwrap();
|
||||
let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
|
||||
|
||||
for t in touched {
|
||||
use std::collections::hash_map::Entry;
|
||||
match timeline_detail.on_disk_layers.entry(t.name.clone()) {
|
||||
Entry::Occupied(mut v) => {
|
||||
v.get_mut().access_time = t.access_time;
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
e.insert(OnDiskState::new(
|
||||
self.conf,
|
||||
&job.tenant_id,
|
||||
&timeline.timeline_id,
|
||||
t.name,
|
||||
LayerFileMetadata::from(&t.metadata),
|
||||
t.access_time,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn freshen_tenant(&self, job: &TenantJob) -> anyhow::Result<()> {
|
||||
// Download the tenant's heatmap
|
||||
let heatmap = self.download_heatmap(&job.tenant_id).await?;
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.freshen_timeline(job, timeline).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
57
pageserver/src/tenant/secondary/heatmap.rs
Normal file
57
pageserver/src/tenant/secondary/heatmap.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use std::time::SystemTime;
|
||||
|
||||
use crate::tenant::{
|
||||
remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
|
||||
};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(super) struct HeatMapTenant {
|
||||
pub(super) timelines: Vec<HeatMapTimeline>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(super) name: LayerFileName,
|
||||
pub(super) metadata: IndexLayerMetadata,
|
||||
|
||||
pub(super) access_time: SystemTime,
|
||||
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
|
||||
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
|
||||
}
|
||||
|
||||
impl HeatMapLayer {
|
||||
pub(crate) fn new(
|
||||
name: LayerFileName,
|
||||
metadata: IndexLayerMetadata,
|
||||
access_time: SystemTime,
|
||||
) -> Self {
|
||||
Self {
|
||||
name,
|
||||
metadata,
|
||||
access_time,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(super) timeline_id: TimelineId,
|
||||
|
||||
pub(super) layers: Vec<HeatMapLayer>,
|
||||
}
|
||||
|
||||
impl HeatMapTimeline {
|
||||
pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
|
||||
Self {
|
||||
timeline_id,
|
||||
layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
207
pageserver/src/tenant/secondary/heatmap_writer.rs
Normal file
207
pageserver/src/tenant/secondary/heatmap_writer.rs
Normal file
@@ -0,0 +1,207 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use crate::tenant::{
|
||||
mgr::TenantManager, remote_timeline_client::remote_heatmap_path, secondary::CommandResponse,
|
||||
Tenant,
|
||||
};
|
||||
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{backoff, completion::Barrier};
|
||||
|
||||
use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
|
||||
|
||||
const HEATMAP_UPLOAD_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
|
||||
pub(super) async fn heatmap_writer_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let writer = HeatmapWriter {
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
cancel: cancel.clone(),
|
||||
};
|
||||
|
||||
tracing::info!("Waiting for background_jobs_can start...");
|
||||
background_jobs_can_start.wait().await;
|
||||
tracing::info!("background_jobs_can is ready, proceeding.");
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
writer.iteration().await?;
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("Heatmap writer terminating");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(HEATMAP_UPLOAD_INTERVAL) => {},
|
||||
cmd = command_queue.recv() => {
|
||||
let cmd = match cmd {
|
||||
Some(c) =>c,
|
||||
None => {
|
||||
// SecondaryController was destroyed, and this has raced with
|
||||
// our CancellationToken
|
||||
tracing::info!("Heatmap writer terminating");
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
let CommandRequest{
|
||||
response_tx,
|
||||
payload
|
||||
} = cmd;
|
||||
let result = writer.handle_command(payload).await;
|
||||
if response_tx.send(CommandResponse{result}).is_err() {
|
||||
// Caller went away, e.g. because an HTTP request timed out
|
||||
tracing::info!("Dropping response to administrative command")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct HeatmapWriter {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl HeatmapWriter {
|
||||
async fn iteration(&self) -> anyhow::Result<()> {
|
||||
let tenants = self.tenant_manager.get_attached_tenants();
|
||||
|
||||
for tenant in tenants {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if tenant.current_state() != TenantState::Active {
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: add a mechanism to check whether the active layer set has
|
||||
// changed since our last write
|
||||
|
||||
// TODO: add a minimum time between uploads
|
||||
|
||||
match self
|
||||
.write_tenant(&tenant)
|
||||
.instrument(tracing::info_span!(
|
||||
"write_tenant",
|
||||
tenant_id = %tenant.get_tenant_id()
|
||||
))
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Failed to upload heatmap for tenant {}: {e:#}",
|
||||
tenant.get_tenant_id(),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_command(&self, command: UploadCommand) -> anyhow::Result<()> {
|
||||
match command {
|
||||
UploadCommand::Upload(tenant_id) => {
|
||||
let tenants = self.tenant_manager.get_attached_tenants();
|
||||
|
||||
let map = tenants
|
||||
.iter()
|
||||
.map(|t| (t.get_tenant_id(), t))
|
||||
.collect::<HashMap<_, _>>();
|
||||
match map.get(&tenant_id) {
|
||||
Some(tenant) => self.write_tenant(tenant).await,
|
||||
None => {
|
||||
anyhow::bail!("Tenant is not attached");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn write_tenant(&self, tenant: &Arc<Tenant>) -> anyhow::Result<()> {
|
||||
let mut heatmap = HeatMapTenant {
|
||||
timelines: Vec::new(),
|
||||
};
|
||||
let timelines = tenant.timelines.lock().unwrap().clone();
|
||||
|
||||
let tenant_cancel = tenant.cancel.clone();
|
||||
|
||||
// Ensure that Tenant::shutdown waits for any upload in flight
|
||||
let _guard = {
|
||||
let hook = tenant.heatmap_hook.lock().unwrap();
|
||||
match hook.enter() {
|
||||
Some(g) => g,
|
||||
None => {
|
||||
// Tenant is shutting down
|
||||
tracing::info!("Skipping, tenant is shutting down");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (timeline_id, timeline) in timelines {
|
||||
let heatmap_timeline = timeline.generate_heatmap().await;
|
||||
match heatmap_timeline {
|
||||
None => {
|
||||
tracing::debug!(
|
||||
"Skipping heatmap upload because timeline {timeline_id} is not ready"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Some(heatmap_timeline) => {
|
||||
heatmap.timelines.push(heatmap_timeline);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize the heatmap
|
||||
let bytes = serde_json::to_vec(&heatmap)?;
|
||||
let size = bytes.len();
|
||||
|
||||
let path = remote_heatmap_path(&tenant.get_tenant_id());
|
||||
|
||||
// Write the heatmap.
|
||||
tracing::debug!("Uploading {size} byte heatmap to {path}");
|
||||
if let Err(e) = backoff::retry(
|
||||
|| async {
|
||||
let bytes = tokio::io::BufReader::new(std::io::Cursor::new(bytes.clone()));
|
||||
let bytes = Box::new(bytes);
|
||||
self.remote_storage
|
||||
.upload_storage_object(bytes, size, &path)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"Uploading heatmap",
|
||||
backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
|
||||
)
|
||||
.await
|
||||
{
|
||||
if tenant_cancel.is_cancelled() {
|
||||
return Ok(());
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Successfully uploading {size} byte heatmap to {path}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -4,26 +4,21 @@ pub mod delta_layer;
|
||||
mod filename;
|
||||
mod image_layer;
|
||||
mod inmemory_layer;
|
||||
mod layer;
|
||||
mod layer_desc;
|
||||
mod remote_layer;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Key;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::models::{
|
||||
HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use std::ops::Range;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::Mutex;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use tracing::warn;
|
||||
use utils::history_buffer::HistoryBufferWithDropCounter;
|
||||
@@ -39,7 +34,8 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
pub use inmemory_layer::InMemoryLayer;
|
||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
pub use remote_layer::RemoteLayer;
|
||||
|
||||
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
@@ -74,7 +70,7 @@ pub struct ValueReconstructState {
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
}
|
||||
|
||||
/// Return value from Layer::get_page_reconstruct_data
|
||||
/// Return value from [`Layer::get_value_reconstruct_data`]
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum ValueReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
@@ -179,26 +175,6 @@ impl LayerAccessStats {
|
||||
new
|
||||
}
|
||||
|
||||
/// Creates a clone of `self` and records `new_status` in the clone.
|
||||
///
|
||||
/// The `new_status` is not recorded in `self`.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn clone_for_residence_change(
|
||||
&self,
|
||||
new_status: LayerResidenceStatus,
|
||||
) -> LayerAccessStats {
|
||||
let clone = {
|
||||
let inner = self.0.lock().unwrap();
|
||||
inner.clone()
|
||||
};
|
||||
let new = LayerAccessStats(Mutex::new(clone));
|
||||
new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
|
||||
new
|
||||
}
|
||||
|
||||
/// Record a change in layer residency.
|
||||
///
|
||||
/// Recording the event must happen while holding the layer map lock to
|
||||
@@ -321,95 +297,12 @@ impl LayerAccessStats {
|
||||
}
|
||||
}
|
||||
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
/// required by [`LayerMap`](super::layer_map::LayerMap).
|
||||
///
|
||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||
/// timeline names, because those are known in the context of which the layers
|
||||
/// are used in (timeline).
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
///
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
///
|
||||
/// It is up to the caller to collect more data from previous layer and
|
||||
/// perform WAL redo, if necessary.
|
||||
///
|
||||
/// See PageReconstructResult for possible return values. The collected data
|
||||
/// is appended to reconstruct_data; the caller should pass an empty struct
|
||||
/// on first call, or a struct with a cached older image of the page if one
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
}
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
pub trait AsLayerDesc {
|
||||
/// Get the layer descriptor.
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc;
|
||||
}
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
/// layers are used to ingest incoming WAL, and provide fast access to the
|
||||
/// recent page versions. On-disk layers are stored as files on disk, and are
|
||||
/// immutable. This trait presents the common functionality of in-memory and
|
||||
/// on-disk layers.
|
||||
///
|
||||
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
|
||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN.
|
||||
pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
/// File name used for this layer, both in the pageserver's local filesystem
|
||||
/// state as well as in the remote storage.
|
||||
fn filename(&self) -> LayerFileName {
|
||||
self.layer_desc().filename()
|
||||
}
|
||||
|
||||
// Path to the layer file in the local filesystem.
|
||||
// `None` for `RemoteLayer`.
|
||||
fn local_path(&self) -> Option<Utf8PathBuf>;
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete_resident_layer_file(&self) -> Result<()>;
|
||||
|
||||
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
None
|
||||
}
|
||||
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
None
|
||||
}
|
||||
|
||||
fn is_remote_layer(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats;
|
||||
}
|
||||
|
||||
pub fn downcast_remote_layer(
|
||||
layer: &Arc<dyn PersistentLayer>,
|
||||
) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
if layer.is_remote_layer() {
|
||||
Arc::clone(layer).downcast_remote_layer()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -447,19 +340,6 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper enum to hold a PageServerConf, or a path
|
||||
///
|
||||
/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
|
||||
/// global config, and paths to layer files are constructed using the tenant/timeline
|
||||
/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
|
||||
/// struct for a file on disk, without having a page server running, so that we have no
|
||||
/// config. In that case, we use the Path variant to hold the full path to the file on
|
||||
/// disk.
|
||||
enum PathOrConf {
|
||||
Path(Utf8PathBuf),
|
||||
Conf(&'static PageServerConf),
|
||||
}
|
||||
|
||||
/// Range wrapping newtype, which uses display to render Debug.
|
||||
///
|
||||
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
|
||||
|
||||
@@ -34,18 +34,17 @@ use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
PersistentLayer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::{self, File};
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
@@ -59,10 +58,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
|
||||
PersistentLayerDesc,
|
||||
};
|
||||
use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -182,20 +178,12 @@ impl DeltaKey {
|
||||
}
|
||||
}
|
||||
|
||||
/// DeltaLayer is the in-memory data structure associated with an on-disk delta
|
||||
/// file.
|
||||
///
|
||||
/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
|
||||
/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
|
||||
/// Otherwise the struct is just a placeholder for a file that exists on disk,
|
||||
/// and it needs to be loaded before using it in queries.
|
||||
/// This is used only from `pagectl`. Within pageserver, all layers are
|
||||
/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
|
||||
pub struct DeltaLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
|
||||
path: Utf8PathBuf,
|
||||
pub desc: PersistentLayerDesc,
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: OnceCell<Arc<DeltaLayerInner>>,
|
||||
}
|
||||
|
||||
@@ -212,6 +200,8 @@ impl std::fmt::Debug for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
|
||||
/// file.
|
||||
pub struct DeltaLayerInner {
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
@@ -221,12 +211,6 @@ pub struct DeltaLayerInner {
|
||||
file: FileBlockReader,
|
||||
}
|
||||
|
||||
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
|
||||
fn as_ref(&self) -> &DeltaLayerInner {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("DeltaLayerInner")
|
||||
@@ -236,19 +220,6 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
@@ -262,40 +233,9 @@ impl AsLayerDesc for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<Utf8PathBuf> {
|
||||
self.local_path()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
self.delete_resident_layer_file()
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.info(reset)
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
self.access_stats()
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.desc.lsn_range.start,
|
||||
self.desc.lsn_range.end,
|
||||
self.desc.file_size,
|
||||
);
|
||||
self.desc.dump();
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
@@ -303,119 +243,7 @@ impl DeltaLayer {
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
inner.index_start_blk, inner.index_root_blk
|
||||
);
|
||||
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
}
|
||||
|
||||
for entry in keys {
|
||||
let DeltaEntry { key, lsn, val, .. } = entry;
|
||||
let desc = match dump_blob(val, ctx).await {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => {
|
||||
let err: anyhow::Error = err;
|
||||
format!("ERROR: {err}")
|
||||
}
|
||||
};
|
||||
println!(" key {key} at {lsn}: {desc}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.desc.lsn_range.start);
|
||||
|
||||
ensure!(self.desc.key_range.contains(&key));
|
||||
|
||||
let inner = self
|
||||
.load(LayerAccessKind::GetValueReconstructData, ctx)
|
||||
.await?;
|
||||
inner
|
||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_range = self.layer_desc().lsn_range.clone();
|
||||
|
||||
let access_stats = self.access_stats.as_api_model(reset);
|
||||
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
layer_file_size: self.desc.file_size,
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_end: lsn_range.end,
|
||||
remote: false,
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.access_stats
|
||||
}
|
||||
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
fname: &DeltaFileName,
|
||||
) -> Utf8PathBuf {
|
||||
match path_or_conf {
|
||||
PathOrConf::Path(path) => path.clone(),
|
||||
PathOrConf::Conf(conf) => conf
|
||||
.timeline_path(tenant_id, timeline_id)
|
||||
.join(fname.to_string()),
|
||||
}
|
||||
inner.dump(ctx).await
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
@@ -461,52 +289,21 @@ impl DeltaLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let summary = match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => Some(Summary::from(self)),
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
let expected_filename = self.layer_desc().filename().file_name();
|
||||
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
|
||||
Ok(Arc::new(loaded))
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
filename: &DeltaFileName,
|
||||
file_size: u64,
|
||||
access_stats: LayerAccessStats,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
filename.key_range.clone(),
|
||||
filename.lsn_range.clone(),
|
||||
file_size,
|
||||
),
|
||||
access_stats,
|
||||
inner: OnceCell::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
|
||||
@@ -520,7 +317,7 @@ impl DeltaLayer {
|
||||
.context("get file metadata to determine size")?;
|
||||
|
||||
Ok(DeltaLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
path: path.to_path_buf(),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
summary.tenant_id,
|
||||
summary.timeline_id,
|
||||
@@ -533,29 +330,9 @@ impl DeltaLayer {
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
self.desc.delta_file_name()
|
||||
}
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> Utf8PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
&self.desc.tenant_id,
|
||||
&self.desc.timeline_id,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.await
|
||||
.context("load delta layer keys")?;
|
||||
DeltaLayerInner::load_keys(inner, ctx)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
fn path(&self) -> Utf8PathBuf {
|
||||
self.path.clone()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -660,7 +437,7 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -717,37 +494,21 @@ impl DeltaLayerWriterInner {
|
||||
// Note: Because we opened the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
let layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_start..key_end,
|
||||
self.lsn_range.clone(),
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
};
|
||||
|
||||
let desc = PersistentLayerDesc::new_delta(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_start..key_end,
|
||||
self.lsn_range.clone(),
|
||||
metadata.len(),
|
||||
);
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&DeltaFileName {
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range,
|
||||
},
|
||||
);
|
||||
std::fs::rename(self.path, &final_path)?;
|
||||
|
||||
trace!("created delta layer {final_path}");
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
|
||||
trace!("created delta layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
@@ -828,8 +589,12 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end).await
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
self.inner.take().unwrap().finish(key_end, timeline).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -967,15 +732,17 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
|
||||
this: &'a T,
|
||||
ctx: &'b RequestContext,
|
||||
pub(super) async fn load_keys<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<DeltaEntry<'a>>> {
|
||||
let dl = this.as_ref();
|
||||
let file = &dl.file;
|
||||
let file = &self.file;
|
||||
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
|
||||
|
||||
@@ -988,7 +755,7 @@ impl DeltaLayerInner {
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
|
||||
Adapter(dl),
|
||||
Adapter(self),
|
||||
)),
|
||||
};
|
||||
let pos = BlobRef(value).pos();
|
||||
@@ -1015,10 +782,61 @@ impl DeltaLayerInner {
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of value storage,
|
||||
// which corresponds to beginning of the index
|
||||
last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
|
||||
last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
|
||||
}
|
||||
Ok(all_keys)
|
||||
}
|
||||
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
self.index_start_blk, self.index_root_blk
|
||||
);
|
||||
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
let keys = self.load_keys(ctx).await?;
|
||||
|
||||
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
}
|
||||
|
||||
for entry in keys {
|
||||
let DeltaEntry { key, lsn, val, .. } = entry;
|
||||
let desc = match dump_blob(val, ctx).await {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => {
|
||||
format!("ERROR: {err}")
|
||||
}
|
||||
};
|
||||
println!(" key {key} at {lsn}: {desc}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of data associated with a delta layer key and its value
|
||||
@@ -1058,3 +876,9 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
|
||||
self.0.as_ref().file.read_blk(blknum, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
|
||||
fn as_ref(&self) -> &DeltaLayerInner {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,21 +31,23 @@ use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::{self, File};
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tracing::*;
|
||||
|
||||
@@ -56,7 +58,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::ImageFileName;
|
||||
use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
|
||||
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -114,22 +116,14 @@ impl Summary {
|
||||
}
|
||||
}
|
||||
|
||||
/// ImageLayer is the in-memory data structure associated with an on-disk image
|
||||
/// file.
|
||||
///
|
||||
/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
|
||||
/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
|
||||
/// Otherwise the struct is just a placeholder for a file that exists on disk,
|
||||
/// and it needs to be loaded before using it in queries.
|
||||
/// This is used only from `pagectl`. Within pageserver, all layers are
|
||||
/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
|
||||
pub struct ImageLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
|
||||
path: Utf8PathBuf,
|
||||
pub desc: PersistentLayerDesc,
|
||||
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
|
||||
pub lsn: Lsn,
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: OnceCell<ImageLayerInner>,
|
||||
}
|
||||
|
||||
@@ -146,6 +140,8 @@ impl std::fmt::Debug for ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// ImageLayer is the in-memory data structure associated with an on-disk image
|
||||
/// file.
|
||||
pub struct ImageLayerInner {
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
@@ -166,73 +162,11 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// Look up given page in the file
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn local_path(&self) -> Option<Utf8PathBuf> {
|
||||
self.local_path()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
self.delete_resident_layer_file()
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.info(reset)
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
self.access_stats()
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.lsn,
|
||||
self.desc.is_incremental(),
|
||||
self.desc.file_size
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
let file = &inner.file;
|
||||
impl ImageLayerInner {
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let file = &self.file;
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
@@ -250,69 +184,36 @@ impl ImageLayer {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.desc.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self
|
||||
.load(LayerAccessKind::GetValueReconstructData, ctx)
|
||||
.await?;
|
||||
inner
|
||||
.get_value_reconstruct_data(key, reconstruct_state, ctx)
|
||||
.await
|
||||
// FIXME: makes no sense to dump paths
|
||||
.with_context(|| format!("read {}", self.path()))
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for ImageLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
|
||||
Some(self.path())
|
||||
impl AsLayerDesc for ImageLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
self.desc.dump();
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
|
||||
inner.dump(ctx).await?;
|
||||
|
||||
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_start = self.layer_desc().image_layer_lsn();
|
||||
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name,
|
||||
layer_file_size: self.desc.file_size,
|
||||
lsn_start,
|
||||
remote: false,
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.access_stats
|
||||
}
|
||||
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
fname: &ImageFileName,
|
||||
) -> Utf8PathBuf {
|
||||
match path_or_conf {
|
||||
PathOrConf::Path(path) => path.to_path_buf(),
|
||||
PathOrConf::Conf(conf) => conf
|
||||
.timeline_path(&tenant_id, &timeline_id)
|
||||
.join(fname.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
conf: &PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -348,54 +249,21 @@ impl ImageLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let expected_summary = match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => Some(Summary::from(self)),
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
|
||||
|
||||
let loaded =
|
||||
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
|
||||
.await?;
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
let expected_filename = self.layer_desc().filename().file_name();
|
||||
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
|
||||
Ok(loaded)
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
filename: &ImageFileName,
|
||||
file_size: u64,
|
||||
access_stats: LayerAccessStats,
|
||||
) -> ImageLayer {
|
||||
ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
desc: PersistentLayerDesc::new_img(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
filename.key_range.clone(),
|
||||
filename.lsn,
|
||||
file_size,
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: filename.lsn,
|
||||
access_stats,
|
||||
inner: OnceCell::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
|
||||
@@ -407,7 +275,7 @@ impl ImageLayer {
|
||||
.metadata()
|
||||
.context("get file metadata to determine size")?;
|
||||
Ok(ImageLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
path: path.to_path_buf(),
|
||||
desc: PersistentLayerDesc::new_img(
|
||||
summary.tenant_id,
|
||||
summary.timeline_id,
|
||||
@@ -421,18 +289,8 @@ impl ImageLayer {
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
self.desc.image_file_name()
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> Utf8PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.desc.timeline_id,
|
||||
self.desc.tenant_id,
|
||||
&self.layer_name(),
|
||||
)
|
||||
fn path(&self) -> Utf8PathBuf {
|
||||
self.path.clone()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -604,7 +462,7 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
async fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -658,33 +516,14 @@ impl ImageLayerWriterInner {
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
let layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
desc,
|
||||
lsn: self.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = ImageLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
},
|
||||
);
|
||||
std::fs::rename(self.path, final_path)?;
|
||||
// FIXME: why not carry the virtualfile here, it supports renaming?
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
|
||||
trace!("created image layer {}", layer.path());
|
||||
trace!("created image layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
@@ -746,8 +585,11 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
|
||||
self.inner.take().unwrap().finish().await
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner.take().unwrap().finish(timeline).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,11 +10,12 @@ use crate::repository::{Key, Value};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walrecord;
|
||||
use anyhow::{ensure, Result};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::OnceLock;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
@@ -28,7 +29,7 @@ use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use super::{DeltaLayer, DeltaLayerWriter, Layer};
|
||||
use super::{DeltaLayerWriter, ResidentLayer};
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -207,20 +208,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let end_lsn = self.end_lsn_or_max();
|
||||
@@ -229,17 +216,13 @@ impl std::fmt::Display for InMemoryLayer {
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
///
|
||||
/// Get layer size.
|
||||
///
|
||||
pub async fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().await;
|
||||
Ok(inner.file.len())
|
||||
}
|
||||
|
||||
///
|
||||
/// Create a new, empty, in-memory layer
|
||||
///
|
||||
pub async fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -331,7 +314,11 @@ impl InMemoryLayer {
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
|
||||
pub(crate) async fn write_to_disk(
|
||||
&self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ResidentLayer> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -376,7 +363,8 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
|
||||
// MAX is used here because we identify L0 layers by full key range
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
|
||||
Ok(delta_layer)
|
||||
}
|
||||
}
|
||||
|
||||
1518
pageserver/src/tenant/storage_layer/layer.rs
Normal file
1518
pageserver/src/tenant/storage_layer/layer.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,3 @@
|
||||
use anyhow::Result;
|
||||
use core::fmt::Display;
|
||||
use std::ops::Range;
|
||||
use utils::{
|
||||
@@ -6,7 +5,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{context::RequestContext, repository::Key};
|
||||
use crate::repository::Key;
|
||||
|
||||
use super::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
|
||||
@@ -100,6 +99,22 @@ impl PersistentLayerDesc {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_filename(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
filename: LayerFileName,
|
||||
file_size: u64,
|
||||
) -> Self {
|
||||
match filename {
|
||||
LayerFileName::Image(i) => {
|
||||
Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
|
||||
}
|
||||
LayerFileName::Delta(d) => {
|
||||
Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the LSN that the image layer covers.
|
||||
pub fn image_layer_lsn(&self) -> Lsn {
|
||||
assert!(!self.is_delta);
|
||||
@@ -173,21 +188,31 @@ impl PersistentLayerDesc {
|
||||
self.is_delta
|
||||
}
|
||||
|
||||
pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end,
|
||||
self.is_delta,
|
||||
self.is_incremental(),
|
||||
self.file_size,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
pub fn dump(&self) {
|
||||
if self.is_delta {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end,
|
||||
self.is_incremental(),
|
||||
self.file_size,
|
||||
);
|
||||
} else {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.image_layer_lsn(),
|
||||
self.is_incremental(),
|
||||
self.file_size
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> u64 {
|
||||
|
||||
@@ -1,216 +0,0 @@
|
||||
//! A RemoteLayer is an in-memory placeholder for a layer file that exists
|
||||
//! in remote storage.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::layer_manager::LayerManager;
|
||||
use anyhow::{bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::models::HistoricLayerInfo;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::filename::{DeltaFileName, ImageFileName};
|
||||
use super::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
/// [`DeltaLayer`].
|
||||
///
|
||||
/// RemoteLayer might be downloaded on-demand during operations which are
|
||||
/// allowed download remote layers and during which, it gets replaced with a
|
||||
/// concrete `DeltaLayer` or `ImageLayer`.
|
||||
///
|
||||
/// See: [`crate::context::RequestContext`] for authorization to download
|
||||
pub struct RemoteLayer {
|
||||
pub desc: PersistentLayerDesc,
|
||||
|
||||
pub layer_metadata: LayerFileMetadata,
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// Has `LayerMap::replace` failed for this (true) or not (false).
|
||||
///
|
||||
/// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
|
||||
/// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
|
||||
/// unprocessable, because a LayerMap::replace failed.
|
||||
///
|
||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
||||
/// `Timeline::download_remote_layer`, which also logs.
|
||||
///
|
||||
/// [`ongoing_download`]: Self::ongoing_download
|
||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for RemoteLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("RemoteLayer")
|
||||
.field("file_name", &self.desc.filename())
|
||||
.field("layer_metadata", &self.layer_metadata)
|
||||
.field("is_incremental", &self.desc.is_incremental())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for RemoteLayer {
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for RemoteLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.layer_desc().short_id())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for RemoteLayer {
|
||||
fn layer_desc(&self) -> &PersistentLayerDesc {
|
||||
&self.desc
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for RemoteLayer {
|
||||
fn local_path(&self) -> Option<Utf8PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
bail!("remote layer has no layer file");
|
||||
}
|
||||
|
||||
fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn is_remote_layer(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_range = self.layer_desc().lsn_range.clone();
|
||||
|
||||
if self.desc.is_delta {
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
layer_file_size: self.layer_metadata.file_size(),
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_end: lsn_range.end,
|
||||
remote: true,
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
} else {
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name,
|
||||
layer_file_size: self.layer_metadata.file_size(),
|
||||
lsn_start: lsn_range.start,
|
||||
remote: true,
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.access_stats
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteLayer {
|
||||
pub fn new_img(
|
||||
tenantid: TenantId,
|
||||
timelineid: TimelineId,
|
||||
fname: &ImageFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
access_stats: LayerAccessStats,
|
||||
) -> RemoteLayer {
|
||||
RemoteLayer {
|
||||
desc: PersistentLayerDesc::new_img(
|
||||
tenantid,
|
||||
timelineid,
|
||||
fname.key_range.clone(),
|
||||
fname.lsn,
|
||||
layer_metadata.file_size(),
|
||||
),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_delta(
|
||||
tenantid: TenantId,
|
||||
timelineid: TimelineId,
|
||||
fname: &DeltaFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
access_stats: LayerAccessStats,
|
||||
) -> RemoteLayer {
|
||||
RemoteLayer {
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
tenantid,
|
||||
timelineid,
|
||||
fname.key_range.clone(),
|
||||
fname.lsn_range.clone(),
|
||||
layer_metadata.file_size(),
|
||||
),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||
pub(crate) fn create_downloaded_layer(
|
||||
&self,
|
||||
_layer_map_lock_held_witness: &LayerManager,
|
||||
conf: &'static PageServerConf,
|
||||
file_size: u64,
|
||||
) -> Arc<dyn PersistentLayer> {
|
||||
if self.desc.is_delta {
|
||||
let fname = self.desc.delta_file_name();
|
||||
Arc::new(DeltaLayer::new(
|
||||
conf,
|
||||
self.desc.timeline_id,
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats
|
||||
.clone_for_residence_change(LayerResidenceStatus::Resident),
|
||||
))
|
||||
} else {
|
||||
let fname = self.desc.image_file_name();
|
||||
Arc::new(ImageLayer::new(
|
||||
conf,
|
||||
self.desc.timeline_id,
|
||||
self.desc.tenant_id,
|
||||
&fname,
|
||||
file_size,
|
||||
self.access_stats
|
||||
.clone_for_residence_change(LayerResidenceStatus::Resident),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -159,11 +159,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
|
||||
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
|
||||
// loop, #3501. There are also additional "allowed_errors" in tests.
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
if first && random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
@@ -183,7 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
|
||||
if !first {
|
||||
// The first iteration is typically much slower, because all tenants compete for the
|
||||
// compaction sempahore to run, and because of concurrent startup work like initializing
|
||||
// logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
|
||||
warn_when_period_overrun(
|
||||
started_at.elapsed(),
|
||||
period,
|
||||
BackgroundLoopKind::Compaction,
|
||||
);
|
||||
}
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
@@ -192,6 +198,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
.await;
|
||||
@@ -223,11 +231,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
|
||||
let period = tenant.get_gc_period();
|
||||
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
if first && random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
@@ -251,7 +256,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
|
||||
if !first {
|
||||
// The first iteration is typically much slower, because all tenants compete for the
|
||||
// compaction sempahore to run, and because of concurrent startup work like initializing
|
||||
// logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
|
||||
}
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
@@ -260,6 +270,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
.await;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -29,7 +29,6 @@ use crate::{
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
storage_layer::PersistentLayer,
|
||||
tasks::{BackgroundLoopKind, RateLimitError},
|
||||
timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
@@ -210,15 +209,26 @@ impl Timeline {
|
||||
// NB: all the checks can be invalidated as soon as we release the layer map lock.
|
||||
// We don't want to hold the layer map lock during eviction.
|
||||
// So, we just need to deal with this.
|
||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
||||
let candidates: Vec<_> = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let mut candidates = Vec::new();
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
let hist_layer = guard.get_from_desc(&hist_layer);
|
||||
if hist_layer.is_remote_layer() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// guard against eviction while we inspect it; it might be that eviction_task and
|
||||
// disk_usage_eviction_task both select the same layers to be evicted, and
|
||||
// seemingly free up double the space. both succeeding is of no consequence.
|
||||
let guard = match hist_layer.keep_resident().await {
|
||||
Ok(Some(l)) => l,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
// these should not happen, but we cannot make them statically impossible right
|
||||
// now.
|
||||
tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
|
||||
// We only use this fallback if there's an implementation error.
|
||||
@@ -249,7 +259,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
if no_activity_for > p.threshold {
|
||||
candidates.push(hist_layer)
|
||||
candidates.push(guard.drop_eviction_guard())
|
||||
}
|
||||
}
|
||||
candidates
|
||||
@@ -268,7 +278,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let results = match self
|
||||
.evict_layer_batch(remote_client, &candidates[..], cancel.clone())
|
||||
.evict_layer_batch(remote_client, &candidates, cancel)
|
||||
.await
|
||||
{
|
||||
Err(pre_err) => {
|
||||
@@ -279,7 +289,7 @@ impl Timeline {
|
||||
Ok(results) => results,
|
||||
};
|
||||
assert_eq!(results.len(), candidates.len());
|
||||
for (l, result) in candidates.iter().zip(results) {
|
||||
for result in results {
|
||||
match result {
|
||||
None => {
|
||||
stats.skipped_for_shutdown += 1;
|
||||
@@ -287,24 +297,10 @@ impl Timeline {
|
||||
Some(Ok(())) => {
|
||||
stats.evicted += 1;
|
||||
}
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
|
||||
// compaction/gc removed the file while we were waiting on layer_removal_cs
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(
|
||||
e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(layer = %l, "failed to evict layer: {e}");
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(EvictionError::MetadataInconsistency(detail))) => {
|
||||
warn!(layer = %l, "failed to evict layer: {detail}");
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if stats.candidates == stats.not_evictable {
|
||||
@@ -348,20 +344,7 @@ impl Timeline {
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
|
||||
//
|
||||
// It is critical we are responsive to cancellation here. Otherwise, we deadlock with
|
||||
// tenant deletion (holds TENANTS in read mode) any other task that attempts to
|
||||
// acquire TENANTS in write mode before we here call get_tenant.
|
||||
// See https://github.com/neondatabase/neon/issues/5284.
|
||||
let res = tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
|
||||
res
|
||||
}
|
||||
};
|
||||
let tenant = match res {
|
||||
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
|
||||
Ok(t) => t,
|
||||
Err(_) => {
|
||||
return ControlFlow::Break(());
|
||||
|
||||
@@ -12,27 +12,16 @@ use crate::{
|
||||
tenant::{
|
||||
layer_map::{BatchedUpdates, LayerMap},
|
||||
storage_layer::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
|
||||
PersistentLayerDesc, PersistentLayerKey,
|
||||
AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey,
|
||||
ResidentLayer,
|
||||
},
|
||||
timeline::compare_arced_layers,
|
||||
},
|
||||
};
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
pub(crate) struct LayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager,
|
||||
}
|
||||
|
||||
/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
|
||||
/// scheduling deletes in remote client.
|
||||
pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
|
||||
|
||||
impl ApplyGcResultGuard<'_> {
|
||||
pub(crate) fn flush(self) {
|
||||
self.0.flush();
|
||||
}
|
||||
layer_fmgr: LayerFileManager<Layer>,
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
@@ -43,7 +32,7 @@ impl LayerManager {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
}
|
||||
|
||||
@@ -55,21 +44,12 @@ impl LayerManager {
|
||||
&self.layer_map
|
||||
}
|
||||
|
||||
/// Replace layers in the layer file manager, used in evictions and layer downloads.
|
||||
pub(crate) fn replace_and_verify(
|
||||
&mut self,
|
||||
expected: Arc<dyn PersistentLayer>,
|
||||
new: Arc<dyn PersistentLayer>,
|
||||
) -> Result<()> {
|
||||
self.layer_fmgr.replace_and_verify(expected, new)
|
||||
}
|
||||
|
||||
/// Called from `load_layer_map`. Initialize the layer manager with:
|
||||
/// 1. all on-disk layers
|
||||
/// 2. next open layer (with disk disk_consistent_lsn LSN)
|
||||
pub(crate) fn initialize_local_layers(
|
||||
&mut self,
|
||||
on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
on_disk_layers: Vec<Layer>,
|
||||
next_open_layer_at: Lsn,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
@@ -164,10 +144,19 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
|
||||
pub(crate) fn track_new_image_layers(
|
||||
&mut self,
|
||||
image_layers: &[ResidentLayer],
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in image_layers {
|
||||
Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
|
||||
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
|
||||
// record these here instead of Layer::finish_creating because otherwise partial
|
||||
// failure with create_image_layers would balloon up the physical size gauge. downside
|
||||
// is that all layers need to be created before metrics are updated.
|
||||
metrics.record_new_file_metrics(layer.layer_desc().file_size);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
@@ -175,76 +164,71 @@ impl LayerManager {
|
||||
/// Flush a frozen layer and add the written delta layer to the layer map.
|
||||
pub(crate) fn finish_flush_l0_layer(
|
||||
&mut self,
|
||||
delta_layer: Option<DeltaLayer>,
|
||||
delta_layer: Option<&ResidentLayer>,
|
||||
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
let l = self.layer_map.frozen_layers.pop_front();
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
let inmem = self
|
||||
.layer_map
|
||||
.frozen_layers
|
||||
.pop_front()
|
||||
.expect("there must be a inmem layer to flush");
|
||||
|
||||
// Only one thread may call this function at a time (for this
|
||||
// timeline). If two threads tried to flush the same frozen
|
||||
// Only one task may call this function at a time (for this
|
||||
// timeline). If two tasks tried to flush the same frozen
|
||||
// layer to disk at the same time, that would not work.
|
||||
assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
|
||||
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
|
||||
|
||||
if let Some(delta_layer) = delta_layer {
|
||||
Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
|
||||
if let Some(l) = delta_layer {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
||||
updates.flush();
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
pub(crate) fn finish_compact_l0(
|
||||
&mut self,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
compact_from: Vec<Arc<dyn PersistentLayer>>,
|
||||
compact_to: Vec<Arc<dyn PersistentLayer>>,
|
||||
layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
compact_from: &[Layer],
|
||||
compact_to: &[ResidentLayer],
|
||||
metrics: &TimelineMetrics,
|
||||
) -> Result<()> {
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for l in compact_to {
|
||||
Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
|
||||
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
||||
}
|
||||
for l in compact_from {
|
||||
// NB: the layer file identified by descriptor `l` is guaranteed to be present
|
||||
// in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
|
||||
// time, even though we dropped `Timeline::layers` inbetween.
|
||||
Self::delete_historic_layer(
|
||||
layer_removal_cs.clone(),
|
||||
l,
|
||||
&mut updates,
|
||||
metrics,
|
||||
&mut self.layer_fmgr,
|
||||
)?;
|
||||
Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
|
||||
pub(crate) fn finish_gc_timeline(
|
||||
&mut self,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
gc_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
metrics: &TimelineMetrics,
|
||||
) -> Result<ApplyGcResultGuard> {
|
||||
layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
gc_layers: Vec<Layer>,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for doomed_layer in gc_layers {
|
||||
Self::delete_historic_layer(
|
||||
layer_removal_cs.clone(),
|
||||
doomed_layer,
|
||||
layer_removal_cs,
|
||||
&doomed_layer,
|
||||
&mut updates,
|
||||
metrics,
|
||||
&mut self.layer_fmgr,
|
||||
)?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
|
||||
);
|
||||
}
|
||||
Ok(ApplyGcResultGuard(updates))
|
||||
updates.flush()
|
||||
}
|
||||
|
||||
/// Helper function to insert a layer into the layer map and file manager.
|
||||
fn insert_historic_layer(
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
layer: Layer,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager,
|
||||
mapping: &mut LayerFileManager<Layer>,
|
||||
) {
|
||||
updates.insert_historic(layer.layer_desc().clone());
|
||||
mapping.insert(layer);
|
||||
@@ -254,17 +238,12 @@ impl LayerManager {
|
||||
/// Remote storage is not affected by this operation.
|
||||
fn delete_historic_layer(
|
||||
// we cannot remove layers otherwise, since gc and compaction will race
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
_layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layer: &Layer,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
metrics: &TimelineMetrics,
|
||||
mapping: &mut LayerFileManager,
|
||||
) -> anyhow::Result<()> {
|
||||
mapping: &mut LayerFileManager<Layer>,
|
||||
) {
|
||||
let desc = layer.layer_desc();
|
||||
if !layer.is_remote_layer() {
|
||||
layer.delete_resident_layer_file()?;
|
||||
metrics.resident_physical_size_sub(desc.file_size);
|
||||
}
|
||||
|
||||
// TODO Removing from the bottom of the layer map is expensive.
|
||||
// Maybe instead discard all layer map historic versions that
|
||||
@@ -273,21 +252,18 @@ impl LayerManager {
|
||||
// map index without actually rebuilding the index.
|
||||
updates.remove_historic(desc);
|
||||
mapping.remove(layer);
|
||||
|
||||
Ok(())
|
||||
layer.garbage_collect_on_drop();
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||
HashMap<PersistentLayerKey, Arc<T>>,
|
||||
);
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
|
||||
impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
|
||||
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.0
|
||||
@@ -297,14 +273,14 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn insert(&mut self, layer: Arc<T>) {
|
||||
pub(crate) fn insert(&mut self, layer: T) {
|
||||
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
||||
if present.is_some() && cfg!(debug_assertions) {
|
||||
panic!("overwriting a layer: {:?}", layer.layer_desc())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
|
||||
pub(crate) fn contains(&self, layer: &T) -> bool {
|
||||
self.0.contains_key(&layer.layer_desc().key())
|
||||
}
|
||||
|
||||
@@ -312,7 +288,7 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
|
||||
pub(crate) fn remove(&mut self, layer: Arc<T>) {
|
||||
pub(crate) fn remove(&mut self, layer: &T) {
|
||||
let present = self.0.remove(&layer.layer_desc().key());
|
||||
if present.is_none() && cfg!(debug_assertions) {
|
||||
panic!(
|
||||
@@ -321,39 +297,4 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
|
||||
let key = expected.layer_desc().key();
|
||||
let other = new.layer_desc().key();
|
||||
|
||||
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
|
||||
let new_l0 = LayerMap::is_l0(new.layer_desc());
|
||||
|
||||
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
|
||||
"layermap-replace-notfound"
|
||||
));
|
||||
|
||||
anyhow::ensure!(
|
||||
key == other,
|
||||
"expected and new layer have different keys: {key:?} != {other:?}"
|
||||
);
|
||||
|
||||
anyhow::ensure!(
|
||||
expected_l0 == new_l0,
|
||||
"one layer is l0 while the other is not: {expected_l0} != {new_l0}"
|
||||
);
|
||||
|
||||
if let Some(layer) = self.0.get_mut(&key) {
|
||||
anyhow::ensure!(
|
||||
compare_arced_layers(&expected, layer),
|
||||
"another layer was found instead of expected, expected={expected:?}, new={new:?}",
|
||||
expected = Arc::as_ptr(&expected),
|
||||
new = Arc::as_ptr(layer),
|
||||
);
|
||||
*layer = new;
|
||||
Ok(())
|
||||
} else {
|
||||
anyhow::bail!("layer was not found");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,8 +26,7 @@ use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use storage_broker::Streaming;
|
||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
@@ -137,8 +136,17 @@ pub(super) async fn connection_manager_loop_step(
|
||||
broker_update = broker_subscription.message() => {
|
||||
match broker_update {
|
||||
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
||||
Err(e) => {
|
||||
error!("broker subscription failed: {e}");
|
||||
Err(status) => {
|
||||
match status.code() {
|
||||
Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
|
||||
// tonic's error handling doesn't provide a clear code for disconnections: we get
|
||||
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
|
||||
info!("broker disconnected: {status}");
|
||||
},
|
||||
_ => {
|
||||
warn!("broker subscription failed: {status}");
|
||||
}
|
||||
}
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Ok(None) => {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::storage_layer::LayerFileName;
|
||||
use super::storage_layer::ResidentLayer;
|
||||
use super::Generation;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
@@ -203,18 +204,6 @@ impl UploadQueue {
|
||||
UploadQueue::Stopped(stopped) => Ok(stopped),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_layer_metadata(
|
||||
&self,
|
||||
name: &LayerFileName,
|
||||
) -> anyhow::Result<Option<LayerFileMetadata>> {
|
||||
match self {
|
||||
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
|
||||
anyhow::bail!("queue is in state {}", self.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(inner) => Ok(inner.latest_files.get(name).cloned()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An in-progress upload or delete task.
|
||||
@@ -237,7 +226,7 @@ pub(crate) struct Delete {
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum UploadOp {
|
||||
/// Upload a layer file
|
||||
UploadLayer(LayerFileName, LayerFileMetadata),
|
||||
UploadLayer(ResidentLayer, LayerFileMetadata),
|
||||
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
@@ -252,13 +241,13 @@ pub(crate) enum UploadOp {
|
||||
impl std::fmt::Display for UploadOp {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
UploadOp::UploadLayer(path, metadata) => {
|
||||
UploadOp::UploadLayer(layer, metadata) => {
|
||||
write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?}, gen={:?})",
|
||||
path.file_name(),
|
||||
layer,
|
||||
metadata.file_size(),
|
||||
metadata.generation,
|
||||
metadata.generation
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
|
||||
12
poetry.lock
generated
12
poetry.lock
generated
@@ -2447,20 +2447,20 @@ test = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "2.2.3"
|
||||
version = "3.0.1"
|
||||
description = "The comprehensive WSGI web application library."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
|
||||
{file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
|
||||
{file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
|
||||
{file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
MarkupSafe = ">=2.1.1"
|
||||
|
||||
[package.extras]
|
||||
watchdog = ["watchdog"]
|
||||
watchdog = ["watchdog (>=2.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "wrapt"
|
||||
@@ -2719,4 +2719,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "c5981d8d7c2deadd47c823bc35f86f830c8e320b653d2d3718bade1f4d2dabca"
|
||||
content-hash = "74649cf47c52f21b01b096a42044750b1c9677576b405be0489c2909127a9bf1"
|
||||
|
||||
@@ -3,7 +3,9 @@ mod hacks;
|
||||
mod link;
|
||||
|
||||
pub use link::LinkAuthError;
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
|
||||
use crate::proxy::{handle_try_wake, retry_after};
|
||||
use crate::{
|
||||
auth::{self, ClientCredentials},
|
||||
config::AuthenticationConfig,
|
||||
@@ -16,8 +18,9 @@ use crate::{
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use std::borrow::Cow;
|
||||
use std::ops::ControlFlow;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
/// A product of successful authentication.
|
||||
pub struct AuthSuccess<T> {
|
||||
@@ -117,22 +120,27 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ComputeCredentials {
|
||||
Password(Vec<u8>),
|
||||
AuthKeys(AuthKeys),
|
||||
}
|
||||
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
/// Here, we choose the appropriate auth flow based on circumstances.
|
||||
async fn auth_quirks(
|
||||
async fn auth_quirks_creds(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
// If there's no project so far, that entails that client doesn't
|
||||
// support SNI or other means of passing the endpoint (project) name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
if creds.project.is_none() {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::password_hack(api, extra, creds, client).await;
|
||||
return hacks::password_hack(creds, client).await;
|
||||
}
|
||||
|
||||
// Password hack should set the project name.
|
||||
@@ -143,13 +151,55 @@ async fn auth_quirks(
|
||||
// Currently, we use it for websocket connections (latency).
|
||||
if allow_cleartext {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::cleartext_hack(api, extra, creds, client).await;
|
||||
return hacks::cleartext_hack(client).await;
|
||||
}
|
||||
|
||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||
classic::authenticate(api, extra, creds, client, config).await
|
||||
}
|
||||
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
/// Here, we choose the appropriate auth flow based on circumstances.
|
||||
async fn auth_quirks(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
let auth_stuff = auth_quirks_creds(api, extra, creds, client, allow_cleartext, config).await?;
|
||||
|
||||
let mut num_retries = 0;
|
||||
let mut node = loop {
|
||||
let wake_res = api.wake_compute(extra, creds).await;
|
||||
match handle_try_wake(wake_res, num_retries) {
|
||||
Err(e) => {
|
||||
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
|
||||
return Err(e.into());
|
||||
}
|
||||
Ok(ControlFlow::Continue(e)) => {
|
||||
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
|
||||
}
|
||||
Ok(ControlFlow::Break(n)) => break n,
|
||||
}
|
||||
|
||||
let wait_duration = retry_after(num_retries);
|
||||
num_retries += 1;
|
||||
tokio::time::sleep(wait_duration).await;
|
||||
};
|
||||
|
||||
match auth_stuff.value {
|
||||
ComputeCredentials::Password(password) => node.config.password(password),
|
||||
ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
|
||||
};
|
||||
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: auth_stuff.reported_auth_ok,
|
||||
value: node,
|
||||
})
|
||||
}
|
||||
|
||||
impl BackendType<'_, ClientCredentials<'_>> {
|
||||
/// Get compute endpoint name from the credentials.
|
||||
pub fn get_endpoint(&self) -> Option<String> {
|
||||
|
||||
@@ -1,17 +1,14 @@
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use super::AuthSuccess;
|
||||
use super::{AuthSuccess, ComputeCredentials};
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
compute,
|
||||
config::AuthenticationConfig,
|
||||
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
|
||||
proxy::{handle_try_wake, retry_after},
|
||||
console::{self, AuthInfo, ConsoleReqExtra},
|
||||
sasl, scram,
|
||||
stream::PqStream,
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, warn};
|
||||
use tracing::{info, warn};
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
api: &impl console::Api,
|
||||
@@ -19,7 +16,7 @@ pub(super) async fn authenticate(
|
||||
creds: &ClientCredentials<'_>,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
info!("fetching user's authentication info");
|
||||
let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
@@ -66,38 +63,17 @@ pub(super) async fn authenticate(
|
||||
}
|
||||
};
|
||||
|
||||
Some(compute::ScramKeys {
|
||||
compute::ScramKeys {
|
||||
client_key: client_key.as_bytes(),
|
||||
server_key: secret.server_key.as_bytes(),
|
||||
})
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut num_retries = 0;
|
||||
let mut node = loop {
|
||||
let wake_res = api.wake_compute(extra, creds).await;
|
||||
match handle_try_wake(wake_res, num_retries) {
|
||||
Err(e) => {
|
||||
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
|
||||
return Err(e.into());
|
||||
}
|
||||
Ok(ControlFlow::Continue(e)) => {
|
||||
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
|
||||
}
|
||||
Ok(ControlFlow::Break(n)) => break n,
|
||||
}
|
||||
|
||||
let wait_duration = retry_after(num_retries);
|
||||
num_retries += 1;
|
||||
tokio::time::sleep(wait_duration).await;
|
||||
};
|
||||
if let Some(keys) = scram_keys {
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
node.config.auth_keys(AuthKeys::ScramSha256(keys));
|
||||
}
|
||||
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
|
||||
scram_keys,
|
||||
)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1,10 +1,6 @@
|
||||
use super::AuthSuccess;
|
||||
use super::{AuthSuccess, ComputeCredentials};
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
console::{
|
||||
self,
|
||||
provider::{CachedNodeInfo, ConsoleReqExtra},
|
||||
},
|
||||
stream,
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -15,11 +11,8 @@ use tracing::{info, warn};
|
||||
/// These properties are benefical for serverless JS workers, so we
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn cleartext_hack(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
let password = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword)
|
||||
@@ -27,24 +20,19 @@ pub async fn cleartext_hack(
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
let mut node = api.wake_compute(extra, creds).await?;
|
||||
node.config.password(password);
|
||||
|
||||
// Report tentative success; compute node will check the password anyway.
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
value: ComputeCredentials::Password(password),
|
||||
})
|
||||
}
|
||||
|
||||
/// Workaround for clients which don't provide an endpoint (project) name.
|
||||
/// Very similar to [`cleartext_hack`], but there's a specific password format.
|
||||
pub async fn password_hack(
|
||||
api: &impl console::Api,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
warn!("project not specified, resorting to the password hack auth flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
@@ -55,12 +43,9 @@ pub async fn password_hack(
|
||||
info!(project = &payload.endpoint, "received missing parameter");
|
||||
creds.project = Some(payload.endpoint);
|
||||
|
||||
let mut node = api.wake_compute(extra, creds).await?;
|
||||
node.config.password(payload.password);
|
||||
|
||||
// Report tentative success; compute node will check the password anyway.
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: node,
|
||||
value: ComputeCredentials::Password(payload.password),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -4,10 +4,11 @@ use proxy::config::AuthenticationConfig;
|
||||
use proxy::config::HttpConfig;
|
||||
use proxy::console;
|
||||
use proxy::http;
|
||||
use proxy::metrics;
|
||||
use proxy::usage_metrics;
|
||||
|
||||
use anyhow::bail;
|
||||
use proxy::config::{self, ProxyConfig};
|
||||
use proxy::serverless;
|
||||
use std::pin::pin;
|
||||
use std::{borrow::Cow, net::SocketAddr};
|
||||
use tokio::net::TcpListener;
|
||||
@@ -129,14 +130,16 @@ async fn main() -> anyhow::Result<()> {
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
|
||||
if let Some(wss_address) = args.wss {
|
||||
let wss_address: SocketAddr = wss_address.parse()?;
|
||||
info!("Starting wss on {wss_address}");
|
||||
let wss_listener = TcpListener::bind(wss_address).await?;
|
||||
// TODO: rename the argument to something like serverless.
|
||||
// It now covers more than just websockets, it also covers SQL over HTTP.
|
||||
if let Some(serverless_address) = args.wss {
|
||||
let serverless_address: SocketAddr = serverless_address.parse()?;
|
||||
info!("Starting wss on {serverless_address}");
|
||||
let serverless_listener = TcpListener::bind(serverless_address).await?;
|
||||
|
||||
client_tasks.spawn(http::websocket::task_main(
|
||||
client_tasks.spawn(serverless::task_main(
|
||||
config,
|
||||
wss_listener,
|
||||
serverless_listener,
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
}
|
||||
@@ -144,11 +147,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
// maintenance tasks. these never return unless there's an error
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
|
||||
maintenance_tasks.spawn(http::server::task_main(http_listener));
|
||||
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
|
||||
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
|
||||
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
maintenance_tasks.spawn(metrics::task_main(metrics_config));
|
||||
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
|
||||
}
|
||||
|
||||
let maintenance = loop {
|
||||
|
||||
@@ -90,7 +90,11 @@ pub mod errors {
|
||||
status: http::StatusCode::LOCKED,
|
||||
ref text,
|
||||
} => {
|
||||
!text.contains("written data quota exceeded")
|
||||
// written data quota exceeded
|
||||
// data transfer quota exceeded
|
||||
// compute time quota exceeded
|
||||
// logical size quota exceeded
|
||||
!text.contains("quota exceeded")
|
||||
&& !text.contains("the limit for current plan reached")
|
||||
}
|
||||
// retry server errors
|
||||
|
||||
@@ -2,10 +2,7 @@
|
||||
//! Other modules should use stuff from this module instead of
|
||||
//! directly relying on deps like `reqwest` (think loose coupling).
|
||||
|
||||
pub mod conn_pool;
|
||||
pub mod server;
|
||||
pub mod sql_over_http;
|
||||
pub mod websocket;
|
||||
pub mod health_server;
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
|
||||
@@ -14,14 +14,15 @@ pub mod console;
|
||||
pub mod error;
|
||||
pub mod http;
|
||||
pub mod logging;
|
||||
pub mod metrics;
|
||||
pub mod parse;
|
||||
pub mod protocol2;
|
||||
pub mod proxy;
|
||||
pub mod sasl;
|
||||
pub mod scram;
|
||||
pub mod serverless;
|
||||
pub mod stream;
|
||||
pub mod url;
|
||||
pub mod usage_metrics;
|
||||
pub mod waiters;
|
||||
|
||||
/// Handle unix signals appropriately.
|
||||
|
||||
@@ -8,9 +8,9 @@ use crate::{
|
||||
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
|
||||
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
|
||||
http::StatusCode,
|
||||
metrics::{Ids, USAGE_METRICS},
|
||||
protocol2::WithClientIp,
|
||||
stream::{PqStream, Stream},
|
||||
usage_metrics::{Ids, USAGE_METRICS},
|
||||
};
|
||||
use anyhow::{bail, Context};
|
||||
use async_trait::async_trait;
|
||||
|
||||
@@ -1,235 +1,36 @@
|
||||
use crate::{
|
||||
cancellation::CancelMap,
|
||||
config::ProxyConfig,
|
||||
error::io_error,
|
||||
protocol2::{ProxyProtocolAccept, WithClientIp},
|
||||
proxy::{
|
||||
handle_client, ClientMode, NUM_CLIENT_CONNECTION_CLOSED_COUNTER,
|
||||
NUM_CLIENT_CONNECTION_OPENED_COUNTER,
|
||||
},
|
||||
};
|
||||
//! Routers for our serverless APIs
|
||||
//!
|
||||
//! Handles both SQL over HTTP and SQL over Websockets.
|
||||
|
||||
mod conn_pool;
|
||||
mod sql_over_http;
|
||||
mod websocket;
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{Sink, Stream, StreamExt};
|
||||
use hyper::StatusCode;
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
|
||||
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
|
||||
use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
|
||||
use crate::{cancellation::CancelMap, config::ProxyConfig};
|
||||
use futures::StreamExt;
|
||||
use hyper::{
|
||||
server::{
|
||||
accept,
|
||||
conn::{AddrIncoming, AddrStream},
|
||||
},
|
||||
upgrade::Upgraded,
|
||||
Body, Method, Request, Response, StatusCode,
|
||||
Body, Method, Request, Response,
|
||||
};
|
||||
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
|
||||
use pin_project_lite::pin_project;
|
||||
|
||||
use std::{
|
||||
future::ready,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{ready, Context, Poll},
|
||||
};
|
||||
use std::task::Poll;
|
||||
use std::{future::ready, sync::Arc};
|
||||
use tls_listener::TlsListener;
|
||||
use tokio::{
|
||||
io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
|
||||
net::TcpListener,
|
||||
};
|
||||
use tokio::net::TcpListener;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::http::{error::ApiError, json::json_response};
|
||||
|
||||
// TODO: use `std::sync::Exclusive` once it's stabilized.
|
||||
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
|
||||
use sync_wrapper::SyncWrapper;
|
||||
|
||||
use super::{conn_pool::GlobalConnPool, sql_over_http};
|
||||
|
||||
pin_project! {
|
||||
/// This is a wrapper around a [`WebSocketStream`] that
|
||||
/// implements [`AsyncRead`] and [`AsyncWrite`].
|
||||
pub struct WebSocketRw {
|
||||
#[pin]
|
||||
stream: SyncWrapper<WebSocketStream<Upgraded>>,
|
||||
bytes: Bytes,
|
||||
}
|
||||
}
|
||||
|
||||
impl WebSocketRw {
|
||||
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
|
||||
Self {
|
||||
stream: stream.into(),
|
||||
bytes: Bytes::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncWrite for WebSocketRw {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<io::Result<usize>> {
|
||||
let mut stream = self.project().stream.get_pin_mut();
|
||||
|
||||
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
|
||||
match stream.as_mut().start_send(Message::Binary(buf.into())) {
|
||||
Ok(()) => Poll::Ready(Ok(buf.len())),
|
||||
Err(e) => Poll::Ready(Err(io_error(e))),
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_flush(cx).map_err(io_error)
|
||||
}
|
||||
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_close(cx).map_err(io_error)
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncRead for WebSocketRw {
|
||||
fn poll_read(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
if buf.remaining() > 0 {
|
||||
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
|
||||
let len = std::cmp::min(bytes.len(), buf.remaining());
|
||||
buf.put_slice(&bytes[..len]);
|
||||
self.consume(len);
|
||||
}
|
||||
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncBufRead for WebSocketRw {
|
||||
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
|
||||
// Please refer to poll_fill_buf's documentation.
|
||||
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
|
||||
|
||||
let mut this = self.project();
|
||||
loop {
|
||||
if !this.bytes.chunk().is_empty() {
|
||||
let chunk = (*this.bytes).chunk();
|
||||
return Poll::Ready(Ok(chunk));
|
||||
}
|
||||
|
||||
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
|
||||
match res.transpose().map_err(io_error)? {
|
||||
Some(message) => match message {
|
||||
Message::Ping(_) => {}
|
||||
Message::Pong(_) => {}
|
||||
Message::Text(text) => {
|
||||
// We expect to see only binary messages.
|
||||
let error = "unexpected text message in the websocket";
|
||||
warn!(length = text.len(), error);
|
||||
return Poll::Ready(Err(io_error(error)));
|
||||
}
|
||||
Message::Frame(_) => {
|
||||
// This case is impossible according to Frame's doc.
|
||||
panic!("unexpected raw frame in the websocket");
|
||||
}
|
||||
Message::Binary(chunk) => {
|
||||
assert!(this.bytes.is_empty());
|
||||
*this.bytes = Bytes::from(chunk);
|
||||
}
|
||||
Message::Close(_) => return EOF,
|
||||
},
|
||||
None => return EOF,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn consume(self: Pin<&mut Self>, amount: usize) {
|
||||
self.project().bytes.advance(amount);
|
||||
}
|
||||
}
|
||||
|
||||
async fn serve_websocket(
|
||||
websocket: HyperWebsocket,
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
hostname: Option<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
handle_client(
|
||||
config,
|
||||
cancel_map,
|
||||
session_id,
|
||||
WebSocketRw::new(websocket),
|
||||
ClientMode::Websockets { hostname },
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn ws_handler(
|
||||
mut request: Request<Body>,
|
||||
config: &'static ProxyConfig,
|
||||
conn_pool: Arc<GlobalConnPool>,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
session_id: uuid::Uuid,
|
||||
sni_hostname: Option<String>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let host = request
|
||||
.headers()
|
||||
.get("host")
|
||||
.and_then(|h| h.to_str().ok())
|
||||
.and_then(|h| h.split(':').next())
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
info!(session_id = ?session_id, "performing websocket upgrade");
|
||||
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Err(e) =
|
||||
serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
|
||||
}
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
// TODO: that deserves a refactor as now this function also handles http json client besides websockets.
|
||||
// Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
|
||||
sql_over_http::handle(
|
||||
request,
|
||||
sni_hostname,
|
||||
conn_pool,
|
||||
session_id,
|
||||
&config.http_config,
|
||||
)
|
||||
.await
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
|
||||
Response::builder()
|
||||
.header("Allow", "OPTIONS, POST")
|
||||
.header("Access-Control-Allow-Origin", "*")
|
||||
.header(
|
||||
"Access-Control-Allow-Headers",
|
||||
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
|
||||
)
|
||||
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
|
||||
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
|
||||
.body(Body::empty())
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
} else {
|
||||
json_response(StatusCode::BAD_REQUEST, "query is not supported")
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
ws_listener: TcpListener,
|
||||
@@ -239,7 +40,7 @@ pub async fn task_main(
|
||||
info!("websocket server has shut down");
|
||||
}
|
||||
|
||||
let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
|
||||
let conn_pool = conn_pool::GlobalConnPool::new(config);
|
||||
|
||||
// shutdown the connection pool
|
||||
tokio::spawn({
|
||||
@@ -300,13 +101,15 @@ pub async fn task_main(
|
||||
let cancel_map = Arc::new(CancelMap::default());
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
|
||||
ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
|
||||
.instrument(info_span!(
|
||||
"ws-client",
|
||||
session = %session_id,
|
||||
%peer_addr,
|
||||
))
|
||||
.await
|
||||
request_handler(
|
||||
req, config, conn_pool, cancel_map, session_id, sni_name,
|
||||
)
|
||||
.instrument(info_span!(
|
||||
"serverless",
|
||||
session = %session_id,
|
||||
%peer_addr,
|
||||
))
|
||||
.await
|
||||
}
|
||||
},
|
||||
)))
|
||||
@@ -359,3 +162,65 @@ where
|
||||
self.inner.call(req)
|
||||
}
|
||||
}
|
||||
|
||||
async fn request_handler(
|
||||
mut request: Request<Body>,
|
||||
config: &'static ProxyConfig,
|
||||
conn_pool: Arc<conn_pool::GlobalConnPool>,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
session_id: uuid::Uuid,
|
||||
sni_hostname: Option<String>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let host = request
|
||||
.headers()
|
||||
.get("host")
|
||||
.and_then(|h| h.to_str().ok())
|
||||
.and_then(|h| h.split(':').next())
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
info!(session_id = ?session_id, "performing websocket upgrade");
|
||||
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Err(e) =
|
||||
websocket::serve_websocket(websocket, config, &cancel_map, session_id, host)
|
||||
.await
|
||||
{
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
|
||||
}
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
|
||||
sql_over_http::handle(
|
||||
request,
|
||||
sni_hostname,
|
||||
conn_pool,
|
||||
session_id,
|
||||
&config.http_config,
|
||||
)
|
||||
.await
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
|
||||
Response::builder()
|
||||
.header("Allow", "OPTIONS, POST")
|
||||
.header("Access-Control-Allow-Origin", "*")
|
||||
.header(
|
||||
"Access-Control-Allow-Headers",
|
||||
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
|
||||
)
|
||||
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
|
||||
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
|
||||
.body(Body::empty())
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
} else {
|
||||
json_response(StatusCode::BAD_REQUEST, "query is not supported")
|
||||
}
|
||||
}
|
||||
@@ -22,8 +22,8 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
||||
|
||||
use crate::{
|
||||
auth, console,
|
||||
metrics::{Ids, MetricCounter, USAGE_METRICS},
|
||||
proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
|
||||
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
|
||||
};
|
||||
use crate::{compute, config};
|
||||
|
||||
@@ -191,22 +191,39 @@ impl GlobalConnPool {
|
||||
// ok return cached connection if found and establish a new one otherwise
|
||||
let new_client = if let Some(client) = client {
|
||||
if client.inner.is_closed() {
|
||||
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
|
||||
connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
|
||||
connect_to_compute(
|
||||
self.proxy_config,
|
||||
conn_info,
|
||||
conn_id,
|
||||
session_id,
|
||||
latency_timer,
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
info!("pool: reusing connection '{conn_info}'");
|
||||
client.session.send(session_id)?;
|
||||
latency_timer.pool_hit();
|
||||
latency_timer.success();
|
||||
return Ok(Client {
|
||||
conn_id: client.conn_id,
|
||||
inner: Some(client),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
info!("pool: opening a new connection '{conn_info}'");
|
||||
connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
|
||||
connect_to_compute(
|
||||
self.proxy_config,
|
||||
conn_info,
|
||||
conn_id,
|
||||
session_id,
|
||||
latency_timer,
|
||||
)
|
||||
.await
|
||||
};
|
||||
|
||||
match &new_client {
|
||||
@@ -243,6 +260,7 @@ impl GlobalConnPool {
|
||||
}
|
||||
|
||||
new_client.map(|inner| Client {
|
||||
conn_id: inner.conn_id,
|
||||
inner: Some(inner),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
@@ -250,16 +268,18 @@ impl GlobalConnPool {
|
||||
}
|
||||
|
||||
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
|
||||
let conn_id = client.conn_id;
|
||||
|
||||
// We want to hold this open while we return. This ensures that the pool can't close
|
||||
// while we are in the middle of returning the connection.
|
||||
let closed = self.closed.read();
|
||||
if *closed {
|
||||
info!("pool: throwing away connection '{conn_info}' because pool is closed");
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if client.inner.is_closed() {
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is closed");
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -291,9 +311,9 @@ impl GlobalConnPool {
|
||||
|
||||
// do logging outside of the mutex
|
||||
if returned {
|
||||
info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
|
||||
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
|
||||
} else {
|
||||
info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -340,6 +360,7 @@ impl GlobalConnPool {
|
||||
struct TokioMechanism<'a> {
|
||||
conn_info: &'a ConnInfo,
|
||||
session_id: uuid::Uuid,
|
||||
conn_id: uuid::Uuid,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -353,7 +374,14 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
node_info: &console::CachedNodeInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<Self::Connection, Self::ConnectError> {
|
||||
connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
|
||||
connect_to_compute_once(
|
||||
node_info,
|
||||
self.conn_info,
|
||||
timeout,
|
||||
self.conn_id,
|
||||
self.session_id,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
|
||||
@@ -366,6 +394,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
async fn connect_to_compute(
|
||||
config: &config::ProxyConfig,
|
||||
conn_info: &ConnInfo,
|
||||
conn_id: uuid::Uuid,
|
||||
session_id: uuid::Uuid,
|
||||
latency_timer: LatencyTimer,
|
||||
) -> anyhow::Result<ClientInner> {
|
||||
@@ -401,6 +430,7 @@ async fn connect_to_compute(
|
||||
|
||||
crate::proxy::connect_to_compute(
|
||||
&TokioMechanism {
|
||||
conn_id,
|
||||
conn_info,
|
||||
session_id,
|
||||
},
|
||||
@@ -416,6 +446,7 @@ async fn connect_to_compute_once(
|
||||
node_info: &console::CachedNodeInfo,
|
||||
conn_info: &ConnInfo,
|
||||
timeout: time::Duration,
|
||||
conn_id: uuid::Uuid,
|
||||
mut session: uuid::Uuid,
|
||||
) -> Result<ClientInner, tokio_postgres::Error> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
@@ -430,7 +461,6 @@ async fn connect_to_compute_once(
|
||||
|
||||
let (tx, mut rx) = tokio::sync::watch::channel(session);
|
||||
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
let span = info_span!(parent: None, "connection", %conn_id);
|
||||
span.in_scope(|| {
|
||||
info!(%conn_info, %session, "new connection");
|
||||
@@ -484,6 +514,7 @@ async fn connect_to_compute_once(
|
||||
inner: client,
|
||||
session: tx,
|
||||
ids,
|
||||
conn_id,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -491,6 +522,7 @@ struct ClientInner {
|
||||
inner: tokio_postgres::Client,
|
||||
session: tokio::sync::watch::Sender<uuid::Uuid>,
|
||||
ids: Ids,
|
||||
conn_id: uuid::Uuid,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
@@ -500,12 +532,14 @@ impl Client {
|
||||
}
|
||||
|
||||
pub struct Client {
|
||||
conn_id: uuid::Uuid,
|
||||
span: Span,
|
||||
inner: Option<ClientInner>,
|
||||
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
}
|
||||
|
||||
pub struct Discard<'a> {
|
||||
conn_id: uuid::Uuid,
|
||||
pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
}
|
||||
|
||||
@@ -514,6 +548,7 @@ impl Client {
|
||||
let Self {
|
||||
inner,
|
||||
pool,
|
||||
conn_id,
|
||||
span: _,
|
||||
} = self;
|
||||
(
|
||||
@@ -521,7 +556,10 @@ impl Client {
|
||||
.as_mut()
|
||||
.expect("client inner should not be removed")
|
||||
.inner,
|
||||
Discard { pool },
|
||||
Discard {
|
||||
pool,
|
||||
conn_id: *conn_id,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@@ -537,13 +575,13 @@ impl Discard<'_> {
|
||||
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
|
||||
if status != ReadyForQueryStatus::Idle {
|
||||
if let Some((conn_info, _)) = self.pool.take() {
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is not idle")
|
||||
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn discard(&mut self) {
|
||||
if let Some((conn_info, _)) = self.pool.take() {
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
|
||||
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
|
||||
}
|
||||
}
|
||||
}
|
||||
146
proxy/src/serverless/websocket.rs
Normal file
146
proxy/src/serverless/websocket.rs
Normal file
@@ -0,0 +1,146 @@
|
||||
use crate::{
|
||||
cancellation::CancelMap,
|
||||
config::ProxyConfig,
|
||||
error::io_error,
|
||||
proxy::{handle_client, ClientMode},
|
||||
};
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{Sink, Stream};
|
||||
use hyper::upgrade::Upgraded;
|
||||
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
|
||||
use pin_project_lite::pin_project;
|
||||
|
||||
use std::{
|
||||
pin::Pin,
|
||||
task::{ready, Context, Poll},
|
||||
};
|
||||
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
|
||||
use tracing::warn;
|
||||
|
||||
// TODO: use `std::sync::Exclusive` once it's stabilized.
|
||||
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
|
||||
use sync_wrapper::SyncWrapper;
|
||||
|
||||
pin_project! {
|
||||
/// This is a wrapper around a [`WebSocketStream`] that
|
||||
/// implements [`AsyncRead`] and [`AsyncWrite`].
|
||||
pub struct WebSocketRw {
|
||||
#[pin]
|
||||
stream: SyncWrapper<WebSocketStream<Upgraded>>,
|
||||
bytes: Bytes,
|
||||
}
|
||||
}
|
||||
|
||||
impl WebSocketRw {
|
||||
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
|
||||
Self {
|
||||
stream: stream.into(),
|
||||
bytes: Bytes::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncWrite for WebSocketRw {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<io::Result<usize>> {
|
||||
let mut stream = self.project().stream.get_pin_mut();
|
||||
|
||||
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
|
||||
match stream.as_mut().start_send(Message::Binary(buf.into())) {
|
||||
Ok(()) => Poll::Ready(Ok(buf.len())),
|
||||
Err(e) => Poll::Ready(Err(io_error(e))),
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_flush(cx).map_err(io_error)
|
||||
}
|
||||
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_close(cx).map_err(io_error)
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncRead for WebSocketRw {
|
||||
fn poll_read(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
if buf.remaining() > 0 {
|
||||
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
|
||||
let len = std::cmp::min(bytes.len(), buf.remaining());
|
||||
buf.put_slice(&bytes[..len]);
|
||||
self.consume(len);
|
||||
}
|
||||
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncBufRead for WebSocketRw {
|
||||
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
|
||||
// Please refer to poll_fill_buf's documentation.
|
||||
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
|
||||
|
||||
let mut this = self.project();
|
||||
loop {
|
||||
if !this.bytes.chunk().is_empty() {
|
||||
let chunk = (*this.bytes).chunk();
|
||||
return Poll::Ready(Ok(chunk));
|
||||
}
|
||||
|
||||
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
|
||||
match res.transpose().map_err(io_error)? {
|
||||
Some(message) => match message {
|
||||
Message::Ping(_) => {}
|
||||
Message::Pong(_) => {}
|
||||
Message::Text(text) => {
|
||||
// We expect to see only binary messages.
|
||||
let error = "unexpected text message in the websocket";
|
||||
warn!(length = text.len(), error);
|
||||
return Poll::Ready(Err(io_error(error)));
|
||||
}
|
||||
Message::Frame(_) => {
|
||||
// This case is impossible according to Frame's doc.
|
||||
panic!("unexpected raw frame in the websocket");
|
||||
}
|
||||
Message::Binary(chunk) => {
|
||||
assert!(this.bytes.is_empty());
|
||||
*this.bytes = Bytes::from(chunk);
|
||||
}
|
||||
Message::Close(_) => return EOF,
|
||||
},
|
||||
None => return EOF,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn consume(self: Pin<&mut Self>, amount: usize) {
|
||||
self.project().bytes.advance(amount);
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn serve_websocket(
|
||||
websocket: HyperWebsocket,
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
hostname: Option<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
handle_client(
|
||||
config,
|
||||
cancel_map,
|
||||
session_id,
|
||||
WebSocketRw::new(websocket),
|
||||
ClientMode::Websockets { hostname },
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -24,7 +24,7 @@ backoff = "^2.2.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "^2.2.3"
|
||||
Werkzeug = "^3.0.1"
|
||||
pytest-order = "^1.1.0"
|
||||
allure-pytest = "^2.13.2"
|
||||
pytest-asyncio = "^0.21.0"
|
||||
|
||||
@@ -15,28 +15,15 @@ The script fetches the durations of benchmarks from the database and stores it i
|
||||
|
||||
BENCHMARKS_DURATION_QUERY = """
|
||||
SELECT
|
||||
DISTINCT parent_suite, suite, test,
|
||||
PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
|
||||
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
|
||||
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
|
||||
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
|
||||
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
|
||||
(jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
|
||||
FROM
|
||||
regress_test_results
|
||||
WHERE
|
||||
reference = 'refs/heads/main'
|
||||
) data
|
||||
DISTINCT parent_suite, suite, name,
|
||||
PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration) as percentile_ms
|
||||
FROM results
|
||||
WHERE
|
||||
timestamp > CURRENT_DATE - INTERVAL '%s' day
|
||||
started_at > CURRENT_DATE - INTERVAL '%s' day
|
||||
AND parent_suite = 'test_runner.performance'
|
||||
AND status = 'passed'
|
||||
GROUP BY
|
||||
parent_suite, suite, test
|
||||
parent_suite, suite, name
|
||||
;
|
||||
"""
|
||||
|
||||
@@ -44,68 +31,69 @@ BENCHMARKS_DURATION_QUERY = """
|
||||
# the total duration varies from 8 to 40 minutes.
|
||||
# We use some pre-collected durations as a fallback to have a better distribution.
|
||||
FALLBACK_DURATION = {
|
||||
"test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0,
|
||||
"test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0,
|
||||
"test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0,
|
||||
"test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0,
|
||||
"test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0,
|
||||
"test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0,
|
||||
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0,
|
||||
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0,
|
||||
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0,
|
||||
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0,
|
||||
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0,
|
||||
"test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0,
|
||||
"test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0,
|
||||
"test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0,
|
||||
"test_runner/performance/test_compaction.py::test_compaction": 77.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0,
|
||||
"test_runner/performance/test_copy.py::test_copy[neon]": 12.0,
|
||||
"test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0,
|
||||
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0,
|
||||
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0,
|
||||
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0,
|
||||
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0,
|
||||
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0,
|
||||
"test_runner/performance/test_layer_map.py::test_layer_map": 44.0,
|
||||
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0,
|
||||
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0,
|
||||
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0,
|
||||
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0,
|
||||
"test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0,
|
||||
"test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0,
|
||||
"test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0,
|
||||
"test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0,
|
||||
"test_runner/performance/test_startup.py::test_startup_simple": 2.0,
|
||||
"test_runner/performance/test_startup.py::test_startup": 539.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0,
|
||||
"test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0,
|
||||
"test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0,
|
||||
"test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
|
||||
"test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
|
||||
"test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
|
||||
"test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
|
||||
"test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
|
||||
"test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
|
||||
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
|
||||
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
|
||||
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
|
||||
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
|
||||
"test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
|
||||
"test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
|
||||
"test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
|
||||
"test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
|
||||
"test_runner/performance/test_compaction.py::test_compaction": 110.222,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
|
||||
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
|
||||
"test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
|
||||
"test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
|
||||
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
|
||||
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
|
||||
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
|
||||
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
|
||||
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
|
||||
"test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
|
||||
"test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
|
||||
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
|
||||
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
|
||||
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
|
||||
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
|
||||
"test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
|
||||
"test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
|
||||
"test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
|
||||
"test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
|
||||
"test_runner/performance/test_startup.py::test_startup": 890.114,
|
||||
"test_runner/performance/test_startup.py::test_startup_simple": 2.51,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
|
||||
"test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
|
||||
"test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
|
||||
}
|
||||
|
||||
|
||||
@@ -130,7 +118,7 @@ def main(args: argparse.Namespace):
|
||||
res = FALLBACK_DURATION
|
||||
|
||||
for row in rows:
|
||||
pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}"
|
||||
pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['name']}"
|
||||
duration = row["percentile_ms"] / 1000
|
||||
logging.info(f"\t{pytest_name}: {duration}")
|
||||
res[pytest_name] = duration
|
||||
|
||||
@@ -9,28 +9,15 @@ from typing import DefaultDict, Dict
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
|
||||
FLAKY_TESTS_QUERY = """
|
||||
SELECT
|
||||
DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
reference,
|
||||
jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
|
||||
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
|
||||
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
|
||||
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
|
||||
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
|
||||
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
|
||||
FROM
|
||||
regress_test_results
|
||||
) data
|
||||
DISTINCT parent_suite, suite, name
|
||||
FROM results
|
||||
WHERE
|
||||
timestamp > CURRENT_DATE - INTERVAL '%s' day
|
||||
started_at > CURRENT_DATE - INTERVAL '%s' day
|
||||
AND (
|
||||
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
|
||||
OR retries_status_change::boolean
|
||||
OR flaky
|
||||
)
|
||||
;
|
||||
"""
|
||||
@@ -63,12 +50,14 @@ def main(args: argparse.Namespace):
|
||||
if row["parent_suite"] != "test_runner.regress":
|
||||
continue
|
||||
|
||||
deparametrized_test = row["deparametrized_test"]
|
||||
dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
|
||||
parametrized_test = deparametrized_test.replace(
|
||||
"[",
|
||||
f"[{build_type}-pg{pg_version}{dash_if_needed}",
|
||||
)
|
||||
if row["name"].endswith("]"):
|
||||
parametrized_test = row["name"].replace(
|
||||
"[",
|
||||
f"[{build_type}-pg{pg_version}-",
|
||||
)
|
||||
else:
|
||||
parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]"
|
||||
|
||||
res[row["parent_suite"]][row["suite"]][parametrized_test] = True
|
||||
|
||||
logging.info(
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::task::{Context, Poll};
|
||||
use std::time::Duration;
|
||||
use tonic::codegen::StdError;
|
||||
use tonic::transport::{ClientTlsConfig, Endpoint};
|
||||
use tonic::{transport::Channel, Code, Status};
|
||||
use tonic::{transport::Channel, Status};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use proto::{
|
||||
@@ -23,6 +23,7 @@ pub mod proto {
|
||||
pub mod metrics;
|
||||
|
||||
// Re-exports to avoid direct tonic dependency in user crates.
|
||||
pub use tonic::Code;
|
||||
pub use tonic::Request;
|
||||
pub use tonic::Streaming;
|
||||
|
||||
|
||||
@@ -22,6 +22,11 @@ https://docs.pytest.org/en/6.2.x/logging.html
|
||||
# log format is specified in pytest.ini file
|
||||
LOGGING = {
|
||||
"version": 1,
|
||||
"formatters": {
|
||||
"standard": {
|
||||
"datefmt": "%m/%d/%Y %I:%M:%SZ %p %Z",
|
||||
}
|
||||
},
|
||||
"loggers": {
|
||||
"root": {"level": "INFO"},
|
||||
"root.safekeeper_async": {"level": "INFO"}, # a lot of logs on DEBUG level
|
||||
|
||||
@@ -1616,7 +1616,7 @@ class NeonPageserver(PgProtocol):
|
||||
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
|
||||
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
|
||||
".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline or pageserver is shutting down", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
|
||||
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
|
||||
".*task iteration took longer than the configured period.*",
|
||||
@@ -1707,7 +1707,7 @@ class NeonPageserver(PgProtocol):
|
||||
|
||||
@property
|
||||
def workdir(self) -> Path:
|
||||
return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}"))
|
||||
return self.env.repo_dir / f"pageserver_{self.id}"
|
||||
|
||||
def assert_no_errors(self):
|
||||
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
|
||||
@@ -1719,6 +1719,11 @@ class NeonPageserver(PgProtocol):
|
||||
break
|
||||
|
||||
if error_or_warn.search(line):
|
||||
# Is this a torn log line? This happens when force-killing a process and restarting
|
||||
# Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
|
||||
if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
|
||||
continue
|
||||
|
||||
# It's an ERROR or WARN. Is it in the allow-list?
|
||||
for a in self.allowed_errors:
|
||||
if re.match(a, line):
|
||||
@@ -1768,6 +1773,16 @@ class NeonPageserver(PgProtocol):
|
||||
client = self.http_client()
|
||||
return client.tenant_attach(tenant_id, config, config_null, generation=generation)
|
||||
|
||||
def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
|
||||
# This API is only for use when generations are enabled
|
||||
assert self.env.attachment_service is not None
|
||||
|
||||
if config["mode"].startswith("Attached") and "generation" not in config:
|
||||
config["generation"] = self.env.attachment_service.attach_hook(tenant_id, self.id)
|
||||
|
||||
client = self.http_client()
|
||||
return client.tenant_location_conf(tenant_id, config, **kwargs)
|
||||
|
||||
|
||||
def append_pageserver_param_overrides(
|
||||
params_to_update: List[str],
|
||||
@@ -2624,6 +2639,7 @@ class EndpointFactory:
|
||||
lsn: Optional[Lsn] = None,
|
||||
hot_standby: bool = False,
|
||||
config_lines: Optional[List[str]] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
) -> Endpoint:
|
||||
ep = Endpoint(
|
||||
self.env,
|
||||
@@ -2643,6 +2659,7 @@ class EndpointFactory:
|
||||
lsn=lsn,
|
||||
hot_standby=hot_standby,
|
||||
config_lines=config_lines,
|
||||
pageserver_id=pageserver_id,
|
||||
)
|
||||
|
||||
def stop_all(self) -> "EndpointFactory":
|
||||
|
||||
@@ -247,6 +247,23 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_location_conf(
|
||||
self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None
|
||||
):
|
||||
body = location_conf.copy()
|
||||
body["tenant_id"] = str(tenant_id)
|
||||
|
||||
params = {}
|
||||
if flush_ms is not None:
|
||||
params["flush_ms"] = str(flush_ms)
|
||||
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
|
||||
json=body,
|
||||
params=params,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_delete(self, tenant_id: TenantId):
|
||||
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
@@ -441,13 +458,13 @@ class PageserverHttpClient(requests.Session):
|
||||
assert res_json is None
|
||||
|
||||
def timeline_get_lsn_by_timestamp(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int
|
||||
):
|
||||
log.info(
|
||||
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
|
||||
)
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
@@ -650,6 +667,14 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
|
||||
self.verbose_error(res)
|
||||
|
||||
def secondary_tenant_upload(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/secondary/{tenant_id}/upload")
|
||||
self.verbose_error(res)
|
||||
|
||||
def secondary_tenant_download(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/secondary/{tenant_id}/download")
|
||||
self.verbose_error(res)
|
||||
|
||||
def post_tracing_event(self, level: str, message: str):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tracing/event",
|
||||
|
||||
@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
|
||||
# this has been seen in the wild by tests with the below contradicting logging
|
||||
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
|
||||
# this seems like a mock_s3 issue
|
||||
log.warn(
|
||||
log.warning(
|
||||
f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
|
||||
)
|
||||
keys = 0
|
||||
@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
|
||||
# this has been seen in one case with mock_s3:
|
||||
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
|
||||
# looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
|
||||
log.warn(
|
||||
log.warning(
|
||||
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
|
||||
)
|
||||
|
||||
|
||||
133
test_runner/fixtures/workload.py
Normal file
133
test_runner/fixtures/workload.py
Normal file
@@ -0,0 +1,133 @@
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
Endpoint,
|
||||
NeonEnv,
|
||||
last_flush_lsn_upload,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
|
||||
|
||||
class Workload:
|
||||
"""
|
||||
This is not a general purpose load generator: it exists for storage tests that need to inject some
|
||||
high level types of storage work via the postgres interface:
|
||||
- layer writes (`write_rows`)
|
||||
- work for compaction (`churn_rows`)
|
||||
- reads, checking we get the right data (`validate`)
|
||||
"""
|
||||
|
||||
def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
self.env = env
|
||||
self.tenant_id = tenant_id
|
||||
self.timeline_id = timeline_id
|
||||
self.table = "foo"
|
||||
|
||||
self.expect_rows = 0
|
||||
self.churn_cursor = 0
|
||||
|
||||
self.endpoints: dict[int, Endpoint] = {}
|
||||
|
||||
def endpoint(self, pageserver_id: int):
|
||||
if pageserver_id not in self.endpoints:
|
||||
self.endpoints[pageserver_id] = self.env.endpoints.create(
|
||||
"main",
|
||||
tenant_id=self.tenant_id,
|
||||
pageserver_id=pageserver_id,
|
||||
endpoint_id=f"ep-{pageserver_id}",
|
||||
)
|
||||
|
||||
endpoint = self.endpoints[pageserver_id]
|
||||
assert not endpoint.running
|
||||
endpoint.start(pageserver_id=pageserver_id)
|
||||
return endpoint
|
||||
|
||||
def init(self, pageserver_id: int):
|
||||
with self.endpoint(pageserver_id) as endpoint:
|
||||
endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
|
||||
endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
|
||||
last_flush_lsn_upload(
|
||||
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
|
||||
def write_rows(self, n, pageserver_id):
|
||||
with self.endpoint(pageserver_id) as endpoint:
|
||||
start = self.expect_rows
|
||||
end = start + n - 1
|
||||
self.expect_rows += n
|
||||
dummy_value = "blah"
|
||||
endpoint.safe_psql(
|
||||
f"""
|
||||
INSERT INTO {self.table} (id, val)
|
||||
SELECT g, '{dummy_value}'
|
||||
FROM generate_series({start}, {end}) g
|
||||
"""
|
||||
)
|
||||
|
||||
return last_flush_lsn_upload(
|
||||
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
|
||||
def churn_rows(self, n, pageserver_id, upload=True):
|
||||
assert self.expect_rows >= n
|
||||
|
||||
max_iters = 10
|
||||
with self.endpoint(pageserver_id) as endpoint:
|
||||
todo = n
|
||||
i = 0
|
||||
while todo > 0:
|
||||
i += 1
|
||||
if i > max_iters:
|
||||
raise RuntimeError("oops")
|
||||
start = self.churn_cursor % self.expect_rows
|
||||
n_iter = min((self.expect_rows - start), todo)
|
||||
todo -= n_iter
|
||||
|
||||
end = start + n_iter - 1
|
||||
|
||||
log.info(
|
||||
f"start,end = {start},{end}, cursor={self.churn_cursor}, expect_rows={self.expect_rows}"
|
||||
)
|
||||
|
||||
assert end < self.expect_rows
|
||||
|
||||
self.churn_cursor += n_iter
|
||||
dummy_value = "blah"
|
||||
endpoint.safe_psql_many(
|
||||
[
|
||||
f"""
|
||||
INSERT INTO {self.table} (id, val)
|
||||
SELECT g, '{dummy_value}'
|
||||
FROM generate_series({start}, {end}) g
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET val = EXCLUDED.val
|
||||
""",
|
||||
f"VACUUM {self.table}",
|
||||
]
|
||||
)
|
||||
|
||||
last_flush_lsn = wait_for_last_flush_lsn(
|
||||
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
ps_http = self.env.get_pageserver(pageserver_id).http_client()
|
||||
wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
|
||||
|
||||
if upload:
|
||||
# force a checkpoint to trigger upload
|
||||
ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
|
||||
wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
|
||||
|
||||
def validate(self, pageserver_id):
|
||||
with self.endpoint(pageserver_id) as endpoint:
|
||||
result = endpoint.safe_psql_many(
|
||||
[
|
||||
"select clear_buffer_cache()",
|
||||
f"""
|
||||
SELECT COUNT(*) FROM {self.table}
|
||||
""",
|
||||
]
|
||||
)
|
||||
|
||||
log.info(f"validate({self.expect_rows}): {result}")
|
||||
assert result == [[("",)], [(self.expect_rows,)]]
|
||||
@@ -4,8 +4,10 @@ First make a release build. The `-s` flag silences a lot of output, and makes it
|
||||
easier to see if you have compile errors without scrolling up.
|
||||
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8`
|
||||
|
||||
You may also need to run `./scripts/pysync`.
|
||||
|
||||
Then run the tests
|
||||
`NEON_BIN=./target/release poetry run pytest test_runner/performance"`
|
||||
`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance`
|
||||
|
||||
Some handy pytest flags for local development:
|
||||
- `-x` tells pytest to stop on first error
|
||||
|
||||
@@ -19,7 +19,7 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
|
||||
|
||||
# eviction might be the first one after an attach to access the layers
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction"
|
||||
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
|
||||
)
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
return env
|
||||
|
||||
@@ -20,7 +20,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*Failed to load delta layer.*",
|
||||
".*layer loading failed:.*",
|
||||
".*could not find data for key.*",
|
||||
".*is not active. Current state: Broken.*",
|
||||
".*will not become active. Current state: Broken.*",
|
||||
@@ -87,7 +87,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
# Second timeline will fail during basebackup, because the local layer file is corrupt.
|
||||
# It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
|
||||
# (We don't check layer file contents on startup, when loading the timeline)
|
||||
with pytest.raises(Exception, match="Failed to load delta layer") as err:
|
||||
with pytest.raises(Exception, match="layer loading failed:") as err:
|
||||
pg2.start()
|
||||
log.info(
|
||||
f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
|
||||
|
||||
@@ -247,34 +247,34 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
|
||||
ps_http.evict_all_layers(tenant_id, timeline_id)
|
||||
|
||||
def ensure_resident_and_remote_size_metrics():
|
||||
log.info("ensure that all the layers are gone")
|
||||
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
|
||||
# we have disabled all background loops, so, this should hold
|
||||
assert len(resident_layers) == 0
|
||||
assert len(resident_layers) == 0, "ensure that all the layers are gone"
|
||||
|
||||
info = ps_http.layer_map_info(tenant_id, timeline_id)
|
||||
log.info("layer map dump: %s", info)
|
||||
|
||||
log.info("ensure that resident_physical_size metric is zero")
|
||||
resident_physical_size_metric = ps_http.get_timeline_metric(
|
||||
tenant_id, timeline_id, "pageserver_resident_physical_size"
|
||||
)
|
||||
assert resident_physical_size_metric == 0
|
||||
log.info("ensure that resident_physical_size metric corresponds to layer map dump")
|
||||
assert (
|
||||
resident_physical_size_metric == 0
|
||||
), "ensure that resident_physical_size metric is zero"
|
||||
assert resident_physical_size_metric == sum(
|
||||
[layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote]
|
||||
)
|
||||
layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
|
||||
), "ensure that resident_physical_size metric corresponds to layer map dump"
|
||||
|
||||
log.info("ensure that remote_physical_size metric matches layer map")
|
||||
remote_physical_size_metric = ps_http.get_timeline_metric(
|
||||
tenant_id, timeline_id, "pageserver_remote_physical_size"
|
||||
)
|
||||
log.info("ensure that remote_physical_size metric corresponds to layer map dump")
|
||||
assert remote_physical_size_metric == sum(
|
||||
layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
|
||||
)
|
||||
), "ensure that remote_physical_size metric corresponds to layer map dump"
|
||||
|
||||
log.info("before runnning GC, ensure that remote_physical size is zero")
|
||||
# leaving index_part.json upload from successful compaction out will show
|
||||
# up here as a mismatch between remove_physical_size and summed up layermap
|
||||
# size
|
||||
ensure_resident_and_remote_size_metrics()
|
||||
|
||||
log.info("run GC")
|
||||
|
||||
@@ -8,6 +8,71 @@ from fixtures.types import Lsn
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
#
|
||||
# Test pageserver get_lsn_by_timestamp API
|
||||
#
|
||||
def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
|
||||
endpoint_main = env.endpoints.create_start("test_lsn_mapping")
|
||||
log.info("postgres is running on 'test_lsn_mapping' branch")
|
||||
|
||||
cur = endpoint_main.connect().cursor()
|
||||
# Create table, and insert rows, each in a separate transaction
|
||||
# Disable synchronous_commit to make this initialization go faster.
|
||||
#
|
||||
# Each row contains current insert LSN and the current timestamp, when
|
||||
# the row was inserted.
|
||||
cur.execute("SET synchronous_commit=off")
|
||||
cur.execute("CREATE TABLE foo (x integer)")
|
||||
tbl = []
|
||||
for i in range(1000):
|
||||
cur.execute("INSERT INTO foo VALUES(%s)", (i,))
|
||||
# Get the timestamp at UTC
|
||||
after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
|
||||
tbl.append([i, after_timestamp])
|
||||
|
||||
# Execute one more transaction with synchronous_commit enabled, to flush
|
||||
# all the previous transactions
|
||||
cur.execute("SET synchronous_commit=on")
|
||||
cur.execute("INSERT INTO foo VALUES (-1)")
|
||||
|
||||
# Wait until WAL is received by pageserver
|
||||
wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
|
||||
|
||||
with env.pageserver.http_client() as client:
|
||||
# Check edge cases: timestamp in the future
|
||||
probe_timestamp = tbl[-1][1] + timedelta(hours=1)
|
||||
result = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
|
||||
)
|
||||
assert result == "future"
|
||||
|
||||
# timestamp too the far history
|
||||
probe_timestamp = tbl[0][1] - timedelta(hours=10)
|
||||
result = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
|
||||
)
|
||||
assert result == "past"
|
||||
|
||||
# Probe a bunch of timestamps in the valid range
|
||||
for i in range(1, len(tbl), 100):
|
||||
probe_timestamp = tbl[i][1]
|
||||
lsn = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
|
||||
)
|
||||
# Call get_lsn_by_timestamp to get the LSN
|
||||
# Launch a new read-only node at that LSN, and check that only the rows
|
||||
# that were supposed to be committed at that point in time are visible.
|
||||
endpoint_here = env.endpoints.create_start(
|
||||
branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
|
||||
)
|
||||
assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
|
||||
|
||||
endpoint_here.stop_and_destroy()
|
||||
|
||||
|
||||
#
|
||||
# Test pageserver get_lsn_by_timestamp API
|
||||
#
|
||||
@@ -45,23 +110,24 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
|
||||
# Check edge cases: timestamp in the future
|
||||
probe_timestamp = tbl[-1][1] + timedelta(hours=1)
|
||||
result = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
|
||||
)
|
||||
assert result == "future"
|
||||
assert result["kind"] == "future"
|
||||
|
||||
# timestamp too the far history
|
||||
probe_timestamp = tbl[0][1] - timedelta(hours=10)
|
||||
result = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
|
||||
)
|
||||
assert result == "past"
|
||||
assert result["kind"] == "past"
|
||||
|
||||
# Probe a bunch of timestamps in the valid range
|
||||
for i in range(1, len(tbl), 100):
|
||||
probe_timestamp = tbl[i][1]
|
||||
lsn = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
|
||||
result = client.timeline_get_lsn_by_timestamp(
|
||||
env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
|
||||
)
|
||||
lsn = result["lsn"]
|
||||
# Call get_lsn_by_timestamp to get the LSN
|
||||
# Launch a new read-only node at that LSN, and check that only the rows
|
||||
# that were supposed to be committed at that point in time are visible.
|
||||
|
||||
@@ -12,13 +12,12 @@ from fixtures.neon_fixtures import (
|
||||
last_flush_lsn_upload,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_tenant_state,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
wait_for_upload_queue_empty,
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
|
||||
from fixtures.types import Lsn
|
||||
@@ -384,7 +383,7 @@ def test_download_remote_layers_api(
|
||||
env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"})
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint",
|
||||
".*download failed: downloading evicted layer file failed.*",
|
||||
f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
|
||||
]
|
||||
)
|
||||
@@ -637,56 +636,5 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
|
||||
assert dict(kinds_after) == {"Delta": 4, "Image": 1}
|
||||
|
||||
|
||||
def test_ondemand_download_failure_to_replace(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
|
||||
|
||||
See: https://github.com/neondatabase/neon/issues/3533
|
||||
"""
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
# disable gc and compaction via default tenant config because config is lost while detaching
|
||||
# so that compaction will not be the one to download the layer but the http handler is
|
||||
neon_env_builder.pageserver_config_override = (
|
||||
"""tenant_config={gc_period = "0s", compaction_period = "0s"}"""
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
assert timeline_id is not None
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
# remove layers so that they will be redownloaded
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
|
||||
pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
|
||||
|
||||
# requesting details with non-incremental size should trigger a download of the only layer
|
||||
# this will need to be adjusted if an index for logical sizes is ever implemented
|
||||
with pytest.raises(PageserverApiException):
|
||||
# PageserverApiException is expected because of the failpoint (timeline_detail building does something)
|
||||
# ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping
|
||||
# but should it be added back, we would wait for 15s here.
|
||||
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)
|
||||
|
||||
actual_message = ".* ERROR .*layermap-replace-notfound"
|
||||
assert env.pageserver.log_contains(actual_message) is not None
|
||||
env.pageserver.allowed_errors.append(actual_message)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
|
||||
)
|
||||
# this might get to run and attempt on-demand, but not always
|
||||
env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
|
||||
|
||||
# if the above returned, then we didn't have a livelock, and all is well
|
||||
|
||||
|
||||
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
|
||||
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
|
||||
|
||||
@@ -24,12 +24,19 @@ from fixtures.neon_fixtures import (
|
||||
last_flush_lsn_upload,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.utils import list_prefix
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_tenant_state,
|
||||
list_prefix,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.remote_storage import (
|
||||
RemoteStorageKind,
|
||||
)
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.utils import print_gc_result, wait_until
|
||||
from fixtures.workload import Workload
|
||||
|
||||
# A tenant configuration that is convenient for generating uploads and deletions
|
||||
# without a large amount of postgres traffic.
|
||||
@@ -486,16 +493,20 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
|
||||
|
||||
def evict_all_layers(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
initial_local_layers = sorted(
|
||||
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
|
||||
)
|
||||
client = env.pageserver.http_client()
|
||||
for layer in initial_local_layers:
|
||||
if "ephemeral" in layer.name or "temp" in layer.name:
|
||||
|
||||
layer_map = client.layer_map_info(tenant_id, timeline_id)
|
||||
|
||||
for layer in layer_map.historic_layers:
|
||||
if layer.remote:
|
||||
log.info(
|
||||
f"Skipping trying to evict remote layer {tenant_id}/{timeline_id} {layer.layer_file_name}"
|
||||
)
|
||||
continue
|
||||
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
|
||||
client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
|
||||
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.layer_file_name}")
|
||||
client.evict_layer(
|
||||
tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.layer_file_name
|
||||
)
|
||||
|
||||
|
||||
def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -528,3 +539,91 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
|
||||
read_all(env, tenant_id, timeline_id)
|
||||
evict_all_layers(env, tenant_id, timeline_id)
|
||||
read_all(env, tenant_id, timeline_id)
|
||||
|
||||
|
||||
def test_multi_attach(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.num_pageservers = 3
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
|
||||
pageservers = env.pageservers
|
||||
http_clients = list([p.http_client() for p in pageservers])
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# We will intentionally create situations where stale deletions happen from non-latest-generation
|
||||
# nodes when the tenant is multiply-attached
|
||||
for ps in env.pageservers:
|
||||
ps.allowed_errors.extend(
|
||||
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
|
||||
)
|
||||
|
||||
# Initially, the tenant will be attached to the pageserver a (first is default in our test harness)
|
||||
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
|
||||
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
|
||||
with pytest.raises(PageserverApiException):
|
||||
http_clients[1].timeline_detail(tenant_id, timeline_id)
|
||||
with pytest.raises(PageserverApiException):
|
||||
http_clients[2].timeline_detail(tenant_id, timeline_id)
|
||||
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init(pageservers[0].id)
|
||||
workload.write_rows(1000, pageservers[0].id)
|
||||
|
||||
# Attach the tenant to the other two pageservers
|
||||
pageservers[1].tenant_attach(env.initial_tenant)
|
||||
pageservers[2].tenant_attach(env.initial_tenant)
|
||||
|
||||
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active"))
|
||||
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active"))
|
||||
|
||||
# Now they all have it attached
|
||||
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
|
||||
_detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
|
||||
_detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
|
||||
|
||||
# The endpoint can use any pageserver to service its reads
|
||||
for pageserver in pageservers:
|
||||
workload.validate(pageserver.id)
|
||||
|
||||
# If we write some more data, all the nodes can see it, including stale ones
|
||||
wrote_lsn = workload.write_rows(1000, pageservers[0].id)
|
||||
for ps_http in http_clients:
|
||||
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, wrote_lsn)
|
||||
|
||||
# ...and indeed endpoints can see it via any of the pageservers
|
||||
for pageserver in pageservers:
|
||||
workload.validate(pageserver.id)
|
||||
|
||||
# Prompt all the pageservers, including stale ones, to upload ingested layers to remote storage
|
||||
for ps_http in http_clients:
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(ps_http, tenant_id, timeline_id, wrote_lsn)
|
||||
|
||||
# Now, the contents of remote storage will be a set of layers from each pageserver, but with unique
|
||||
# generation numbers
|
||||
# TODO: validate remote storage contents
|
||||
|
||||
# Stop all pageservers
|
||||
for ps in pageservers:
|
||||
ps.stop()
|
||||
|
||||
# Returning to a normal healthy state: all pageservers will start, but only the one most
|
||||
# recently attached via the control plane will re-attach on startup
|
||||
for ps in pageservers:
|
||||
ps.start()
|
||||
|
||||
with pytest.raises(PageserverApiException):
|
||||
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
|
||||
with pytest.raises(PageserverApiException):
|
||||
_detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
|
||||
_detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
|
||||
|
||||
# All data we wrote while multi-attached remains readable
|
||||
workload.validate(pageservers[2].id)
|
||||
|
||||
468
test_runner/regress/test_pageserver_secondary.py
Normal file
468
test_runner/regress/test_pageserver_secondary.py
Normal file
@@ -0,0 +1,468 @@
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
|
||||
from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.workload import Workload
|
||||
|
||||
# A tenant configuration that is convenient for generating uploads and deletions
|
||||
# without a large amount of postgres traffic.
|
||||
TENANT_CONF = {
|
||||
# small checkpointing and compaction targets to ensure we generate many upload operations
|
||||
"checkpoint_distance": f"{128 * 1024}",
|
||||
"compaction_target_size": f"{128 * 1024}",
|
||||
"compaction_threshold": "1",
|
||||
# no PITR horizon, we specify the horizon when we request on-demand GC
|
||||
"pitr_interval": "0s",
|
||||
# disable background compaction and GC. We invoke it manually when we want it to happen.
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
# create image layers eagerly, so that GC can remove some layers
|
||||
"image_creation_threshold": "1",
|
||||
}
|
||||
|
||||
|
||||
def evict_random_layers(
|
||||
rng: random.Random, pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
"""
|
||||
Evict 50% of the layers on a pageserver
|
||||
"""
|
||||
timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
initial_local_layers = sorted(
|
||||
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
|
||||
)
|
||||
client = pageserver.http_client()
|
||||
for layer in initial_local_layers:
|
||||
if "ephemeral" in layer.name:
|
||||
continue
|
||||
|
||||
if rng.choice([True, False]):
|
||||
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
|
||||
client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", [1, 2, 3])
|
||||
def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
|
||||
"""
|
||||
Issue many location configuration changes, ensure that tenants
|
||||
remain readable & we don't get any unexpected errors. We should
|
||||
have no ERROR in the log, and no 500s in the API.
|
||||
|
||||
The location_config API is intentionally designed so that all destination
|
||||
states are valid, so that we may test it in this way: the API should always
|
||||
work as long as the tenant exists.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.num_pageservers = 3
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
assert env.attachment_service is not None
|
||||
|
||||
pageservers = env.pageservers
|
||||
list([p.http_client() for p in pageservers])
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# We will make no effort to avoid stale attachments
|
||||
for ps in env.pageservers:
|
||||
ps.allowed_errors.extend(
|
||||
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
|
||||
)
|
||||
|
||||
# these can happen, if we shutdown at a good time. to be fixed as part of #5172.
|
||||
message = ".*duplicated L1 layer layer=.*"
|
||||
ps.allowed_errors.append(message)
|
||||
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init(env.pageservers[0].id)
|
||||
workload.write_rows(256, env.pageservers[0].id)
|
||||
|
||||
# We use a fixed seed to make the test reproducible: we want a randomly
|
||||
# chosen order, but not to change the order every time we run the test.
|
||||
rng = random.Random(seed)
|
||||
|
||||
initial_generation = 1
|
||||
last_state = {
|
||||
env.pageservers[0].id: ("AttachedSingle", initial_generation),
|
||||
env.pageservers[1].id: ("Detached", None),
|
||||
env.pageservers[2].id: ("Detached", None),
|
||||
}
|
||||
|
||||
latest_attached = env.pageservers[0].id
|
||||
|
||||
for _i in range(0, 64):
|
||||
# Pick a pageserver
|
||||
pageserver = rng.choice(env.pageservers)
|
||||
|
||||
# Pick a pseudorandom state
|
||||
modes = [
|
||||
"AttachedSingle",
|
||||
"AttachedMulti",
|
||||
"AttachedStale",
|
||||
"Secondary",
|
||||
"Detached",
|
||||
"_Evictions",
|
||||
"_Restart",
|
||||
]
|
||||
|
||||
mode = rng.choice(modes)
|
||||
|
||||
last_state_ps = last_state[pageserver.id]
|
||||
if mode == "_Evictions":
|
||||
if last_state_ps[0].startswith("Attached"):
|
||||
log.info(f"Action: evictions on pageserver {pageserver.id}")
|
||||
evict_random_layers(rng, pageserver, tenant_id, timeline_id)
|
||||
else:
|
||||
log.info(
|
||||
f"Action: skipping evictions on pageserver {pageserver.id}, is not attached"
|
||||
)
|
||||
elif mode == "_Restart":
|
||||
log.info(f"Action: restarting pageserver {pageserver.id}")
|
||||
pageserver.stop()
|
||||
pageserver.start()
|
||||
if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id:
|
||||
log.info("Entering postgres...")
|
||||
workload.churn_rows(rng.randint(128, 256), pageserver.id)
|
||||
workload.validate(pageserver.id)
|
||||
elif last_state_ps[0].startswith("Attached"):
|
||||
# The `attachment_service` will only re-attach on startup when a pageserver was the
|
||||
# holder of the latest generation: otherwise the pageserver will revert to detached
|
||||
# state if it was running attached with a stale generation
|
||||
last_state[pageserver.id] = ("Detached", None)
|
||||
else:
|
||||
secondary_conf: Optional[Dict[str, Any]] = None
|
||||
if mode == "Secondary":
|
||||
secondary_conf = {"warm": rng.choice([True, False])}
|
||||
|
||||
location_conf: Dict[str, Any] = {
|
||||
"mode": mode,
|
||||
"secondary_conf": secondary_conf,
|
||||
"tenant_conf": {},
|
||||
}
|
||||
|
||||
log.info(f"Action: Configuring pageserver {pageserver.id} to {location_conf}")
|
||||
|
||||
# Select a generation number
|
||||
if mode.startswith("Attached"):
|
||||
if last_state_ps[1] is not None:
|
||||
if rng.choice([True, False]):
|
||||
# Move between attached states, staying in the same generation
|
||||
generation = last_state_ps[1]
|
||||
else:
|
||||
# Switch generations, while also jumping between attached states
|
||||
generation = env.attachment_service.attach_hook(tenant_id, pageserver.id)
|
||||
latest_attached = pageserver.id
|
||||
else:
|
||||
generation = env.attachment_service.attach_hook(tenant_id, pageserver.id)
|
||||
latest_attached = pageserver.id
|
||||
else:
|
||||
generation = None
|
||||
|
||||
location_conf["generation"] = generation
|
||||
|
||||
pageserver.tenant_location_configure(tenant_id, location_conf)
|
||||
last_state[pageserver.id] = (mode, generation)
|
||||
|
||||
if mode.startswith("Attached"):
|
||||
# TODO: a variant of this test that runs background endpoint workloads, as well as
|
||||
# the inter-step workloads.
|
||||
|
||||
workload.churn_rows(
|
||||
rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale"
|
||||
)
|
||||
workload.validate(pageserver.id)
|
||||
|
||||
# Attach all pageservers
|
||||
for ps in env.pageservers:
|
||||
location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}}
|
||||
ps.tenant_location_configure(tenant_id, location_conf)
|
||||
|
||||
# Confirm that all are readable
|
||||
for ps in env.pageservers:
|
||||
workload.validate(ps.id)
|
||||
|
||||
# Detach all pageservers
|
||||
for ps in env.pageservers:
|
||||
location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}}
|
||||
ps.tenant_location_configure(tenant_id, location_conf)
|
||||
|
||||
# Confirm that all local disk state was removed on detach
|
||||
# TODO
|
||||
|
||||
|
||||
def test_live_migration(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test the sequence of location states that are used in a live migration.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.num_pageservers = 2
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
assert env.attachment_service is not None
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
pageserver_a = env.pageservers[0]
|
||||
pageserver_b = env.pageservers[1]
|
||||
|
||||
initial_generation = 1
|
||||
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init(env.pageservers[0].id)
|
||||
workload.write_rows(256, env.pageservers[0].id)
|
||||
|
||||
# Make the destination a secondary location
|
||||
pageserver_b.tenant_location_configure(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Secondary",
|
||||
"secondary_conf": {"warm": True},
|
||||
"tenant_conf": {},
|
||||
},
|
||||
)
|
||||
|
||||
workload.churn_rows(64, pageserver_a.id, upload=False)
|
||||
|
||||
# Set origin attachment to stale
|
||||
log.info("Setting origin to AttachedStale")
|
||||
pageserver_a.tenant_location_configure(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "AttachedStale",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": initial_generation,
|
||||
},
|
||||
flush_ms=5000,
|
||||
)
|
||||
|
||||
migrated_generation = env.attachment_service.attach_hook(tenant_id, pageserver_b.id)
|
||||
log.info(f"Acquired generation {migrated_generation} for destination pageserver")
|
||||
assert migrated_generation == initial_generation + 1
|
||||
|
||||
# Writes and reads still work in AttachedStale.
|
||||
workload.validate(pageserver_a.id)
|
||||
workload.validate(pageserver_a.id)
|
||||
|
||||
# Ensure that secondary location's timeline directory is populated: we will then
|
||||
# do some more writes on top of that to ensure that the newly attached pageserver
|
||||
# properly makes use of the downloaded layers as well as ingesting WAL to catch up.
|
||||
pageserver_a.http_client().secondary_tenant_upload(tenant_id)
|
||||
pageserver_b.http_client().secondary_tenant_download(tenant_id)
|
||||
|
||||
# Generate some more dirty writes
|
||||
workload.churn_rows(64, pageserver_a.id)
|
||||
|
||||
# Attach the destination
|
||||
log.info("Setting destination to AttachedMulti")
|
||||
pageserver_b.tenant_location_configure(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "AttachedMulti",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": migrated_generation,
|
||||
},
|
||||
)
|
||||
|
||||
# Wait for destination LSN to catch up with origin
|
||||
origin_lsn = pageserver_a.http_client().timeline_detail(tenant_id, timeline_id)[
|
||||
"last_record_lsn"
|
||||
]
|
||||
|
||||
def caught_up():
|
||||
destination_lsn = pageserver_b.http_client().timeline_detail(tenant_id, timeline_id)[
|
||||
"last_record_lsn"
|
||||
]
|
||||
log.info(
|
||||
f"Waiting for LSN to catch up: origin {origin_lsn} vs destination {destination_lsn}"
|
||||
)
|
||||
assert destination_lsn >= origin_lsn
|
||||
|
||||
wait_until(100, 0.1, caught_up)
|
||||
|
||||
# The destination should accept writes
|
||||
workload.churn_rows(64, pageserver_b.id)
|
||||
|
||||
# Dual attached: both are readable.
|
||||
workload.validate(pageserver_a.id)
|
||||
workload.validate(pageserver_b.id)
|
||||
|
||||
# Revert the origin to secondary
|
||||
log.info("Setting origin to Secondary")
|
||||
pageserver_a.tenant_location_configure(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Secondary",
|
||||
"secondary_conf": {"warm": True},
|
||||
"tenant_conf": {},
|
||||
},
|
||||
)
|
||||
|
||||
workload.churn_rows(64, pageserver_b.id)
|
||||
|
||||
# Put the destination into final state
|
||||
pageserver_b.tenant_location_configure(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "AttachedSingle",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": migrated_generation,
|
||||
},
|
||||
)
|
||||
|
||||
workload.churn_rows(64, pageserver_b.id)
|
||||
workload.validate(pageserver_b.id)
|
||||
|
||||
|
||||
def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
|
||||
"""
|
||||
Inspect local storage on a pageserver to discover which layer files are present.
|
||||
|
||||
:return: list of relative paths to layers, from the timeline root.
|
||||
"""
|
||||
timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
|
||||
def relative(p: Path) -> Path:
|
||||
return p.relative_to(timeline_path)
|
||||
|
||||
return sorted(
|
||||
list(
|
||||
map(
|
||||
relative,
|
||||
filter(
|
||||
lambda path: path.name != "metadata"
|
||||
and "ephemeral" not in path.name
|
||||
and "temp" not in path.name,
|
||||
timeline_path.glob("*"),
|
||||
),
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test the overall data flow in secondary mode:
|
||||
- Heatmap uploads from the attached location
|
||||
- Heatmap & layer downloads from the secondary location
|
||||
- Eviction of layers on the attached location results in deletion
|
||||
on the secondary location as well.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.num_pageservers = 2
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
assert env.attachment_service is not None
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
ps_attached = env.pageservers[0]
|
||||
ps_secondary = env.pageservers[1]
|
||||
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init(env.pageservers[0].id)
|
||||
workload.write_rows(256, ps_attached.id)
|
||||
|
||||
# Configure a secondary location
|
||||
log.info("Setting up secondary location...")
|
||||
ps_secondary.tenant_location_configure(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Secondary",
|
||||
"secondary_conf": {"warm": True},
|
||||
"tenant_conf": {},
|
||||
},
|
||||
)
|
||||
|
||||
# Explicit upload/download cycle
|
||||
# ==============================
|
||||
log.info("Synchronizing after initial write...")
|
||||
ps_attached.http_client().secondary_tenant_upload(tenant_id)
|
||||
|
||||
ps_secondary.http_client().secondary_tenant_download(tenant_id)
|
||||
|
||||
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
|
||||
ps_secondary, tenant_id, timeline_id
|
||||
)
|
||||
|
||||
# Make changes on attached pageserver, check secondary downloads them
|
||||
# ===================================================================
|
||||
log.info("Synchronizing after subsequent write...")
|
||||
workload.churn_rows(128, ps_attached.id)
|
||||
ps_attached.http_client().secondary_tenant_upload(tenant_id)
|
||||
ps_secondary.http_client().secondary_tenant_download(tenant_id)
|
||||
|
||||
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
|
||||
ps_secondary, tenant_id, timeline_id
|
||||
)
|
||||
|
||||
# Do evictions on attached pageserver, check secondary follows along
|
||||
# ==================================================================
|
||||
log.info("Evicting a layer...")
|
||||
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
|
||||
ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
|
||||
|
||||
log.info("Synchronizing after eviction...")
|
||||
ps_attached.http_client().secondary_tenant_upload(tenant_id)
|
||||
ps_secondary.http_client().secondary_tenant_download(tenant_id)
|
||||
|
||||
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
|
||||
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
|
||||
ps_secondary, tenant_id, timeline_id
|
||||
)
|
||||
|
||||
# Scrub the remote storage
|
||||
# ========================
|
||||
# This confirms that the scrubber isn't upset by the presence of the heatmap
|
||||
# TODO: depends on `jcsp/scrubber-index-part` branch.
|
||||
|
||||
# Detach secondary and delete tenant
|
||||
# ===================================
|
||||
# This confirms that the heatmap gets cleaned up as well as other normal content.
|
||||
log.info("Detaching secondary location...")
|
||||
ps_secondary.tenant_location_configure(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Detached",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
},
|
||||
)
|
||||
|
||||
log.info("Deleting tenant...")
|
||||
tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
# def test_secondary_download_loop
|
||||
# Configure some short check intervals, and validate that layers are downloaded by secondary
|
||||
# without any explicit admin API calls.
|
||||
|
||||
# def test_secondary_eviction(neon_env_builder: NeonEnvBuilder):
|
||||
#
|
||||
@@ -1,5 +1,6 @@
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
import psycopg2
|
||||
@@ -364,10 +365,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
|
||||
|
||||
pid1 = get_pid(200, "http")["rows"][0]["pid"]
|
||||
|
||||
time.sleep(0.02)
|
||||
|
||||
# query should be on the same connection
|
||||
rows = get_pid(200, "http")["rows"]
|
||||
assert rows == [{"pid": pid1}]
|
||||
|
||||
time.sleep(0.02)
|
||||
|
||||
# incorrect password should not work
|
||||
res = get_pid(400, "foobar")
|
||||
assert "password authentication failed for user" in res["message"]
|
||||
@@ -378,10 +383,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
|
||||
pid2 = get_pid(200, "http2")["rows"][0]["pid"]
|
||||
assert pid1 != pid2
|
||||
|
||||
time.sleep(0.02)
|
||||
|
||||
# query should be on an existing connection
|
||||
pid = get_pid(200, "http2")["rows"][0]["pid"]
|
||||
assert pid in [pid1, pid2]
|
||||
|
||||
time.sleep(0.02)
|
||||
|
||||
# old password should not work
|
||||
res = get_pid(400, "http")
|
||||
assert "password authentication failed for user" in res["message"]
|
||||
@@ -419,6 +428,7 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
|
||||
)
|
||||
|
||||
pid1 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
|
||||
time.sleep(0.02)
|
||||
query(200, "BEGIN")
|
||||
pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
|
||||
assert pid1 != pid2
|
||||
|
||||
@@ -586,7 +586,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
log.info("sending delete request")
|
||||
checkpoint_allowed_to_fail.set()
|
||||
env.pageserver.allowed_errors.append(
|
||||
".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
|
||||
".* ERROR .*Error processing HTTP request: InternalServerError\\(The timeline or pageserver is shutting down"
|
||||
".* ERROR .*[Cc]ould not flush frozen layer.*"
|
||||
)
|
||||
|
||||
@@ -757,12 +757,14 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
|
||||
create_thread.join()
|
||||
|
||||
|
||||
# Regression test for a race condition where L0 layers are compacted before the upload,
|
||||
# resulting in the uploading complaining about the file not being found
|
||||
# https://github.com/neondatabase/neon/issues/4526
|
||||
def test_compaction_delete_before_upload(
|
||||
def test_compaction_waits_for_upload(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
Compaction waits for outstanding uploads to complete, so that it avoids deleting layers
|
||||
files that have not yet been uploaded. This test forces a race between upload and
|
||||
compaction.
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
@@ -792,50 +794,81 @@ def test_compaction_delete_before_upload(
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
# Now make the flushing hang and update one small piece of data
|
||||
client.configure_failpoints(("flush-frozen-pausable", "pause"))
|
||||
client.configure_failpoints(("before-upload-layer-pausable", "pause"))
|
||||
|
||||
endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")
|
||||
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
|
||||
barrier = threading.Barrier(2)
|
||||
checkpoint_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
|
||||
compact_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
|
||||
compact_barrier = threading.Barrier(2)
|
||||
|
||||
def checkpoint_in_background():
|
||||
barrier.wait()
|
||||
try:
|
||||
log.info("Checkpoint starting")
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
q.put(None)
|
||||
log.info("Checkpoint complete")
|
||||
checkpoint_result.put(None)
|
||||
except PageserverApiException as e:
|
||||
q.put(e)
|
||||
log.info("Checkpoint errored: {e}")
|
||||
checkpoint_result.put(e)
|
||||
|
||||
create_thread = threading.Thread(target=checkpoint_in_background)
|
||||
create_thread.start()
|
||||
def compact_in_background():
|
||||
compact_barrier.wait()
|
||||
try:
|
||||
log.info("Compaction starting")
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
log.info("Compaction complete")
|
||||
compact_result.put(None)
|
||||
except PageserverApiException as e:
|
||||
log.info("Compaction errored: {e}")
|
||||
compact_result.put(e)
|
||||
|
||||
checkpoint_thread = threading.Thread(target=checkpoint_in_background)
|
||||
checkpoint_thread.start()
|
||||
|
||||
compact_thread = threading.Thread(target=compact_in_background)
|
||||
compact_thread.start()
|
||||
|
||||
try:
|
||||
barrier.wait()
|
||||
# Start the checkpoint, see that it blocks
|
||||
log.info("Waiting to see checkpoint hang...")
|
||||
time.sleep(5)
|
||||
assert checkpoint_result.empty()
|
||||
|
||||
time.sleep(4)
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
# Start the compaction, see that it finds work to do but blocks
|
||||
compact_barrier.wait()
|
||||
log.info("Waiting to see compaction hang...")
|
||||
time.sleep(5)
|
||||
assert compact_result.empty()
|
||||
|
||||
client.configure_failpoints(("flush-frozen-pausable", "off"))
|
||||
# This is logged once compaction is started, but before we wait for operations to complete
|
||||
assert env.pageserver.log_contains("compact_level0_phase1 stats available.")
|
||||
|
||||
conflict = q.get()
|
||||
# Once we unblock uploads the compaction should complete successfully
|
||||
log.info("Disabling failpoint")
|
||||
client.configure_failpoints(("before-upload-layer-pausable", "off"))
|
||||
log.info("Awaiting compaction result")
|
||||
assert compact_result.get(timeout=10) is None
|
||||
log.info("Awaiting checkpoint result")
|
||||
assert checkpoint_result.get(timeout=10) is None
|
||||
|
||||
assert conflict is None
|
||||
except Exception:
|
||||
# Log the actual failure's backtrace here, before we proceed to join threads
|
||||
log.exception("Failure, cleaning up...")
|
||||
raise
|
||||
finally:
|
||||
create_thread.join()
|
||||
compact_barrier.abort()
|
||||
|
||||
# Add a delay for the uploads to run into either the file not found or the
|
||||
time.sleep(4)
|
||||
checkpoint_thread.join()
|
||||
compact_thread.join()
|
||||
|
||||
# Ensure that this actually terminates
|
||||
wait_upload_queue_empty(client, tenant_id, timeline_id)
|
||||
|
||||
# For now we are hitting this message.
|
||||
# Maybe in the future the underlying race condition will be fixed,
|
||||
# but until then, ensure that this message is hit instead.
|
||||
assert env.pageserver.log_contains(
|
||||
# We should not have hit the error handling path in uploads where the remote file is gone
|
||||
assert not env.pageserver.log_contains(
|
||||
"File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
|
||||
)
|
||||
|
||||
|
||||
@@ -227,9 +227,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
|
||||
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*removing local file .* because it has unexpected length.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*removing local file .* because .*")
|
||||
|
||||
# FIXME: Are these expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
|
||||
Reference in New Issue
Block a user