mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 03:30:36 +00:00
Compare commits
28 Commits
release-pr
...
problame/l
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4b7fddeabe | ||
|
|
68386c19a2 | ||
|
|
db787dd6e0 | ||
|
|
6c5e8c6bb6 | ||
|
|
c5259dcf32 | ||
|
|
112008519c | ||
|
|
5917a54719 | ||
|
|
dbe3290f89 | ||
|
|
bfcde8f9e6 | ||
|
|
dbb8377983 | ||
|
|
d91539b888 | ||
|
|
61fac1ab0b | ||
|
|
8d3e8078f7 | ||
|
|
373fa7c2ac | ||
|
|
1556234d9a | ||
|
|
55106aa981 | ||
|
|
64680b1373 | ||
|
|
b86cd24a23 | ||
|
|
d85baac608 | ||
|
|
f06f274b38 | ||
|
|
d98575f5a6 | ||
|
|
33d0072342 | ||
|
|
174bceccb1 | ||
|
|
f5bbba5014 | ||
|
|
868cf8aeb5 | ||
|
|
9f03dd24c2 | ||
|
|
dc96a7604a | ||
|
|
d7c94e67ce |
12
Cargo.lock
generated
12
Cargo.lock
generated
@@ -2393,6 +2393,17 @@ dependencies = [
|
||||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nostarve_queue"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"rand",
|
||||
"scopeguard",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "notify"
|
||||
version = "5.2.0"
|
||||
@@ -2704,6 +2715,7 @@ dependencies = [
|
||||
"itertools",
|
||||
"metrics",
|
||||
"nix 0.26.2",
|
||||
"nostarve_queue",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
|
||||
@@ -26,6 +26,7 @@ members = [
|
||||
"libs/tracing-utils",
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
"libs/vm_monitor",
|
||||
"libs/nostarve_queue",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -180,6 +181,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
|
||||
nostarve_queue = { path = "./libs/nostarve_queue" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
@@ -36,7 +36,7 @@ use utils::pid_file::{self, PidFileRead};
|
||||
// it's waiting. If the process hasn't started/stopped after 5 seconds,
|
||||
// it prints a notice that it's taking long, but keeps waiting.
|
||||
//
|
||||
const RETRY_UNTIL_SECS: u64 = 10;
|
||||
const RETRY_UNTIL_SECS: u64 = 40;
|
||||
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
|
||||
const RETRY_INTERVAL_MILLIS: u64 = 100;
|
||||
const DOT_EVERY_RETRIES: u64 = 10;
|
||||
|
||||
@@ -18,7 +18,7 @@ use camino::Utf8PathBuf;
|
||||
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::blocking::{Client, ClientBuilder, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::auth::{Claims, Scope};
|
||||
@@ -93,7 +93,7 @@ impl PageServerNode {
|
||||
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
|
||||
conf: conf.clone(),
|
||||
env: env.clone(),
|
||||
http_client: Client::new(),
|
||||
http_client: ClientBuilder::new().timeout(None).build().unwrap(),
|
||||
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
|
||||
}
|
||||
}
|
||||
|
||||
20
download_all_layers.py
Normal file
20
download_all_layers.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import requests
|
||||
|
||||
tenants = requests.get("http://localhost:15003/v1/tenant")
|
||||
tenants.raise_for_status()
|
||||
tenants = tenants.json()
|
||||
|
||||
for tenant in tenants:
|
||||
id = tenant["id"]
|
||||
timelines = requests.get(f"http://localhost:15003/v1/tenant/{id}/timeline")
|
||||
timelines.raise_for_status()
|
||||
for timeline in timelines.json():
|
||||
tid = timeline["tenant_id"]
|
||||
tlid = timeline["timeline_id"]
|
||||
layers = requests.get(f"http://localhost:15003/v1/tenant/{tid}/timeline/{tlid}/layer")
|
||||
layers.raise_for_status()
|
||||
layers = layers.json()
|
||||
for l in layers["historic_layers"]:
|
||||
if l["remote"] == False:
|
||||
requests.get(f"http://localhost:15003/v1/tenant/{tid}/timeline/{tlid}/layer/{l['layer_file_name']}")
|
||||
|
||||
14
libs/nostarve_queue/Cargo.toml
Normal file
14
libs/nostarve_queue/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "nostarve_queue"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
scopeguard.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
futures.workspace = true
|
||||
rand.workspace = true
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time"] }
|
||||
316
libs/nostarve_queue/src/lib.rs
Normal file
316
libs/nostarve_queue/src/lib.rs
Normal file
@@ -0,0 +1,316 @@
|
||||
//! Synchronization primitive to prevent starvation among concurrent tasks that do the same work.
|
||||
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
fmt,
|
||||
future::poll_fn,
|
||||
sync::Mutex,
|
||||
task::{Poll, Waker},
|
||||
};
|
||||
|
||||
pub struct Queue<T> {
|
||||
inner: Mutex<Inner<T>>,
|
||||
}
|
||||
|
||||
struct Inner<T> {
|
||||
waiters: VecDeque<usize>,
|
||||
free: VecDeque<usize>,
|
||||
slots: Vec<Option<(Option<Waker>, Option<T>)>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct Position<'q, T> {
|
||||
idx: usize,
|
||||
queue: &'q Queue<T>,
|
||||
}
|
||||
|
||||
impl<T> fmt::Debug for Position<'_, T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("Position").field("idx", &self.idx).finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Inner<T> {
|
||||
#[cfg(not(test))]
|
||||
#[inline]
|
||||
fn integrity_check(&self) {}
|
||||
|
||||
#[cfg(test)]
|
||||
fn integrity_check(&self) {
|
||||
use std::collections::HashSet;
|
||||
let waiters = self.waiters.iter().copied().collect::<HashSet<_>>();
|
||||
let free = self.free.iter().copied().collect::<HashSet<_>>();
|
||||
for (slot_idx, slot) in self.slots.iter().enumerate() {
|
||||
match slot {
|
||||
None => {
|
||||
assert!(!waiters.contains(&slot_idx));
|
||||
assert!(free.contains(&slot_idx));
|
||||
}
|
||||
Some((None, None)) => {
|
||||
assert!(waiters.contains(&slot_idx));
|
||||
assert!(!free.contains(&slot_idx));
|
||||
}
|
||||
Some((Some(_), Some(_))) => {
|
||||
assert!(!waiters.contains(&slot_idx));
|
||||
assert!(!free.contains(&slot_idx));
|
||||
}
|
||||
Some((Some(_), None)) => {
|
||||
assert!(waiters.contains(&slot_idx));
|
||||
assert!(!free.contains(&slot_idx));
|
||||
}
|
||||
Some((None, Some(_))) => {
|
||||
assert!(!waiters.contains(&slot_idx));
|
||||
assert!(!free.contains(&slot_idx));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Queue<T> {
|
||||
pub fn new(size: usize) -> Self {
|
||||
Queue {
|
||||
inner: Mutex::new(Inner {
|
||||
waiters: VecDeque::new(),
|
||||
free: (0..size).collect(),
|
||||
slots: {
|
||||
let mut v = Vec::with_capacity(size);
|
||||
v.resize_with(size, || None);
|
||||
v
|
||||
},
|
||||
}),
|
||||
}
|
||||
}
|
||||
pub fn begin(&self) -> Result<Position<T>, ()> {
|
||||
#[cfg(test)]
|
||||
tracing::trace!("get in line locking inner");
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.integrity_check();
|
||||
let my_waitslot_idx = inner
|
||||
.free
|
||||
.pop_front()
|
||||
.expect("can't happen, len(slots) = len(waiters");
|
||||
inner.waiters.push_back(my_waitslot_idx);
|
||||
let prev = inner.slots[my_waitslot_idx].replace((None, None));
|
||||
assert!(prev.is_none());
|
||||
inner.integrity_check();
|
||||
Ok(Position {
|
||||
idx: my_waitslot_idx,
|
||||
queue: &self,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'q, T> Position<'q, T> {
|
||||
pub fn complete_and_wait(self, datum: T) -> impl std::future::Future<Output = T> + 'q {
|
||||
#[cfg(test)]
|
||||
tracing::trace!("found victim locking waiters");
|
||||
let mut inner = self.queue.inner.lock().unwrap();
|
||||
inner.integrity_check();
|
||||
let winner_idx = inner.waiters.pop_front().expect("we put ourselves in");
|
||||
#[cfg(test)]
|
||||
tracing::trace!(winner_idx, "putting victim into next waiters slot");
|
||||
let winner_slot = inner.slots[winner_idx].as_mut().unwrap();
|
||||
let prev = winner_slot.1.replace(datum);
|
||||
assert!(
|
||||
prev.is_none(),
|
||||
"ensure we didn't mess up this simple ring buffer structure"
|
||||
);
|
||||
if let Some(waker) = winner_slot.0.take() {
|
||||
#[cfg(test)]
|
||||
tracing::trace!(winner_idx, "waking up winner");
|
||||
waker.wake()
|
||||
}
|
||||
inner.integrity_check();
|
||||
drop(inner); // the poll_fn locks it again
|
||||
|
||||
let mut poll_num = 0;
|
||||
let mut drop_guard = Some(scopeguard::guard((), |()| {
|
||||
panic!("must not drop this future until Ready");
|
||||
}));
|
||||
|
||||
// take the victim that was found by someone else
|
||||
poll_fn(move |cx| {
|
||||
let my_waitslot_idx = self.idx;
|
||||
poll_num += 1;
|
||||
#[cfg(test)]
|
||||
tracing::trace!(poll_num, "poll_fn locking waiters");
|
||||
let mut inner = self.queue.inner.lock().unwrap();
|
||||
inner.integrity_check();
|
||||
let my_waitslot = inner.slots[self.idx].as_mut().unwrap();
|
||||
// assert!(
|
||||
// poll_num <= 2,
|
||||
// "once we place the waker in the slot, next wakeup should have a result: {}",
|
||||
// my_waitslot.1.is_some()
|
||||
// );
|
||||
if let Some(res) = my_waitslot.1.take() {
|
||||
#[cfg(test)]
|
||||
tracing::trace!(poll_num, "have cache slot");
|
||||
// above .take() resets the waiters slot to None
|
||||
debug_assert!(my_waitslot.0.is_none());
|
||||
debug_assert!(my_waitslot.1.is_none());
|
||||
inner.slots[my_waitslot_idx] = None;
|
||||
inner.free.push_back(my_waitslot_idx);
|
||||
let _ = scopeguard::ScopeGuard::into_inner(drop_guard.take().unwrap());
|
||||
inner.integrity_check();
|
||||
return Poll::Ready(res);
|
||||
}
|
||||
// assert_eq!(poll_num, 1);
|
||||
if !my_waitslot
|
||||
.0
|
||||
.as_ref()
|
||||
.map(|existing| cx.waker().will_wake(existing))
|
||||
.unwrap_or(false)
|
||||
{
|
||||
let prev = my_waitslot.0.replace(cx.waker().clone());
|
||||
#[cfg(test)]
|
||||
tracing::trace!(poll_num, prev_is_some = prev.is_some(), "updating waker");
|
||||
}
|
||||
inner.integrity_check();
|
||||
#[cfg(test)]
|
||||
tracing::trace!(poll_num, "waiting to be woken up");
|
||||
Poll::Pending
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::{
|
||||
sync::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc,
|
||||
},
|
||||
task::Poll,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use rand::RngCore;
|
||||
|
||||
#[tokio::test]
|
||||
async fn in_order_completion_and_wait() {
|
||||
let queue = super::Queue::new(2);
|
||||
|
||||
let q1 = queue.begin().unwrap();
|
||||
let q2 = queue.begin().unwrap();
|
||||
|
||||
assert_eq!(q1.complete_and_wait(23).await, 23);
|
||||
assert_eq!(q2.complete_and_wait(42).await, 42);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn out_of_order_completion_and_wait() {
|
||||
let queue = super::Queue::new(2);
|
||||
|
||||
let q1 = queue.begin().unwrap();
|
||||
let q2 = queue.begin().unwrap();
|
||||
|
||||
let mut q2compfut = q2.complete_and_wait(23);
|
||||
|
||||
match futures::poll!(&mut q2compfut) {
|
||||
Poll::Pending => {}
|
||||
Poll::Ready(_) => panic!("should not be ready yet, it's queued after q1"),
|
||||
}
|
||||
|
||||
let q1res = q1.complete_and_wait(42).await;
|
||||
assert_eq!(q1res, 23);
|
||||
|
||||
let q2res = q2compfut.await;
|
||||
assert_eq!(q2res, 42);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn in_order_completion_out_of_order_wait() {
|
||||
let queue = super::Queue::new(2);
|
||||
|
||||
let q1 = queue.begin().unwrap();
|
||||
let q2 = queue.begin().unwrap();
|
||||
|
||||
let mut q1compfut = q1.complete_and_wait(23);
|
||||
|
||||
let mut q2compfut = q2.complete_and_wait(42);
|
||||
|
||||
match futures::poll!(&mut q2compfut) {
|
||||
Poll::Pending => {
|
||||
unreachable!("q2 should be ready, it wasn't first but q1 is serviced already")
|
||||
}
|
||||
Poll::Ready(x) => assert_eq!(x, 42),
|
||||
}
|
||||
|
||||
assert_eq!(futures::poll!(&mut q1compfut), Poll::Ready(23));
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread")]
|
||||
async fn stress() {
|
||||
let ntasks = 8;
|
||||
let queue_size = 8;
|
||||
let queue = Arc::new(super::Queue::new(queue_size));
|
||||
|
||||
let stop = Arc::new(AtomicBool::new(false));
|
||||
|
||||
let mut tasks = vec![];
|
||||
for i in 0..ntasks {
|
||||
let jh = tokio::spawn({
|
||||
let queue = Arc::clone(&queue);
|
||||
let stop = Arc::clone(&stop);
|
||||
async move {
|
||||
while !stop.load(Ordering::Relaxed) {
|
||||
let q = queue.begin().unwrap();
|
||||
for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
|
||||
std::hint::spin_loop();
|
||||
}
|
||||
q.complete_and_wait(i).await;
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
}
|
||||
});
|
||||
tasks.push(jh);
|
||||
}
|
||||
|
||||
tokio::time::sleep(Duration::from_secs(10)).await;
|
||||
|
||||
stop.store(true, Ordering::Relaxed);
|
||||
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stress_two_runtimes_shared_queue() {
|
||||
std::thread::scope(|s| {
|
||||
let ntasks = 8;
|
||||
let queue_size = 8;
|
||||
let queue = Arc::new(super::Queue::new(queue_size));
|
||||
|
||||
let stop = Arc::new(AtomicBool::new(false));
|
||||
|
||||
for i in 0..ntasks {
|
||||
s.spawn({
|
||||
let queue = Arc::clone(&queue);
|
||||
let stop = Arc::clone(&stop);
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
rt.block_on(async move {
|
||||
while !stop.load(Ordering::Relaxed) {
|
||||
let q = queue.begin().unwrap();
|
||||
for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
|
||||
std::hint::spin_loop();
|
||||
}
|
||||
q.complete_and_wait(i).await;
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
std::thread::sleep(Duration::from_secs(10));
|
||||
|
||||
stop.store(true, Ordering::Relaxed);
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -37,6 +37,7 @@ humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
nix.workspace = true
|
||||
nostarve_queue.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
num_cpus = { version = "1.15" }
|
||||
num-traits.workspace = true
|
||||
|
||||
@@ -580,6 +580,31 @@ fn start_pageserver(
|
||||
);
|
||||
}
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::BackgroundRuntimeTurnaroundMeasure,
|
||||
None,
|
||||
None,
|
||||
"background runtime turnaround measure",
|
||||
true,
|
||||
async move {
|
||||
let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
|
||||
let server = server
|
||||
.serve(hyper::service::make_service_fn(|_| async move {
|
||||
Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
|
||||
move |_: hyper::Request<hyper::Body>| async move {
|
||||
Ok::<_, std::convert::Infallible>(hyper::Response::new(
|
||||
hyper::Body::from(format!("alive")),
|
||||
))
|
||||
},
|
||||
))
|
||||
}))
|
||||
.with_graceful_shutdown(task_mgr::shutdown_watcher());
|
||||
server.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
//! and push them to a HTTP endpoint.
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{mgr, LogicalSizeCalculationCause};
|
||||
use camino::Utf8PathBuf;
|
||||
use consumption_metrics::EventType;
|
||||
@@ -143,7 +144,7 @@ pub async fn collect_metrics(
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
tick_at.elapsed(),
|
||||
metric_collection_interval,
|
||||
"consumption_metrics_collect_metrics",
|
||||
BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -268,6 +269,11 @@ async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
|
||||
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
|
||||
// TODO should we just use concurrent_background_tasks_rate_limit().
|
||||
// We can put in some prioritization for consumption metrics.
|
||||
// Same for the loop that fetches computed metrics.
|
||||
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
|
||||
// which turns out is really handy to understand the system.
|
||||
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
|
||||
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
|
||||
}
|
||||
@@ -277,7 +283,7 @@ async fn calculate_synthetic_size_worker(
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
tick_at.elapsed(),
|
||||
synthetic_size_calculation_interval,
|
||||
"consumption_metrics_synthetic_size_worker",
|
||||
BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -314,7 +314,6 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum PageCacheErrorKind {
|
||||
AcquirePinnedSlotTimeout,
|
||||
EvictIterLimit,
|
||||
}
|
||||
|
||||
pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
|
||||
@@ -1061,6 +1060,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
|
||||
Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
|
||||
Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_period_overrun_count",
|
||||
|
||||
@@ -66,8 +66,7 @@
|
||||
//! inserted to the mapping, but you must hold the write-lock on the slot until
|
||||
//! the contents are valid. If you need to release the lock without initializing
|
||||
//! the contents, you must remove the mapping first. We make that easy for the
|
||||
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
|
||||
//! page, the caller must explicitly call guard.mark_valid() after it has
|
||||
//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
|
||||
//! initialized it. If the guard is dropped without calling mark_valid(), the
|
||||
//! mapping is automatically removed and the slot is marked free.
|
||||
//!
|
||||
@@ -84,6 +83,7 @@ use std::{
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::instrument;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -253,6 +253,9 @@ pub struct PageCache {
|
||||
next_evict_slot: AtomicUsize,
|
||||
|
||||
size_metrics: &'static PageCacheSizeMetrics,
|
||||
|
||||
find_victim_waiters:
|
||||
nostarve_queue::Queue<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
|
||||
}
|
||||
|
||||
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
|
||||
@@ -286,23 +289,25 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
|
||||
///
|
||||
/// Counterintuitively, this is used even for a read, if the requested page is not
|
||||
/// currently found in the page cache. In that case, the caller of lock_for_read()
|
||||
/// is expected to fill in the page contents and call mark_valid(). Similarly
|
||||
/// lock_for_write() can return an invalid buffer that the caller is expected to
|
||||
/// to initialize.
|
||||
///
|
||||
/// is expected to fill in the page contents and call mark_valid().
|
||||
pub struct PageWriteGuard<'i> {
|
||||
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
|
||||
state: PageWriteGuardState<'i>,
|
||||
}
|
||||
|
||||
_permit: PinnedSlotsPermit,
|
||||
|
||||
// Are the page contents currently valid?
|
||||
// Used to mark pages as invalid that are assigned but not yet filled with data.
|
||||
valid: bool,
|
||||
enum PageWriteGuardState<'i> {
|
||||
Invalid {
|
||||
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
|
||||
_permit: PinnedSlotsPermit,
|
||||
},
|
||||
Downgraded,
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for PageWriteGuard<'_> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.inner.buf
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -310,25 +315,37 @@ impl std::ops::Deref for PageWriteGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.inner.buf
|
||||
match &self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
|
||||
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
|
||||
self.inner.buf
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => todo!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageWriteGuard<'_> {
|
||||
impl<'a> PageWriteGuard<'a> {
|
||||
/// Mark that the buffer contents are now valid.
|
||||
pub fn mark_valid(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
assert!(
|
||||
!self.valid,
|
||||
"mark_valid called on a buffer that was already valid"
|
||||
);
|
||||
self.valid = true;
|
||||
#[must_use]
|
||||
pub fn mark_valid(mut self) -> PageReadGuard<'a> {
|
||||
let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
|
||||
match prev {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => {
|
||||
assert!(inner.key.is_some());
|
||||
PageReadGuard {
|
||||
_permit: Arc::new(_permit),
|
||||
slot_guard: inner.downgrade(),
|
||||
}
|
||||
}
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -339,11 +356,14 @@ impl Drop for PageWriteGuard<'_> {
|
||||
/// initializing it, remove the mapping from the page cache.
|
||||
///
|
||||
fn drop(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
if !self.valid {
|
||||
let self_key = self.inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
self.inner.key = None;
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => {
|
||||
assert!(inner.key.is_some());
|
||||
let self_key = inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
inner.key = None;
|
||||
}
|
||||
PageWriteGuardState::Downgraded => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -354,12 +374,6 @@ pub enum ReadBufResult<'a> {
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
/// lock_for_write() return value
|
||||
pub enum WriteBufResult<'a> {
|
||||
Found(PageWriteGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
impl PageCache {
|
||||
//
|
||||
// Section 1.1: Public interface functions for looking up and memorizing materialized page
|
||||
@@ -429,8 +443,9 @@ impl PageCache {
|
||||
///
|
||||
/// Store an image of the given page in the cache.
|
||||
///
|
||||
// #[cfg_attr(test, instrument(skip_all, level = "trace", fields(%key, %lsn)))]
|
||||
pub async fn memorize_materialized_page(
|
||||
&self,
|
||||
&'static self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key: Key,
|
||||
@@ -446,26 +461,84 @@ impl PageCache {
|
||||
lsn,
|
||||
};
|
||||
|
||||
match self.lock_for_write(&cache_key).await? {
|
||||
WriteBufResult::Found(write_guard) => {
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(*write_guard == img);
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().await;
|
||||
if inner.key.as_ref() == Some(&cache_key) {
|
||||
slot.inc_usage_count();
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
debug_assert_eq!(inner.buf.len(), img.len());
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(inner.buf == img);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
WriteBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(img);
|
||||
write_guard.mark_valid();
|
||||
}
|
||||
}
|
||||
debug_assert!(permit.is_some());
|
||||
|
||||
Ok(())
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self
|
||||
.find_victim(permit.as_ref().unwrap())
|
||||
.await
|
||||
.context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
// Create a write guard for the slot so we go through the expected motions.
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
let mut write_guard = PageWriteGuard {
|
||||
state: PageWriteGuardState::Invalid {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
},
|
||||
};
|
||||
write_guard.copy_from_slice(img);
|
||||
let _ = write_guard.mark_valid();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with immutable file pages.
|
||||
|
||||
// #[cfg_attr(test, instrument(skip_all, level = "trace", fields(?file_id, ?blkno)))]
|
||||
pub async fn read_immutable_buf(
|
||||
&self,
|
||||
&'static self,
|
||||
file_id: FileId,
|
||||
blkno: u32,
|
||||
ctx: &RequestContext,
|
||||
@@ -571,7 +644,7 @@ impl PageCache {
|
||||
/// ```
|
||||
///
|
||||
async fn lock_for_read(
|
||||
&self,
|
||||
&'static self,
|
||||
cache_key: &mut CacheKey,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
@@ -638,99 +711,10 @@ impl PageCache {
|
||||
);
|
||||
|
||||
return Ok(ReadBufResult::NotFound(PageWriteGuard {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: false,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a page in the cache and lock it in write mode. If it's not
|
||||
/// found, returns None.
|
||||
///
|
||||
/// When locking a page for writing, the search criteria is always "exact".
|
||||
async fn try_lock_for_write(
|
||||
&self,
|
||||
cache_key: &CacheKey,
|
||||
permit: &mut Option<PinnedSlotsPermit>,
|
||||
) -> Option<PageWriteGuard> {
|
||||
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().await;
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
return Some(PageWriteGuard {
|
||||
state: PageWriteGuardState::Invalid {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a write-locked buffer for given block.
|
||||
///
|
||||
/// Similar to lock_for_read(), but the returned buffer is write-locked and
|
||||
/// may be modified by the caller even if it's already found in the cache.
|
||||
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
|
||||
debug_assert!(permit.is_none());
|
||||
return Ok(WriteBufResult::Found(write_guard));
|
||||
}
|
||||
debug_assert!(permit.is_some());
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self
|
||||
.find_victim(permit.as_ref().unwrap())
|
||||
.await
|
||||
.context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
|
||||
return Ok(WriteBufResult::NotFound(PageWriteGuard {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: false,
|
||||
}));
|
||||
}
|
||||
}
|
||||
@@ -775,7 +759,7 @@ impl PageCache {
|
||||
///
|
||||
/// Like 'search_mapping, but performs an "exact" search. Used for
|
||||
/// allocating a new buffer.
|
||||
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
|
||||
fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
|
||||
match key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
@@ -882,10 +866,15 @@ impl PageCache {
|
||||
///
|
||||
/// On return, the slot is empty and write-locked.
|
||||
async fn find_victim(
|
||||
&self,
|
||||
&'static self,
|
||||
_permit_witness: &PinnedSlotsPermit,
|
||||
) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
|
||||
let iter_limit = self.slots.len() * 10;
|
||||
let nostarve_position = self.find_victim_waiters.begin()
|
||||
.expect("we initialize the nostarve queue to the same size as the slots semaphore, and the caller is presenting a permit");
|
||||
|
||||
// let span = tracing::trace_span!("find_victim", ?nostarve_position);
|
||||
// let _enter = span.enter();
|
||||
|
||||
let mut iters = 0;
|
||||
loop {
|
||||
iters += 1;
|
||||
@@ -897,41 +886,8 @@ impl PageCache {
|
||||
let mut inner = match slot.inner.try_write() {
|
||||
Ok(inner) => inner,
|
||||
Err(_err) => {
|
||||
if iters > iter_limit {
|
||||
// NB: Even with the permits, there's no hard guarantee that we will find a slot with
|
||||
// any particular number of iterations: other threads might race ahead and acquire and
|
||||
// release pins just as we're scanning the array.
|
||||
//
|
||||
// Imagine that nslots is 2, and as starting point, usage_count==1 on all
|
||||
// slots. There are two threads running concurrently, A and B. A has just
|
||||
// acquired the permit from the semaphore.
|
||||
//
|
||||
// A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
|
||||
// B: Acquire permit.
|
||||
// B: Look at slot 2, decrement its usage_count to zero and continue the search
|
||||
// B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
|
||||
// B: Release pin and permit again
|
||||
// B: Acquire permit.
|
||||
// B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
|
||||
// B: Release pin and permit again
|
||||
//
|
||||
// Now we're back in the starting situation that both slots have
|
||||
// usage_count 1, but A has now been through one iteration of the
|
||||
// find_victim() loop. This can repeat indefinitely and on each
|
||||
// iteration, A's iteration count increases by one.
|
||||
//
|
||||
// So, even though the semaphore for the permits is fair, the victim search
|
||||
// itself happens in parallel and is not fair.
|
||||
// Hence even with a permit, a task can theoretically be starved.
|
||||
// To avoid this, we'd need tokio to give priority to tasks that are holding
|
||||
// permits for longer.
|
||||
// Note that just yielding to tokio during iteration without such
|
||||
// priority boosting is likely counter-productive. We'd just give more opportunities
|
||||
// for B to bump usage count, further starving A.
|
||||
crate::metrics::page_cache_errors_inc(
|
||||
crate::metrics::PageCacheErrorKind::EvictIterLimit,
|
||||
);
|
||||
anyhow::bail!("exceeded evict iter limit");
|
||||
if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
|
||||
unreachable!("find_victim_waiters prevents starvation");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -942,7 +898,8 @@ impl PageCache {
|
||||
inner.key = None;
|
||||
}
|
||||
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
|
||||
return Ok((slot_idx, inner));
|
||||
|
||||
return Ok(nostarve_position.complete_and_wait((slot_idx, inner)).await);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -986,6 +943,7 @@ impl PageCache {
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
size_metrics,
|
||||
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
|
||||
find_victim_waiters: ::nostarve_queue::Queue::new(num_pages),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -293,6 +293,8 @@ pub enum TaskKind {
|
||||
|
||||
DebugTool,
|
||||
|
||||
BackgroundRuntimeTurnaroundMeasure,
|
||||
|
||||
#[cfg(test)]
|
||||
UnitTest,
|
||||
}
|
||||
|
||||
@@ -186,26 +186,21 @@ impl FileBlockReader {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => break Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
Ok(write_guard.mark_valid().into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,36 +72,32 @@ impl EphemeralFile {
|
||||
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
|
||||
if flushed_blknums.contains(&(blknum as u64)) {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum, self.file.path, e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum, self.file.path, e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
return Ok(BlockLease::PageReadGuard(read_guard));
|
||||
}
|
||||
};
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
@@ -171,7 +167,7 @@ impl EphemeralFile {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
|
||||
write_guard.mark_valid();
|
||||
let _ = write_guard.mark_valid();
|
||||
// pre-warm successful
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -864,11 +864,11 @@ impl DeltaLayerInner {
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
if actual_summary != expected_summary {
|
||||
bail!(
|
||||
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
actual_summary,
|
||||
expected_summary
|
||||
);
|
||||
// bail!(
|
||||
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
// actual_summary,
|
||||
// expected_summary
|
||||
// );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -457,11 +457,11 @@ impl ImageLayerInner {
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!(
|
||||
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
actual_summary,
|
||||
expected_summary
|
||||
);
|
||||
// bail!(
|
||||
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
// actual_summary,
|
||||
// expected_summary
|
||||
// );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,73 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::completion;
|
||||
|
||||
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
|
||||
let permits = usize::max(
|
||||
1,
|
||||
// while a lot of the work is done on spawn_blocking, we still do
|
||||
// repartitioning in the async context. this should give leave us some workers
|
||||
// unblocked to be blocked on other work, hopefully easing any outside visible
|
||||
// effects of restarts.
|
||||
//
|
||||
// 6/8 is a guess; previously we ran with unlimited 8 and more from
|
||||
// spawn_blocking.
|
||||
(total_threads * 3).checked_div(4).unwrap_or(0),
|
||||
);
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(
|
||||
permits < total_threads,
|
||||
"need threads avail for shorter work"
|
||||
);
|
||||
tokio::sync::Semaphore::new(permits)
|
||||
});
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub(crate) enum BackgroundLoopKind {
|
||||
Compaction,
|
||||
Gc,
|
||||
Eviction,
|
||||
ConsumptionMetricsCollectMetrics,
|
||||
ConsumptionMetricsSyntheticSizeWorker,
|
||||
}
|
||||
|
||||
impl BackgroundLoopKind {
|
||||
fn as_static_str(&self) -> &'static str {
|
||||
let s: &'static str = self.into();
|
||||
s
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum RateLimitError {
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
pub(crate) async fn concurrent_background_tasks_rate_limit(
|
||||
loop_kind: BackgroundLoopKind,
|
||||
_ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<impl Drop, RateLimitError> {
|
||||
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
|
||||
.with_label_values(&[loop_kind.as_static_str()])
|
||||
.inc();
|
||||
scopeguard::defer!(
|
||||
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
|
||||
);
|
||||
tokio::select! {
|
||||
permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
|
||||
match permit {
|
||||
Ok(permit) => Ok(permit),
|
||||
Err(_closed) => unreachable!("we never close the semaphore"),
|
||||
}
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
Err(RateLimitError::Cancelled)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Start per tenant background loops: compaction and gc.
|
||||
pub fn start_background_loops(
|
||||
tenant: &Arc<Tenant>,
|
||||
@@ -116,7 +183,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
@@ -184,7 +251,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "gc");
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
@@ -258,7 +325,11 @@ pub(crate) async fn random_init_delay(
|
||||
}
|
||||
|
||||
/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
|
||||
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
|
||||
pub(crate) fn warn_when_period_overrun(
|
||||
elapsed: Duration,
|
||||
period: Duration,
|
||||
task: BackgroundLoopKind,
|
||||
) {
|
||||
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
|
||||
if elapsed >= period && period != Duration::ZERO {
|
||||
// humantime does no significant digits clamping whereas Duration's debug is a bit more
|
||||
@@ -267,11 +338,11 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task
|
||||
warn!(
|
||||
?elapsed,
|
||||
period = %humantime::format_duration(period),
|
||||
task,
|
||||
?task,
|
||||
"task iteration took longer than the configured period"
|
||||
);
|
||||
crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
|
||||
.with_label_values(&[task, &format!("{}", period.as_secs())])
|
||||
.with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||
use crate::tenant::storage_layer::{
|
||||
DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
|
||||
};
|
||||
use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
@@ -158,7 +159,7 @@ pub struct Timeline {
|
||||
|
||||
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
|
||||
/// Never changes for the lifetime of this [`Timeline`] object.
|
||||
///
|
||||
///
|
||||
/// This duplicates the generation stored in LocationConf, but that structure is mutable:
|
||||
/// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
|
||||
generation: Generation,
|
||||
@@ -684,37 +685,17 @@ impl Timeline {
|
||||
) -> anyhow::Result<()> {
|
||||
const ROUNDS: usize = 2;
|
||||
|
||||
static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
|
||||
let permits = usize::max(
|
||||
1,
|
||||
// while a lot of the work is done on spawn_blocking, we still do
|
||||
// repartitioning in the async context. this should give leave us some workers
|
||||
// unblocked to be blocked on other work, hopefully easing any outside visible
|
||||
// effects of restarts.
|
||||
//
|
||||
// 6/8 is a guess; previously we ran with unlimited 8 and more from
|
||||
// spawn_blocking.
|
||||
(total_threads * 3).checked_div(4).unwrap_or(0),
|
||||
);
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(
|
||||
permits < total_threads,
|
||||
"need threads avail for shorter work"
|
||||
);
|
||||
tokio::sync::Semaphore::new(permits)
|
||||
});
|
||||
|
||||
// this wait probably never needs any "long time spent" logging, because we already nag if
|
||||
// compaction task goes over it's period (20s) which is quite often in production.
|
||||
let _permit = tokio::select! {
|
||||
permit = CONCURRENT_COMPACTIONS.acquire() => {
|
||||
permit
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
return Ok(());
|
||||
}
|
||||
let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
|
||||
BackgroundLoopKind::Compaction,
|
||||
ctx,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(permit) => permit,
|
||||
Err(RateLimitError::Cancelled) => return Ok(()),
|
||||
};
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
|
||||
@@ -30,6 +30,7 @@ use crate::{
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
storage_layer::PersistentLayer,
|
||||
tasks::{BackgroundLoopKind, RateLimitError},
|
||||
timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
@@ -129,7 +130,11 @@ impl Timeline {
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
let elapsed = start.elapsed();
|
||||
crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
elapsed,
|
||||
p.period,
|
||||
BackgroundLoopKind::Eviction,
|
||||
);
|
||||
crate::metrics::EVICTION_ITERATION_DURATION
|
||||
.get_metric_with_label_values(&[
|
||||
&format!("{}", p.period.as_secs()),
|
||||
@@ -150,6 +155,17 @@ impl Timeline {
|
||||
) -> ControlFlow<()> {
|
||||
let now = SystemTime::now();
|
||||
|
||||
let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
|
||||
BackgroundLoopKind::Eviction,
|
||||
ctx,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(permit) => permit,
|
||||
Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
// If we evict layers but keep cached values derived from those layers, then
|
||||
// we face a storm of on-demand downloads after pageserver restart.
|
||||
// The reason is that the restart empties the caches, and so, the values
|
||||
|
||||
@@ -18,7 +18,8 @@ use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tokio::time::Instant;
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
@@ -110,7 +111,7 @@ impl OpenFiles {
|
||||
///
|
||||
/// On return, we hold a lock on the slot, and its 'tag' has been updated
|
||||
/// recently_used has been set. It's all ready for reuse.
|
||||
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
//
|
||||
// Run the clock algorithm to find a slot to replace.
|
||||
//
|
||||
@@ -142,7 +143,7 @@ impl OpenFiles {
|
||||
}
|
||||
retries += 1;
|
||||
} else {
|
||||
slot_guard = slot.inner.write().unwrap();
|
||||
slot_guard = slot.inner.write().await;
|
||||
index = next;
|
||||
break;
|
||||
}
|
||||
@@ -153,7 +154,7 @@ impl OpenFiles {
|
||||
// old file.
|
||||
//
|
||||
if let Some(old_file) = slot_guard.file.take() {
|
||||
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
|
||||
// the normal path of dropping VirtualFile uses `Close`, use `CloseByReplace` here to
|
||||
// distinguish the two.
|
||||
STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::CloseByReplace)
|
||||
@@ -208,6 +209,29 @@ impl CrashsafeOverwriteError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Observe duration for the given storage I/O operation
|
||||
///
|
||||
/// Unlike `observe_closure_duration`, this supports async,
|
||||
/// where "support" means that we measure wall clock time.
|
||||
macro_rules! observe_duration {
|
||||
($op:expr, $($body:tt)*) => {{
|
||||
let instant = Instant::now();
|
||||
let result = $($body)*;
|
||||
let elapsed = instant.elapsed().as_secs_f64();
|
||||
STORAGE_IO_TIME_METRIC
|
||||
.get($op)
|
||||
.observe(elapsed);
|
||||
result
|
||||
}}
|
||||
}
|
||||
|
||||
macro_rules! with_file {
|
||||
($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
|
||||
let $ident = $this.lock_file().await?;
|
||||
observe_duration!($op, $($body)*)
|
||||
}};
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
|
||||
@@ -244,11 +268,9 @@ impl VirtualFile {
|
||||
tenant_id = "*".to_string();
|
||||
timeline_id = "*".to_string();
|
||||
}
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
|
||||
|
||||
let file = STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::Open)
|
||||
.observe_closure_duration(|| open_options.open(path))?;
|
||||
let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
|
||||
|
||||
// Strip all options other than read and write.
|
||||
//
|
||||
@@ -331,22 +353,24 @@ impl VirtualFile {
|
||||
|
||||
/// Call File::sync_all() on the underlying File.
|
||||
pub async fn sync_all(&self) -> Result<(), Error> {
|
||||
self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
|
||||
.await?
|
||||
with_file!(self, StorageIoOperation::Fsync, |file| file
|
||||
.as_ref()
|
||||
.sync_all())
|
||||
}
|
||||
|
||||
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
|
||||
self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
|
||||
.await?
|
||||
with_file!(self, StorageIoOperation::Metadata, |file| file
|
||||
.as_ref()
|
||||
.metadata())
|
||||
}
|
||||
|
||||
/// Helper function that looks up the underlying File for this VirtualFile,
|
||||
/// opening it and evicting some other File if necessary. It calls 'func'
|
||||
/// with the physical File.
|
||||
async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
|
||||
where
|
||||
F: FnMut(&File) -> R,
|
||||
{
|
||||
/// Helper function internal to `VirtualFile` that looks up the underlying File,
|
||||
/// opens it and evicts some other File if necessary. The passed parameter is
|
||||
/// assumed to be a function available for the physical `File`.
|
||||
///
|
||||
/// We are doing it via a macro as Rust doesn't support async closures that
|
||||
/// take on parameters with lifetimes.
|
||||
async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
|
||||
let open_files = get_open_files();
|
||||
|
||||
let mut handle_guard = {
|
||||
@@ -356,27 +380,23 @@ impl VirtualFile {
|
||||
// We only need to hold the handle lock while we read the current handle. If
|
||||
// another thread closes the file and recycles the slot for a different file,
|
||||
// we will notice that the handle we read is no longer valid and retry.
|
||||
let mut handle = *self.handle.read().unwrap();
|
||||
let mut handle = *self.handle.read().await;
|
||||
loop {
|
||||
// Check if the slot contains our File
|
||||
{
|
||||
let slot = &open_files.slots[handle.index];
|
||||
let slot_guard = slot.inner.read().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
if let Some(file) = &slot_guard.file {
|
||||
// Found a cached file descriptor.
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
return Ok(STORAGE_IO_TIME_METRIC
|
||||
.get(op)
|
||||
.observe_closure_duration(|| func(file)));
|
||||
}
|
||||
let slot_guard = slot.inner.read().await;
|
||||
if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
|
||||
// Found a cached file descriptor.
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
return Ok(FileGuard { slot_guard });
|
||||
}
|
||||
}
|
||||
|
||||
// The slot didn't contain our File. We will have to open it ourselves,
|
||||
// but before that, grab a write lock on handle in the VirtualFile, so
|
||||
// that no other thread will try to concurrently open the same file.
|
||||
let handle_guard = self.handle.write().unwrap();
|
||||
let handle_guard = self.handle.write().await;
|
||||
|
||||
// If another thread changed the handle while we were not holding the lock,
|
||||
// then the handle might now be valid again. Loop back to retry.
|
||||
@@ -390,17 +410,10 @@ impl VirtualFile {
|
||||
|
||||
// We need to open the file ourselves. The handle in the VirtualFile is
|
||||
// now locked in write-mode. Find a free slot to put it in.
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot();
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot().await;
|
||||
|
||||
// Open the physical file
|
||||
let file = STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::Open)
|
||||
.observe_closure_duration(|| self.open_options.open(&self.path))?;
|
||||
|
||||
// Perform the requested operation on it
|
||||
let result = STORAGE_IO_TIME_METRIC
|
||||
.get(op)
|
||||
.observe_closure_duration(|| func(&file));
|
||||
let file = observe_duration!(StorageIoOperation::Open, self.open_options.open(&self.path))?;
|
||||
|
||||
// Store the File in the slot and update the handle in the VirtualFile
|
||||
// to point to it.
|
||||
@@ -408,7 +421,9 @@ impl VirtualFile {
|
||||
|
||||
*handle_guard = handle;
|
||||
|
||||
Ok(result)
|
||||
return Ok(FileGuard {
|
||||
slot_guard: slot_guard.downgrade(),
|
||||
});
|
||||
}
|
||||
|
||||
pub fn remove(self) {
|
||||
@@ -423,11 +438,9 @@ impl VirtualFile {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(offset) => {
|
||||
self.pos = self
|
||||
.with_file(StorageIoOperation::Seek, |mut file| {
|
||||
file.seek(SeekFrom::End(offset))
|
||||
})
|
||||
.await??
|
||||
self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
|
||||
.as_ref()
|
||||
.seek(SeekFrom::End(offset)))?
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
@@ -515,9 +528,9 @@ impl VirtualFile {
|
||||
}
|
||||
|
||||
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self
|
||||
.with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
|
||||
.await?;
|
||||
let result = with_file!(self, StorageIoOperation::Read, |file| file
|
||||
.as_ref()
|
||||
.read_at(buf, offset));
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
|
||||
@@ -527,9 +540,9 @@ impl VirtualFile {
|
||||
}
|
||||
|
||||
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self
|
||||
.with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
|
||||
.await?;
|
||||
let result = with_file!(self, StorageIoOperation::Write, |file| file
|
||||
.as_ref()
|
||||
.write_at(buf, offset));
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
|
||||
@@ -539,6 +552,18 @@ impl VirtualFile {
|
||||
}
|
||||
}
|
||||
|
||||
struct FileGuard<'a> {
|
||||
slot_guard: RwLockReadGuard<'a, SlotInner>,
|
||||
}
|
||||
|
||||
impl<'a> AsRef<File> for FileGuard<'a> {
|
||||
fn as_ref(&self) -> &File {
|
||||
// This unwrap is safe because we only create `FileGuard`s
|
||||
// if we know that the file is Some.
|
||||
self.slot_guard.file.as_ref().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl VirtualFile {
|
||||
pub(crate) async fn read_blk(
|
||||
@@ -571,20 +596,39 @@ impl VirtualFile {
|
||||
impl Drop for VirtualFile {
|
||||
/// If a VirtualFile is dropped, close the underlying file if it was open.
|
||||
fn drop(&mut self) {
|
||||
let handle = self.handle.get_mut().unwrap();
|
||||
let handle = self.handle.get_mut();
|
||||
|
||||
// We could check with a read-lock first, to avoid waiting on an
|
||||
// unrelated I/O.
|
||||
let slot = &get_open_files().slots[handle.index];
|
||||
let mut slot_guard = slot.inner.write().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
// there is also operation "close-by-replace" for closes done on eviction for
|
||||
// comparison.
|
||||
STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::Close)
|
||||
.observe_closure_duration(|| drop(slot_guard.file.take()));
|
||||
fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
|
||||
if slot_guard.tag == tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
// there is also the `CloseByReplace` operation for closes done on eviction for
|
||||
// comparison.
|
||||
STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::Close)
|
||||
.observe_closure_duration(|| drop(slot_guard.file.take()));
|
||||
}
|
||||
}
|
||||
|
||||
// We don't have async drop so we cannot directly await the lock here.
|
||||
// Instead, first do a best-effort attempt at closing the underlying
|
||||
// file descriptor by using `try_write`, and if that fails, spawn
|
||||
// a tokio task to do it asynchronously: we just want it to be
|
||||
// cleaned up eventually.
|
||||
// Most of the time, the `try_lock` should succeed though,
|
||||
// as we have `&mut self` access. In other words, if the slot
|
||||
// is still occupied by our file, there should be no access from
|
||||
// other I/O operations; the only other possible place to lock
|
||||
// the slot is the lock algorithm looking for free slots.
|
||||
let slot = &get_open_files().slots[handle.index];
|
||||
if let Ok(slot_guard) = slot.inner.try_write() {
|
||||
clean_slot(slot, slot_guard, handle.tag);
|
||||
} else {
|
||||
let tag = handle.tag;
|
||||
tokio::spawn(async move {
|
||||
let slot_guard = slot.inner.write().await;
|
||||
clean_slot(slot, slot_guard, tag);
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
import queue
|
||||
import threading
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
|
||||
from fixtures.types import TenantId
|
||||
|
||||
"""
|
||||
553 sudo mkfs.ext4 /dev/nvme1n1
|
||||
555 mkdir test_output
|
||||
556 sudo mount /dev/nvme1n1 test_output
|
||||
557 htop
|
||||
559 ./scripts/pysync
|
||||
560 NEON_BIN=/home/admin/neon/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
|
||||
561 sudo chown -R admin:admin test_output
|
||||
|
||||
cargo build_testing --release
|
||||
|
||||
562 NEON_BIN=$PWD/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
|
||||
|
||||
cd test_output/test_pageserver_startup_many_tenants/repo
|
||||
|
||||
sudo env NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000 ../../../target/release/neon_local start
|
||||
# watch initial load complete, then background jobs start. That's the interesting part.
|
||||
sudo env NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000 ../../../target/release/neon_local stop
|
||||
# usually pageserver won't be responsive, kill with
|
||||
sudo pkill -9 pageserver
|
||||
"""
|
||||
def test_pageserver_startup_many_tenants(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# below doesn't work because summaries contain tenant and timeline ids and we check for them
|
||||
|
||||
tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
|
||||
pshttp = env.pageserver.http_client()
|
||||
ep = env.endpoints.create_start("main")
|
||||
ep.safe_psql("create table foo(b text)")
|
||||
for i in range(0, 8):
|
||||
ep.safe_psql("insert into foo(b) values ('some text')")
|
||||
# pg_bin.run_capture(["pgbench", "-i", "-s1", ep.connstr()])
|
||||
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
|
||||
pshttp.timeline_checkpoint(tenant_id, timeline_id)
|
||||
ep.stop_and_destroy()
|
||||
|
||||
env.pageserver.stop()
|
||||
for sk in env.safekeepers:
|
||||
sk.stop()
|
||||
|
||||
tenant_dir = env.repo_dir / "pageserver_1" / "tenants" / str(env.initial_tenant)
|
||||
|
||||
for i in range(0, 20_000):
|
||||
import shutil
|
||||
|
||||
shutil.copytree(tenant_dir, tenant_dir.parent / str(TenantId.generate()))
|
||||
Reference in New Issue
Block a user